From 21251c29c2fd74fcbb4af56ebdeeeb85be4f43a0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 21 Sep 2012 17:22:30 -0400 Subject: [PATCH 01/35] Off-by-one error in sliding window manifests itself at end of a coverage region dropping the last covered base. --- .../sting/gatk/walkers/compression/reducereads/ReduceReads.java | 2 +- .../gatk/walkers/compression/reducereads/SlidingWindow.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 0def4e582..1beee3cbe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -263,7 +263,7 @@ public class ReduceReads extends ReadWalker, ReduceRea if (debugLevel == 1) System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd()); - // we write the actual alignment starts to their respectiv alignment shift tags in the temporary + // we write the actual alignment starts to their respective alignment shift tags in the temporary // attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart()); read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 5820dc5f5..b486905e6 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -612,7 +612,7 @@ public class SlidingWindow { finalizedReads = closeVariantRegions(regions, true); if (!windowHeader.isEmpty()) { - finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size() - 1, false)); + finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false)); finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up } From dcd31e654d8f91eeccdcef39f0d8072507b7aa31 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 21 Sep 2012 17:26:00 -0400 Subject: [PATCH 02/35] Turn off RR tests while I debug --- .../reducereads/ReduceReadsIntegrationTest.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 3f1cc7a3c..db8ea4eb8 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -21,33 +21,33 @@ public class ReduceReadsIntegrationTest extends WalkerTest { executeTest(testName, spec); } - @Test(enabled = true) + @Test(enabled = false) public void testDefaultCompression() { RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f"); } - @Test(enabled = true) + @Test(enabled = false) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 
20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76"); } - @Test(enabled = true) + @Test(enabled = false) public void testHighCompression() { RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047"); } - @Test(enabled = true) + @Test(enabled = false) public void testLowCompression() { RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63"); } - @Test(enabled = true) + @Test(enabled = false) public void testIndelCompression() { RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973"); } - @Test(enabled = true) + @Test(enabled = false) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb"))); @@ -61,7 +61,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { * * This bam is simplified to replicate the exact bug with the three provided intervals. */ - @Test(enabled = true) + @Test(enabled = false) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6"))); @@ -71,7 +71,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { * Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get * filtered out. 
*/ - @Test(enabled = true) + @Test(enabled = false) public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2"))); From ab8fa8f359fe61eab4c34ec596cbf9d021e4a7dd Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 21 Sep 2012 20:48:12 -0400 Subject: [PATCH 03/35] Bug fix: AlleleCount stratification in VariantEval didn't support higher ploidy and was producing bad tables --- .../sting/gatk/walkers/varianteval/VariantEval.java | 4 ++++ .../gatk/walkers/varianteval/stratifications/AlleleCount.java | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index 01237ade3..a73e125ad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -171,6 +171,9 @@ public class VariantEval extends RodWalker implements TreeRedu @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation. 
Default is 50.", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; + @Argument(shortName="ploidy", fullName="samplePloidy", doc="Per-sample ploidy (number of chromosomes per sample)", required=false) + protected int ploidy = VariantContextUtils.DEFAULT_PLOIDY; + @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) private File ancestralAlignmentsFile = null; @@ -574,6 +577,7 @@ public class VariantEval extends RodWalker implements TreeRedu public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; } + public int getSamplePloidy() { return ploidy; } public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; } public static String getAllSampleName() { return ALL_SAMPLE_NAME; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index fbd6371f3..e6efd4482 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -27,9 +27,9 @@ public class AlleleCount extends VariantStratifier { if ( getVariantEvalWalker().getEvals().size() != 1 && !getVariantEvalWalker().mergeEvals ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf"); - // There are 2 x n sample chromosomes for diploids + // There are ploidy x n sample chromosomes // TODO -- generalize to handle multiple ploidy - nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * 2; + nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy(); if ( nchrom < 2 ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires 
an eval vcf with at least one sample"); From 133085469f3299c0d2030e827cebbec11deb9d4d Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 12 Sep 2012 13:00:29 -0400 Subject: [PATCH 04/35] Experimental, downsampler-friendly read shard balancer -Only used when experimental downsampling is enabled -Persists read iterators across shards, creating a new set only when we've exhausted the current BAM file region(s). This prevents the engine from revisiting regions discarded by the downsamplers / filters, as could happen in the old implementation. -SAMDataSource no longer tracks low-level file positions in experimental mode. Can strip out all related code when the engine fork is collapsed. -Defensive implementation that assumes BAM file regions coming out of the BAM Schedule can overlap; should be able to improve performance if we can prove they cannot possibly overlap. -Tests a bit on the extreme side (~8 minute runtime) for now; will scale these back once confidence in the code is gained --- .../src/net/sf/samtools/GATKBAMFileSpan.java | 31 +++ .../sting/gatk/GenomeAnalysisEngine.java | 10 +- .../sting/gatk/ReadProperties.java | 11 + .../gatk/datasources/reads/BAMScheduler.java | 13 +- .../reads/ExperimentalReadShardBalancer.java | 179 +++++++++++++++ .../gatk/datasources/reads/FilePointer.java | 63 +++++- .../gatk/datasources/reads/ReadShard.java | 71 +++++- .../datasources/reads/ReadShardBalancer.java | 2 + .../gatk/datasources/reads/SAMDataSource.java | 92 ++++---- .../sting/gatk/datasources/reads/Shard.java | 7 + .../reads/DownsamplerBenchmark.java | 2 + ...ExperimentalReadShardBalancerUnitTest.java | 203 ++++++++++++++++++ .../reads/SAMDataSourceUnitTest.java | 166 +------------- ...usIteratorByStateExperimentalUnitTest.java | 1 + .../LocusIteratorByStateUnitTest.java | 1 + 15 files changed, 630 insertions(+), 222 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java create mode 100644 
public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java index ffc40067a..665b098e5 100644 --- a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java +++ b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java @@ -125,6 +125,37 @@ public class GATKBAMFileSpan extends BAMFileSpan { return size; } + /** + * Get a GATKChunk representing the "extent" of this file span, from the start of the first + * chunk to the end of the last chunk.The chunks list must be sorted in order to use this method. + * + * @return a GATKChunk representing the extent of this file span, or a GATKChunk representing + * a span of size 0 if there are no chunks + */ + public GATKChunk getExtent() { + validateSorted(); // TODO: defensive measure: may be unnecessary + + List chunks = getChunks(); + if ( chunks.isEmpty() ) { + return new GATKChunk(0L, 0L); + } + + return new GATKChunk(chunks.get(0).getChunkStart(), chunks.get(chunks.size() - 1).getChunkEnd()); + } + + /** + * Validates the list of chunks to ensure that they appear in sorted order. + */ + private void validateSorted() { + List chunks = getChunks(); + for ( int i = 1; i < chunks.size(); i++ ) { + if ( chunks.get(i).getChunkStart() < chunks.get(i-1).getChunkEnd() ) { + throw new ReviewedStingException(String.format("Chunk list is unsorted; chunk %s is before chunk %s", chunks.get(i-1), chunks.get(i))); + + } + } + } + /** * Computes the union of two FileSpans. * @param other FileSpan to union with this one. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 8071fe5dc..077d208d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -548,6 +548,7 @@ public class GenomeAnalysisEngine { */ protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); + DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null; ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. @@ -582,10 +583,15 @@ public class GenomeAnalysisEngine { throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); } + // Use the experimental ReadShardBalancer if experimental downsampling is enabled + ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ? 
+ new ExperimentalReadShardBalancer() : + new ReadShardBalancer(); + if(intervals == null) - return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + return readsDataSource.createShardIteratorOverAllReads(readShardBalancer); else - return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer()); + return readsDataSource.createShardIteratorOverIntervals(intervals, readShardBalancer); } else throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index e1ada93cc..c37def397 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -30,6 +30,7 @@ import java.util.List; public class ReadProperties { private final Collection readers; private final SAMFileHeader header; + private final SAMFileHeader.SortOrder sortOrder; private final SAMFileReader.ValidationStringency validationStringency; private final DownsamplingMethod downsamplingMethod; private final ValidationExclusion exclusionList; @@ -64,6 +65,14 @@ public class ReadProperties { return header; } + /** + * Gets the sort order of the reads + * @return the sort order of the reads + */ + public SAMFileHeader.SortOrder getSortOrder() { + return sortOrder; + } + /** * How strict should validation be? * @return Stringency of validation. 
@@ -130,6 +139,7 @@ public class ReadProperties { */ public ReadProperties( Collection samFiles, SAMFileHeader header, + SAMFileHeader.SortOrder sortOrder, boolean useOriginalBaseQualities, SAMFileReader.ValidationStringency strictness, DownsamplingMethod downsamplingMethod, @@ -140,6 +150,7 @@ public class ReadProperties { byte defaultBaseQualities) { this.readers = samFiles; this.header = header; + this.sortOrder = sortOrder; this.validationStringency = strictness; this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod; this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index ebfef5dc1..d0e310d3f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -124,7 +124,18 @@ public class BAMScheduler implements Iterator { */ private FilePointer generatePointerOverEntireFileset() { FilePointer filePointer = new FilePointer(); - Map currentPosition = dataSource.getCurrentPosition(); + Map currentPosition; + + // Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling + // TODO: clean this up once the experimental downsampling engine fork collapses + if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) { + currentPosition = dataSource.getInitialReaderPositions(); + } + else { + currentPosition = dataSource.getCurrentPosition(); + + } + for(SAMReaderID reader: dataSource.getReaderIDs()) filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); return filePointer; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java new file mode 100644 index 000000000..73719cbb0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.SAMFileSpan; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; + +import java.util.*; + +/** + * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards + * + * @author David Roazen + */ +public class ExperimentalReadShardBalancer extends ShardBalancer { + + private static Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class); + + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + /** + * The cached shard to be returned next. Prefetched in the peekable iterator style. + */ + private Shard nextShard = null; + + /** + * The file pointer currently being processed. + */ + private FilePointer currentFilePointer = null; + + /** + * Iterator over the reads from the current file pointer. The same iterator will be + * used to fill all shards associated with a given file pointer + */ + private PeekableIterator currentFilePointerReadsIterator = null; + + { + if ( filePointers.hasNext() ) + currentFilePointer = filePointers.next(); + advance(); + } + + public boolean hasNext() { + return nextShard != null; + } + + public Shard next() { + if ( ! 
hasNext() ) + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } + + private void advance() { + nextShard = null; + + // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away + while ( nextShard == null && currentFilePointer != null ) { + + // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): + if ( currentFilePointerReadsIterator != null && ! currentFilePointerReadsIterator.hasNext() ) { + do { + advanceFilePointer(); + } while ( currentFilePointer != null && isEmpty(currentFilePointer.fileSpans) ); // skip empty file pointers + + // We'll need to create a fresh iterator for this file pointer when we create the first + // shard for it below. + currentFilePointerReadsIterator = null; + } + + // At this point if currentFilePointer is non-null we know it is also non-empty. Our + // currentFilePointerReadsIterator may be null or non-null depending on whether or not + // this is our first shard for this file pointer. + if ( currentFilePointer != null ) { + Shard shard = new ReadShard(parser,readsDataSource,currentFilePointer.fileSpans,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + + // Create a new reads iterator only when we've just advanced to a new file pointer. It's + // essential that the iterators persist across all shards that share the same file pointer + // to allow the downsampling to work properly. + if ( currentFilePointerReadsIterator == null ) { + currentFilePointerReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); + } + + if ( currentFilePointerReadsIterator.hasNext() ) { + shard.fill(currentFilePointerReadsIterator); + nextShard = shard; + } + } + } + } + + private void advanceFilePointer() { + FilePointer previousFilePointer = currentFilePointer; + currentFilePointer = filePointers.hasNext() ? 
filePointers.next() : null; + + // TODO: This is a purely defensive measure to guard against the possibility of overlap + // TODO: between FilePointers. When overlap is detected, remove the overlapping regions from + // TODO: the newly-current FilePointer. + // TODO: If we later discover that overlap is theoretically impossible, this step becomes + // TODO: unnecessary and should be removed. + if ( currentFilePointer != null && previousFilePointer != null && + previousFilePointer.hasFileSpansOverlappingWith(currentFilePointer) ) { + + logger.debug(String.format("%s: found consecutive overlapping FilePointers [%s] and [%s]", getClass().getSimpleName(), previousFilePointer, currentFilePointer)); + + Map previousFileSpans = previousFilePointer.getFileSpans(); + Map trimmedFileSpans = new HashMap(currentFilePointer.getFileSpans().size()); + + for ( Map.Entry fileSpanEntry : currentFilePointer.getFileSpans().entrySet() ) { + // find the corresponding file span from the previous FilePointer + SAMFileSpan previousFileSpan = previousFileSpans.get(fileSpanEntry.getKey()); + + if ( previousFileSpan == null ) { + // no match, so no trimming required + trimmedFileSpans.put(fileSpanEntry.getKey(), fileSpanEntry.getValue()); + } + else { + // match, so remove any overlapping regions (regions before the start of the + // region immediately following the previous file span) + SAMFileSpan trimmedSpan = fileSpanEntry.getValue().removeContentsBefore(previousFileSpan.getContentsFollowing()); + trimmedFileSpans.put(fileSpanEntry.getKey(), trimmedSpan); + } + } + + // Replace the current file pointer with its trimmed equivalent + currentFilePointer = new FilePointer(trimmedFileSpans, currentFilePointer.locations); + } + } + + /** + * Detects whether the list of file spans contain any read data. + * @param selectedSpans Mapping of readers to file spans. + * @return True if file spans are completely empty; false otherwise. 
+ */ + private boolean isEmpty(Map selectedSpans) { + for(SAMFileSpan fileSpan: selectedSpans.values()) { + if(!fileSpan.isEmpty()) + return false; + } + return true; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + }; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index df7827250..b0fbc05bf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -50,16 +50,37 @@ public class FilePointer { public FilePointer(final GenomeLoc... locations) { this.locations.addAll(Arrays.asList(locations)); + this.isRegionUnmapped = checkUnmappedStatus(); + } + + public FilePointer( Map fileSpans, List locations ) { + this.fileSpans.putAll(fileSpans); + this.locations.addAll(locations); + this.isRegionUnmapped = checkUnmappedStatus(); + } + + private boolean checkUnmappedStatus() { boolean foundMapped = false, foundUnmapped = false; - for(GenomeLoc location: locations) { - if(GenomeLoc.isUnmapped(location)) + + for( GenomeLoc location: locations ) { + if ( GenomeLoc.isUnmapped(location) ) foundUnmapped = true; else foundMapped = true; } - if(foundMapped && foundUnmapped) + if ( foundMapped && foundUnmapped ) throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped."); - this.isRegionUnmapped = foundUnmapped; + + return foundUnmapped; + } + + /** + * Returns an immutable view of this FilePointer's file spans + * + * @return an immutable view of this FilePointer's file spans + */ + public Map getFileSpans() { + return Collections.unmodifiableMap(fileSpans); } /** @@ -98,7 +119,13 @@ public class FilePointer { } public void addLocation(final GenomeLoc location) { - locations.add(location); + 
this.locations.add(location); + checkUnmappedStatus(); + } + + public void addLocations( final List locations ) { + this.locations.addAll(locations); + checkUnmappedStatus(); } public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) { @@ -216,6 +243,32 @@ public class FilePointer { combined.addFileSpans(initialElement.getKey(),fileSpan); } + /** + * Returns true if any of the file spans in this FilePointer overlap their counterparts in + * the other FilePointer. "Overlap" is defined as having an overlapping extent (the region + * from the start of the first chunk to the end of the last chunk). + * + * @param other the FilePointer against which to check overlap with this FilePointer + * @return true if any file spans overlap their counterparts in other, otherwise false + */ + public boolean hasFileSpansOverlappingWith( FilePointer other ) { + for ( Map.Entry thisFilePointerEntry : fileSpans.entrySet() ) { + GATKBAMFileSpan thisFileSpan = new GATKBAMFileSpan(thisFilePointerEntry.getValue()); + + SAMFileSpan otherEntry = other.fileSpans.get(thisFilePointerEntry.getKey()); + if ( otherEntry == null ) { + continue; // no counterpart for this file span in other + } + GATKBAMFileSpan otherFileSpan = new GATKBAMFileSpan(otherEntry); + + if ( thisFileSpan.getExtent().overlaps(otherFileSpan.getExtent()) ) { + return true; + } + } + + return false; + } + @Override public String toString() { StringBuilder builder = new StringBuilder(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index def27b20d..47b0c9833 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -1,17 +1,15 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import net.sf.samtools.SAMFileSpan; -import net.sf.samtools.SAMRecord; +import 
net.sf.picard.util.PeekableIterator; +import net.sf.samtools.*; +import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.Map; +import java.util.*; /** * @@ -103,6 +101,67 @@ public class ReadShard extends Shard { reads.add(read); } + /** + * Fills this shard's buffer with reads from the iterator passed in + * + * @param readIter Iterator from which to draw the reads to fill the shard + */ + @Override + public void fill( PeekableIterator readIter ) { + if( ! buffersReads() ) + throw new ReviewedStingException("Attempting to fill a non-buffering shard."); + + SAMFileHeader.SortOrder sortOrder = getReadProperties().getSortOrder(); + SAMRecord read = null; + + while( ! isBufferFull() && readIter.hasNext() ) { + final SAMRecord nextRead = readIter.peek(); + if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) { + // only add reads to the shard if they are on the same contig + read = readIter.next(); + addRead(read); + } else { + break; + } + } + + // If the reads are sorted in coordinate order, ensure that all reads + // having the same alignment start become part of the same shard, to allow + // downsampling to work better across shard boundaries. 
Note that because our + // read stream has already been fed through the positional downsampler, which + // ensures that at each alignment start position there are no more than dcov + // reads, we're in no danger of accidentally creating a disproportionately huge + // shard + if ( sortOrder == SAMFileHeader.SortOrder.coordinate ) { + while ( readIter.hasNext() ) { + SAMRecord additionalRead = readIter.peek(); + + // Stop filling the shard as soon as we encounter a read having a different + // alignment start or contig from the last read added in the earlier loop + // above, or an unmapped read + if ( read == null || + additionalRead.getReadUnmappedFlag() || + ! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) || + additionalRead.getAlignmentStart() != read.getAlignmentStart() ) { + break; + } + + addRead(readIter.next()); + } + } + + // If the reads are sorted in queryname order, ensure that all reads + // having the same queryname become part of the same shard. + if( sortOrder == SAMFileHeader.SortOrder.queryname ) { + while( readIter.hasNext() ) { + SAMRecord nextRead = readIter.peek(); + if( read == null || ! read.getReadName().equals(nextRead.getReadName()) ) + break; + addRead(readIter.next()); + } + } + } + /** * Creates an iterator over reads stored in this shard's read cache. * @return diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java index 311c7874f..18fafb95d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java @@ -34,6 +34,8 @@ import java.util.NoSuchElementException; /** * Divide up large file pointers containing reads into more manageable subcomponents. 
+ * + * TODO: delete this class once the experimental downsampling engine fork collapses */ public class ReadShardBalancer extends ShardBalancer { /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 437813f19..bf0d45f83 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -99,6 +99,8 @@ public class SAMDataSource { /** * How far along is each reader? + * + * TODO: delete this once the experimental downsampling engine fork collapses */ private final Map readerPositions = new HashMap(); @@ -154,8 +156,6 @@ public class SAMDataSource { */ private final ThreadAllocation threadAllocation; - private final boolean expandShardsForDownsampling; - /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. @@ -297,6 +297,7 @@ public class SAMDataSource { readProperties = new ReadProperties( samFiles, mergedHeader, + sortOrder, useOriginalBaseQualities, strictness, downsamplingMethod, @@ -306,11 +307,6 @@ public class SAMDataSource { includeReadsWithDeletionAtLoci, defaultBaseQualities); - expandShardsForDownsampling = readProperties.getDownsamplingMethod() != null && - readProperties.getDownsamplingMethod().useExperimentalDownsampling && - readProperties.getDownsamplingMethod().type != DownsampleType.NONE && - readProperties.getDownsamplingMethod().toCoverage != null; - // cache the read group id (original) -> read group id (merged) // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { @@ -384,7 +380,10 @@ public class SAMDataSource { /** * Retrieves the current position within the BAM file. * @return A mapping of reader to current position. 
+ * + * TODO: delete this once the experimental downsampling engine fork collapses */ + @Deprecated public Map getCurrentPosition() { return readerPositions; } @@ -467,19 +466,15 @@ public class SAMDataSource { } /** - * Are we expanding shards as necessary to prevent shard boundaries from occurring at improper places? + * Legacy method to fill the given buffering shard with reads. + * + * Shard.fill() is used instead of this method when experimental downsampling is enabled + * + * TODO: delete this method once the experimental downsampling engine fork collapses * - * @return true if we are using expanded shards, otherwise false - */ - public boolean usingExpandedShards() { - return expandShardsForDownsampling; - } - - - /** - * Fill the given buffering shard with reads. * @param shard Shard to fill. */ + @Deprecated public void fillShard(Shard shard) { if(!shard.buffersReads()) throw new ReviewedStingException("Attempting to fill a non-buffering shard."); @@ -503,31 +498,6 @@ public class SAMDataSource { } } - // If the reads are sorted in coordinate order, ensure that all reads - // having the same alignment start become part of the same shard, to allow - // downsampling to work better across shard boundaries. Note that because our - // read stream has already been fed through the positional downsampler, which - // ensures that at each alignment start position there are no more than dcov - // reads, we're in no danger of accidentally creating a disproportionately huge - // shard - if ( expandShardsForDownsampling && sortOrder == SAMFileHeader.SortOrder.coordinate ) { - while ( iterator.hasNext() ) { - SAMRecord additionalRead = iterator.next(); - - // Stop filling the shard as soon as we encounter a read having a different - // alignment start or contig from the last read added in the earlier loop - // above, or an unmapped read - if ( read == null || - additionalRead.getReadUnmappedFlag() || - ! 
additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) || - additionalRead.getAlignmentStart() != read.getAlignmentStart() ) { - break; - } - shard.addRead(additionalRead); - noteFilePositionUpdate(positionUpdates, additionalRead); - } - } - // If the reads are sorted in queryname order, ensure that all reads // having the same queryname become part of the same shard. if(sortOrder == SAMFileHeader.SortOrder.queryname) { @@ -547,6 +517,10 @@ public class SAMDataSource { readerPositions.put(readers.getReaderID(positionUpdate.getKey()),positionUpdate.getValue()); } + /* + * TODO: delete this method once the experimental downsampling engine fork collapses + */ + @Deprecated private void noteFilePositionUpdate(Map positionMapping, SAMRecord read) { GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); positionMapping.put(read.getFileSource().getReader(),endChunk); @@ -557,8 +531,7 @@ public class SAMDataSource { return shard.iterator(); } else { - SAMReaders readers = resourcePool.getAvailableReaders(); - return getIterator(readers,shard,shard instanceof ReadShard); + return getIterator(shard); } } @@ -578,13 +551,44 @@ public class SAMDataSource { /** * Initialize the current reader positions + * + * TODO: delete this once the experimental downsampling engine fork collapses + * * @param readers */ + @Deprecated private void initializeReaderPositions(SAMReaders readers) { for(SAMReaderID id: getReaderIDs()) readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); } + /** + * Get the initial reader positions across all BAM files + * + * @return the start positions of the first chunk of reads for all BAM files + */ + public Map getInitialReaderPositions() { + Map initialPositions = new HashMap(); + SAMReaders readers = resourcePool.getAvailableReaders(); + + for ( SAMReaderID id: getReaderIDs() ) { + initialPositions.put(id, new 
GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); + } + + resourcePool.releaseReaders(readers); + return initialPositions; + } + + /** + * Get an iterator over the data types specified in the shard. + * + * @param shard The shard specifying the data limits. + * @return An iterator over the selected data. + */ + public StingSAMIterator getIterator( Shard shard ) { + return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard); + } + /** * Get an iterator over the data types specified in the shard. * @param readers Readers from which to load data. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java index f8d941784..e22a7a54d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; +import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMFileSpan; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.ReadMetrics; @@ -203,6 +204,12 @@ public abstract class Shard implements HasGenomeLocation { */ public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); } + /** + * Fills the shard with reads. Can only do this with shards that buffer reads + * @param readIter Iterator from which to draw the reads to fill the shard + */ + public void fill( PeekableIterator readIter ) { throw new UnsupportedOperationException("This shard does not buffer reads."); } + /** * Gets the iterator over the elements cached in the shard. 
* @return diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index d2bfabacf..61c1c51b4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.reads; import com.google.caliper.Param; import net.sf.picard.filter.FilteringIterator; +import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; @@ -71,6 +72,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { SAMFileReader reader = new SAMFileReader(inputFile); ReadProperties readProperties = new ReadProperties(Collections.singletonList(new SAMReaderID(inputFile,new Tags())), reader.getFileHeader(), + SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.SILENT, downsampling.create(), diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java new file mode 100644 index 000000000..b68956c0b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to 
whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; + +public class ExperimentalReadShardBalancerUnitTest extends BaseTest { + + /** + * Tests to ensure that ExperimentalReadShardBalancer works as expected and does not place shard boundaries + * at inappropriate places, such as within an alignment start position + */ 
+ private static class ExperimentalReadShardBalancerTest extends TestDataProvider { + private int numContigs; + private int numStacksPerContig; + private int stackSize; + private int numUnmappedReads; + private DownsamplingMethod downsamplingMethod; + private int expectedReadCount; + + private SAMFileHeader header; + private SAMReaderID testBAM; + + public ExperimentalReadShardBalancerTest( int numContigs, + int numStacksPerContig, + int stackSize, + int numUnmappedReads, + int downsamplingTargetCoverage ) { + super(ExperimentalReadShardBalancerTest.class); + + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.stackSize = stackSize; + this.numUnmappedReads = numUnmappedReads; + + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); + this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads; + + setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", + getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage)); + } + + public void run() { + createTestBAM(); + + SAMDataSource dataSource = new SAMDataSource(Arrays.asList(testBAM), + new ThreadAllocation(), + null, + new GenomeLocParser(header.getSequenceDictionary()), + false, + SAMFileReader.ValidationStringency.SILENT, + null, + downsamplingMethod, + new ValidationExclusion(), + new ArrayList(), + false); + + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ExperimentalReadShardBalancer()); + + SAMRecord readAtEndOfLastShard = null; + int totalReadsSeen = 0; + + for ( Shard shard : shardIterator ) { + int numContigsThisShard = 0; + SAMRecord lastRead = null; + + for ( SAMRecord read : shard.iterator() ) { + totalReadsSeen++; + + if ( lastRead == null ) { + numContigsThisShard = 1; + } + else if ( ! 
read.getReadUnmappedFlag() && ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) { + numContigsThisShard++; + } + + // If the last read from the previous shard is not unmapped, we have to make sure + // that no reads in this shard start at the same position + if ( readAtEndOfLastShard != null && ! readAtEndOfLastShard.getReadUnmappedFlag() ) { + Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) && + readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(), + String.format("Reads from alignment start position %d:%d are split across multiple shards", + read.getReferenceIndex(), read.getAlignmentStart())); + } + + lastRead = read; + } + + // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads) + Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs"); + + readAtEndOfLastShard = lastRead; + } + + Assert.assertEquals(totalReadsSeen, expectedReadCount, "did not encounter the expected number of reads"); + } + + private void createTestBAM() { + header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000); + SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo"); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header, + "foo", + numContigs, + numStacksPerContig, + stackSize, + stackSize, + 1, + 100, + 50, + 150, + numUnmappedReads); + + File testBAMFile; + try { + testBAMFile = File.createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam"); + testBAMFile.deleteOnExit(); + } + catch ( IOException e ) { + throw new ReviewedStingException(String.format("Failed to create temp bam file for test %s. 
%s", this, e.getMessage())); + } + + SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile); + for ( SAMRecord read : artificialReads ) { + bamWriter.addAlignment(read); + } + bamWriter.close(); + + testBAM = new SAMReaderID(testBAMFile, new Tags()); + + new File(testBAM.getSamFilePath().replace(".bam", ".bai")).deleteOnExit(); + new File(testBAM.getSamFilePath() + ".bai").deleteOnExit(); + } + } + + @DataProvider(name = "ExperimentalReadShardBalancerTestDataProvider") + public Object[][] createExperimentalReadShardBalancerTests() { + for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { + for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { + // Use crucial read shard boundary values as the stack sizes + for ( int stackSize : Arrays.asList(ReadShard.MAX_READS / 2, ReadShard.MAX_READS / 2 + 10, ReadShard.MAX_READS, ReadShard.MAX_READS - 1, ReadShard.MAX_READS + 1, ReadShard.MAX_READS * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, ReadShard.MAX_READS / 2, ReadShard.MAX_READS * 2) ) { + // The first value will result in no downsampling at all, the others in some downsampling + for ( int downsamplingTargetCoverage : Arrays.asList(ReadShard.MAX_READS * 10, ReadShard.MAX_READS, ReadShard.MAX_READS / 2) ) { + new ExperimentalReadShardBalancerTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); + } + } + } + } + } + + return ExperimentalReadShardBalancerTest.getTests(ExperimentalReadShardBalancerTest.class); + } + + @Test(dataProvider = "ExperimentalReadShardBalancerTestDataProvider") + public void runExperimentalReadShardBalancerTest( ExperimentalReadShardBalancerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 
9df849940..0ed485cd2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -29,30 +29,21 @@ import net.sf.samtools.*; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import org.testng.Assert; import java.io.File; import java.io.FileNotFoundException; -import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -66,165 +57,12 @@ import static org.testng.Assert.*; */ public class SAMDataSourceUnitTest extends BaseTest { + // TODO: These legacy tests should really be replaced with a more comprehensive suite of tests for SAMDataSource + private List readers; private IndexedFastaSequenceFile seq; private GenomeLocParser 
genomeLocParser; - - /*********************************** - * Tests for the fillShard() method - ***********************************/ - - /** - * Tests to ensure that the fillShard() method does not place shard boundaries at inappropriate places, - * such as within an alignment start position - */ - private static class SAMDataSourceFillShardBoundaryTest extends TestDataProvider { - private int numContigs; - private int numStacksPerContig; - private int stackSize; - private int numUnmappedReads; - private DownsamplingMethod downsamplingMethod; - - private SAMFileHeader header; - - public SAMDataSourceFillShardBoundaryTest( int numContigs, - int numStacksPerContig, - int stackSize, - int numUnmappedReads, - int downsamplingTargetCoverage ) { - super(SAMDataSourceFillShardBoundaryTest.class); - - this.numContigs = numContigs; - this.numStacksPerContig = numStacksPerContig; - this.stackSize = stackSize; - this.numUnmappedReads = numUnmappedReads; - - this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); - - setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", - getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage)); - } - - public void run() { - SAMDataSource dataSource = new SAMDataSource(Arrays.asList(createTestBAM()), - new ThreadAllocation(), - null, - new GenomeLocParser(header.getSequenceDictionary()), - false, - SAMFileReader.ValidationStringency.SILENT, - null, - downsamplingMethod, - new ValidationExclusion(), - new ArrayList(), - false); - - Assert.assertTrue(dataSource.usingExpandedShards()); - - Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); - - SAMRecord readAtEndOfLastShard = null; - - for ( Shard shard : shardIterator ) { - int numContigsThisShard = 0; - SAMRecord lastRead = null; - - for ( SAMRecord read : 
shard.iterator() ) { - if ( lastRead == null ) { - numContigsThisShard = 1; - } - else if ( ! read.getReadUnmappedFlag() && ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) { - numContigsThisShard++; - } - - // If the last read from the previous shard is not unmapped, we have to make sure - // that no reads in this shard start at the same position - if ( readAtEndOfLastShard != null && ! readAtEndOfLastShard.getReadUnmappedFlag() ) { - Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) && - readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(), - String.format("Reads from alignment start position %d:%d are split across multiple shards", - read.getReferenceIndex(), read.getAlignmentStart())); - } - - lastRead = read; - } - - // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads) - Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs"); - - readAtEndOfLastShard = lastRead; - } - } - - private SAMReaderID createTestBAM() { - header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000); - SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo"); - readGroup.setSample("testSample"); - header.addReadGroup(readGroup); - ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header, - "foo", - numContigs, - numStacksPerContig, - stackSize, - stackSize, - 1, - 100, - 50, - 150, - numUnmappedReads); - - File testBAMFile; - try { - testBAMFile = File.createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam"); - testBAMFile.deleteOnExit(); - } - catch ( IOException e ) { - throw new ReviewedStingException(String.format("Failed to create temp bam file for test %s. 
%s", this, e.getMessage())); - } - - SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile); - for ( SAMRecord read : artificialReads ) { - bamWriter.addAlignment(read); - } - bamWriter.close(); - - return new SAMReaderID(testBAMFile, new Tags()); - } - } - - @DataProvider(name = "SAMDataSourceFillShardTestDataProvider") - public Object[][] createSAMDataSourceFillShardBoundaryTests() { - // Take downsampling out of the equation for these tests -- we are only interested in whether the - // shard boundaries occur at the right places in the read stream, and removing downsampling as a - // factor simplifies that task (note that we still need to provide a specific downsampling method with - // experimental downsampling enabled to trigger the shard expansion behavior, for now) - int downsamplingTargetCoverage = ReadShard.MAX_READS * 10; - - for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { - for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { - // Use crucial read shard boundary values as the stack sizes - for ( int stackSize : Arrays.asList(ReadShard.MAX_READS / 2, ReadShard.MAX_READS / 2 + 10, ReadShard.MAX_READS, ReadShard.MAX_READS - 1, ReadShard.MAX_READS + 1, ReadShard.MAX_READS * 2) ) { - for ( int numUnmappedReads : Arrays.asList(0, ReadShard.MAX_READS / 2, ReadShard.MAX_READS * 2) ) { - new SAMDataSourceFillShardBoundaryTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); - } - } - } - } - - return SAMDataSourceFillShardBoundaryTest.getTests(SAMDataSourceFillShardBoundaryTest.class); - } - - // TODO: re-enable these tests once the issues with filepointer ordering + the downsamplers are worked out - @Test(dataProvider = "SAMDataSourceFillShardTestDataProvider", enabled = false) - public void testSAMDataSourceFillShard( SAMDataSourceFillShardBoundaryTest test ) { - logger.warn("Running test: " + test); - - test.run(); - } - - - // TODO: the legacy 
tests below should really be replaced with a more comprehensive suite of tests for SAMDataSource - /** * This function does the setup of our parser, before each method call. *

diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java index 9d592cd26..a49a602c6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java @@ -502,6 +502,7 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), + SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, downsamplingMethod, diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index a5ead5665..83913fa76 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -333,6 +333,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), + SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, null, From 34eed20aa61b0815ad3bed8de17469b58dcc0cce Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 21 Sep 2012 22:22:59 -0400 Subject: [PATCH 05/35] PerSampleDownsamplingReadsIterator: fix for incorrect use of DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL Notify all downsamplers in our pool of the current global genomic position every DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL position changes, not every single positional change after that threshold is first reached. 
--- .../downsampling/PerSampleDownsamplingReadsIterator.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java index 8b2034460..5275c471e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java @@ -158,10 +158,10 @@ public class PerSampleDownsamplingReadsIterator implements StingSAMIterator { numPositionalChanges++; } - // If the number of times we've changed position exceeds a certain threshold, inform all - // downsamplers of the current position in the read stream. This is to prevent downsamplers - // for samples with sparser reads than others from getting stuck too long in a pending state. - if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) { + // Periodically inform all downsamplers of the current position in the read stream. This is + // to prevent downsamplers for samples with sparser reads than others from getting stuck too + // long in a pending state. + if ( numPositionalChanges > 0 && numPositionalChanges % DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL == 0 ) { for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { perSampleDownsampler.signalNoMoreReadsBefore(read); updateEarliestPendingRead(perSampleDownsampler); From e077347cc21ec10584fafeb0f475822f4876dcad Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 21 Sep 2012 22:27:45 -0400 Subject: [PATCH 06/35] Re-allow running the GATK with experimental downsampling It's now possible to run with experimental downsampling enabled using the --enable_experimental_downsampling engine argument. 
This is scheduled to become the GATK-wide default next week after diff engine output for failing tests has been examined. --- .../org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 077d208d5..67e5ad95b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -445,11 +445,6 @@ public class GenomeAnalysisEngine { GATKArgumentCollection argCollection = this.getArguments(); boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling; - // until the file pointer bug with the experimental downsamplers is fixed, disallow running with experimental downsampling - if ( useExperimentalDownsampling ) { - throw new UserException("The experimental downsampling implementation is currently crippled by a file-pointer-related bug. Until this bug is fixed, it's not safe (or possible) for anyone to use the experimental implementation!"); - } - DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling); DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling); From f6a22e5f50930992d503aeb9559e6b93109c137e Mon Sep 17 00:00:00 2001 From: David Roazen Date: Sat, 22 Sep 2012 01:05:40 -0400 Subject: [PATCH 07/35] ExperimentalReadShardBalancerUnitTest was being skipped; fixed TestNG skips tests when an exception occurs in a data provider, which is what was happening here. This was due to an AWFUL AWFUL use of a non-final static for ReadShard.MAX_READS. 
This is fine if you assume only one instance of SAMDataSource, but with multiple tests creating multiple SAMDataSources, and each one overwriting ReadShard.MAX_READS, you have a recipe for problems. As a result of this the test ran fine individually, but not as part of the unit test suite. Quick fix for now to get the tests running -- this "mutable static" interface should really be refactored away though, when I have time. --- .../sting/gatk/datasources/reads/ReadShard.java | 16 +++++++++++++++- .../gatk/datasources/reads/SAMDataSource.java | 2 +- .../ExperimentalReadShardBalancerUnitTest.java | 8 ++++---- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 47b0c9833..662c7526b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -34,10 +34,21 @@ import java.util.*; * @version 0.1 */ public class ReadShard extends Shard { + + /** + * Default read shard buffer size + */ + public static final int DEFAULT_MAX_READS = 10000; + /** * What is the maximum number of reads per BAM file which should go into a read shard. + * + * TODO: this non-final static variable should either be made final or turned into an + * TODO: instance variable somewhere -- as both static and mutable it wreaks havoc + * TODO: with tests that use multiple instances of SAMDataSource (since SAMDataSource + * TODO: changes this value) */ - public static int MAX_READS = 10000; + public static int MAX_READS = DEFAULT_MAX_READS; /** * The reads making up this shard. @@ -51,6 +62,9 @@ public class ReadShard extends Shard { /** * Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface * until we know what effect tuning this parameter has. 
+ * + * TODO: this mutable static interface is awful and breaks tests -- need to refactor + * * @param bufferSize New maximum number */ static void setReadBufferSize(final int bufferSize) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index bf0d45f83..8562ace98 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -252,7 +252,7 @@ public class SAMDataSource { validationStringency = strictness; this.removeProgramRecords = removeProgramRecords; if(readBufferSize != null) - ReadShard.setReadBufferSize(readBufferSize); + ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests else { // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. 
diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java index b68956c0b..0807f36dc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java @@ -90,7 +90,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { new GenomeLocParser(header.getSequenceDictionary()), false, SAMFileReader.ValidationStringency.SILENT, - null, + ReadShard.DEFAULT_MAX_READS, // reset ReadShard.MAX_READS to ReadShard.DEFAULT_MAX_READS for each test downsamplingMethod, new ValidationExclusion(), new ArrayList(), @@ -180,10 +180,10 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { // Use crucial read shard boundary values as the stack sizes - for ( int stackSize : Arrays.asList(ReadShard.MAX_READS / 2, ReadShard.MAX_READS / 2 + 10, ReadShard.MAX_READS, ReadShard.MAX_READS - 1, ReadShard.MAX_READS + 1, ReadShard.MAX_READS * 2) ) { - for ( int numUnmappedReads : Arrays.asList(0, ReadShard.MAX_READS / 2, ReadShard.MAX_READS * 2) ) { + for ( int stackSize : Arrays.asList(ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS / 2 + 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS - 1, ReadShard.DEFAULT_MAX_READS + 1, ReadShard.DEFAULT_MAX_READS * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS * 2) ) { // The first value will result in no downsampling at all, the others in some downsampling - for ( int downsamplingTargetCoverage : Arrays.asList(ReadShard.MAX_READS * 10, ReadShard.MAX_READS, ReadShard.MAX_READS / 
2) ) { + for ( int downsamplingTargetCoverage : Arrays.asList(ReadShard.DEFAULT_MAX_READS * 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS / 2) ) { new ExperimentalReadShardBalancerTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); } } From 60b93acf7d86e1a032ec954832a5859bd923ee9b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 22 Sep 2012 21:32:29 -0400 Subject: [PATCH 08/35] RR bug: we need to test that the mapping and base quals are >= the MIN values and not just >. This was causing us to drop Q20 bases. --- .../gatk/walkers/compression/reducereads/SlidingWindow.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index b486905e6..13d90358b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -674,7 +674,7 @@ public class SlidingWindow { // check if the read is either before or inside the variant region if (read.getSoftStart() <= refStop) { // check if the read is inside the variant region - if (read.getMappingQuality() > MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) { + if (read.getMappingQuality() >= MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) { // check if the read contains the het site if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) { int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); @@ -682,7 +682,7 @@ public class SlidingWindow { byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos]; // check if base passes the filters! 
- if (qual > MIN_BASE_QUAL_TO_COUNT) { + if (qual >= MIN_BASE_QUAL_TO_COUNT) { // check which haplotype this read represents and take the index of it from the list of headers if (haplotypeHeaderMap.containsKey(base)) { haplotype = haplotypeHeaderMap.get(base); From ced652b3dd4be0ea8d1aa4450eaa2c334d828745 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 22 Sep 2012 21:50:10 -0400 Subject: [PATCH 09/35] RR bug: we need to call removeFromHeader() for reads that were used in creating a polyploid consensus or else they are reused later in creating synthetic reads. In the worst case, this bug caused the tool to create 2 copies of the reduced read. --- .../gatk/walkers/compression/reducereads/SlidingWindow.java | 1 + 1 file changed, 1 insertion(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 13d90358b..19b4826bf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -719,6 +719,7 @@ public class SlidingWindow { } for (GATKSAMRecord read : toRemove) { + removeFromHeader(windowHeader, read); readsInWindow.remove(read); } return hetReads; From 25e3ea879ab09a0fd896ab608c4d73bbc03ca7b2 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 22 Sep 2012 22:16:35 -0400 Subject: [PATCH 10/35] Oops, missed this test before when updating md5s --- .../gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 99b62fa8d..1f418f736 100755 --- 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -349,7 +349,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("1a4d856bfe53d9acee0ea303c4b83bb1")); + Arrays.asList("c7792e27477ecf99893a76ecbac5c2f9")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } From 344083051bcee7cb5481a673fc18d58337711a4a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 22 Sep 2012 23:07:28 -0400 Subject: [PATCH 11/35] Reverting the fix to the generalized ploidy exact model since it cannot handle it computationally. Will file this in the JIRA. 
--- .../genotyper/GeneralPloidyExactAFCalculationModel.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java index 87572b804..5662d82d6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java @@ -281,7 +281,9 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula if (!Double.isInfinite(log10LofK)) newPool.add(set); - if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) { + // TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) + //if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) { + if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { if ( VERBOSE ) System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLikelihoodSeen.maxLog10L); return log10LofK; From 74bb4e2739e2af89254c62224860a14de9adf361 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 22 Sep 2012 23:24:55 -0400 Subject: [PATCH 12/35] Fixing the VariantContextUtilsUnitTest --- .../VariantContextUtilsUnitTest.java | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java index 95e8458c8..114104d42 100644 --- 
a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java @@ -598,8 +598,8 @@ public class VariantContextUtilsUnitTest extends BaseTest { private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) { super(RepeatDetectorTest.class); - this.ref = "N" + ref; // add a dummy base for the event here this.isTrueRepeat = isTrueRepeat; + this.ref = ref; List alleles = new LinkedList(); final Allele refAllele = Allele.create(refAlleleString, true); @@ -609,7 +609,7 @@ public class VariantContextUtilsUnitTest extends BaseTest { alleles.add(alt); } - VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, 1 + refAllele.length(), alleles); + VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); this.vc = builder.make(); } @@ -620,31 +620,31 @@ public class VariantContextUtilsUnitTest extends BaseTest { @DataProvider(name = "RepeatDetectorTest") public Object[][] makeRepeatDetectorTest() { - new RepeatDetectorTest(true, "AAC", "-", "A"); - new RepeatDetectorTest(true, "AAC", "A", "-"); - new RepeatDetectorTest(false, "AAC", "AA", "-"); - new RepeatDetectorTest(false, "AAC", "-", "C"); + new RepeatDetectorTest(true, "NAAC", "N", "NA"); + new RepeatDetectorTest(true, "NAAC", "NA", "N"); + new RepeatDetectorTest(false, "NAAC", "NAA", "N"); + new RepeatDetectorTest(false, "NAAC", "N", "NC"); new RepeatDetectorTest(false, "AAC", "A", "C"); // running out of ref bases => false - new RepeatDetectorTest(false, "AAC", "-", "CAGTA"); + new RepeatDetectorTest(false, "NAAC", "N", "NCAGTA"); // complex repeats - new RepeatDetectorTest(true, "ATATATC", "-", "AT"); - new RepeatDetectorTest(true, "ATATATC", "-", "ATA"); - new RepeatDetectorTest(true, "ATATATC", "-", "ATAT"); - new RepeatDetectorTest(true, "ATATATC", "AT", "-"); 
- new RepeatDetectorTest(false, "ATATATC", "ATA", "-"); - new RepeatDetectorTest(false, "ATATATC", "ATAT", "-"); + new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); // multi-allelic - new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATAT"); - new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATA"); - new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATAT"); - new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATA"); // two As - new RepeatDetectorTest(false, "ATATATC", "AT", "-", "ATC"); // false - new RepeatDetectorTest(false, "ATATATC", "AT", "-", "CC"); // false - new RepeatDetectorTest(false, "ATATATC", "AT", "ATAT", "CC"); // false + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NCC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false return RepeatDetectorTest.getTests(RepeatDetectorTest.class); } From 0187f04a906f1b4b4b93446d2b68ccf4c8befff7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 23 Sep 2012 00:39:19 -0400 Subject: [PATCH 13/35] Proper fix for a previous RR bug fix: only remove reads from the header if they were actually used in the creation of the polyploid consensus. 
--- .../gatk/walkers/compression/reducereads/SlidingWindow.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 19b4826bf..997eca1ed 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -698,8 +698,11 @@ public class SlidingWindow { LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype); addToHeader(header, read); } + + removeFromHeader(windowHeader, read); } } + // we remove all reads before and inside the variant region from the window toRemove.add(read); } @@ -719,7 +722,6 @@ public class SlidingWindow { } for (GATKSAMRecord read : toRemove) { - removeFromHeader(windowHeader, read); readsInWindow.remove(read); } return hetReads; From 1509153b4bb04f61e9f37b8a13309138fb228c68 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 23 Sep 2012 00:47:40 -0400 Subject: [PATCH 14/35] Adding my little walker to assess reduced bam coverage against the original bam because it's turning out to be very useful. 
--- .../walkers/qc/AssessReducedCoverage.java | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java new file mode 100755 index 000000000..fd407d105 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.*; + +/** + * Emits intervals present in either the original or reduced bam but not the other. + * + *
<h2>Input</h2>
 + * <p>
 + * The original and reduced BAM files.
 + * </p>
 + *
 + * <h2>Output</h2>
 + * <p>
 + * A list of intervals present in one bam but not the other.
 + * </p>
 + *
 + * <h2>Examples</h2>
 + * <pre>
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -I:original original.bam \
+ *   -I:reduced reduced.bam \
+ *   -R ref.fasta \
+ *   -T AssessReducedCoverage \
+ *   -o output.intervals
+ * </pre>
+ * + * @author ebanks + */ +@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) +@Hidden +public class AssessReducedCoverage extends LocusWalker implements TreeReducible { + + private static final String original = "original"; + private static final String reduced = "reduced"; + + @Output + protected PrintStream out; + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) + public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; + + public void initialize() {} + + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + if ( tracker == null ) + return null; + + Set tags = getAllTags(context.getBasePileup()); + return (tags.contains(original) && !tags.contains(reduced)) || + (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? 
ref.getLocus() : null; + } + + private Set getAllTags(final ReadBackedPileup pileup) { + + final Set tags = new HashSet(10); + + for ( final PileupElement p : pileup ) { + if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 ) + tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); + } + + return tags; + } + + public void onTraversalDone(GenomeLoc sum) { + if ( sum != null ) + out.println(sum); + } + + public GenomeLoc reduceInit() { + return null; + } + + public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { + if ( lhs == null ) + return rhs; + + if ( rhs == null ) + return lhs; + + // if contiguous, just merge them + if ( lhs.contiguousP(rhs) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); + + // otherwise, print the lhs and start over with the rhs + out.println(lhs); + return rhs; + } + + public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { + if ( value == null ) + return sum; + + if ( sum == null ) + return value; + + // if contiguous, just merge them + if ( sum.contiguousP(value) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); + + // otherwise, print the sum and start over with the value + out.println(sum); + return value; + } +} \ No newline at end of file From ef680e1e13864bead13767c82b20bdd6e304237f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 24 Sep 2012 11:14:18 -0400 Subject: [PATCH 15/35] RR fix: push the header removal all the way into the inner loops so that we literally remove a read from the general header only if it was added to the polyploid header. Add comments. 
--- .../gatk/walkers/compression/reducereads/SlidingWindow.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 997eca1ed..d55560a70 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -696,10 +696,11 @@ public class SlidingWindow { currentHaplotype++; } LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype); + // add to the polyploid header addToHeader(header, read); + // remove from the standard header so that we don't double count it + removeFromHeader(windowHeader, read); } - - removeFromHeader(windowHeader, read); } } From 6a73265a06e75e1c63447f7318834c4b9fb36aad Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 24 Sep 2012 13:29:37 -0400 Subject: [PATCH 16/35] RR bug: we were adding synthetic reads from the header only before the variant region, which meant that reads that overlap the variant region but that weren't used for the consensus (because e.g. of low base quality for the spanning base) were never being used at all. Instead, add synthetic reads from before and spanning the variant region. 
--- .../gatk/walkers/compression/reducereads/SlidingWindow.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index d55560a70..6d6cbce04 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -546,7 +546,7 @@ public class SlidingWindow { List allReads = compressVariantRegion(start, stop); List result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads; - result.addAll(addToSyntheticReads(windowHeader, 0, start, false)); + result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); return result; // finalized reads will be downsampled if necessary From 9464dfdbf2a7d26c4e04ab7993c698bc440a3c7b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 24 Sep 2012 14:06:07 -0400 Subject: [PATCH 17/35] Don't penalize the reduced reads for spanning deletions (when surrounding base quals are Q2s) --- .../sting/gatk/walkers/qc/AssessReducedCoverage.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java index fd407d105..d38c11594 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java @@ -99,7 +99,7 @@ public class AssessReducedCoverage extends LocusWalker imp final Set tags = new HashSet(10); for ( final PileupElement p : pileup ) { - if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 ) + if ( 
(int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); } From 10a6b57be6f5e56f92a2bf4a1e7775540a2e376c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 22 Sep 2012 13:21:55 -0400 Subject: [PATCH 18/35] Fix thread name: should be master executor not input --- .../broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index b014695da..d83a23c0f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -92,7 +92,7 @@ public class NanoScheduler { runningMapJobSlots = new Semaphore(this.bufferSize); this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); - this.masterExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); + this.masterExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-master-thread-%d")); } // start timing the time spent outside of the nanoScheduler From 09bbd2c4c3846715fceada347584ca75d058b91a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 22 Sep 2012 13:22:27 -0400 Subject: [PATCH 19/35] Include exception in VCFWriter when one is found when rethrowing as ReviewedStingException --- .../sting/utils/variantcontext/writer/VCFWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index f5306b6da..f2d34fe85 100755 --- 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -250,7 +250,7 @@ class VCFWriter extends IndexingVariantContextWriter { mWriter.write("\n"); mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { - throw new RuntimeException("Unable to write the VCF object to " + getStreamName()); + throw new RuntimeException("Unable to write the VCF object to " + getStreamName(), e); } } From 4749fc114ff3337ae6b9ddc4bfd2ae30390de7d3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 22 Sep 2012 16:22:42 -0400 Subject: [PATCH 22/35] Temp. disable -nt > 1 and -nct > 1 while bugs are worked out --- .../org/broadinstitute/sting/gatk/executive/MicroScheduler.java | 2 ++ .../sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java | 2 ++ 2 files changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index a256c8a97..1555da494 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -145,6 +145,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + "available processors on this machine %d", threadAllocation.getTotalNumThreads(), Runtime.getRuntime().availableProcessors())); + if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1) + throw new UserException("The GATK currently doesn't support running with both -nt > 1 and -nct > 1"); } if ( threadAllocation.getNumDataThreads() > 1 ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index d19a58b3a..674b0d4de 100755 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -19,6 +19,8 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nt : Arrays.asList(1, 2) ) for ( final int nct : Arrays.asList(1, 2) ) { + if ( nt > 1 && nct > 1 ) + continue; // TODO -- remove me when we support -nct and -nt together // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); tests.add(new Object[]{ "BOTH", "081d077786ac0af24e9f97259a55209c", nt, nct }); From a6b3497eacebb8d7d06684675744761dce9af044 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 23 Sep 2012 18:02:48 -0400 Subject: [PATCH 23/35] Fixes GSA-515 Nanoscheduler GSA-577 -nt and -nct together appear to not close resources properly -- Fixes monster bug in the way that traversal engines interacted with the NanoScheduler via the output tracker. -- ThreadLocalOutputTracker is now a ThreadBasedOutputTracker that associates via a map from a master thread -> the storage map. 
Lookups occur by walking through threads in the same thread group, not just the thread itself (TBD -- should have a map from ThreadGroup instead) -- Removed unnecessary debug statement in GenomeLocParser -- nt and nct officially work together now --- .../executive/HierarchicalMicroScheduler.java | 48 ++++- .../gatk/executive/LinearMicroScheduler.java | 4 +- .../sting/gatk/executive/MicroScheduler.java | 52 +++-- .../sting/gatk/executive/ShardTraverser.java | 27 ++- .../gatk/io/ThreadBasedOutputTracker.java | 182 ++++++++++++++++++ .../gatk/io/ThreadLocalOutputTracker.java | 151 --------------- .../storage/VariantContextWriterStorage.java | 7 +- .../sting/utils/GenomeLocParser.java | 2 - .../NanoSchedulerIntegrationTest.java | 2 - 9 files changed, 283 insertions(+), 192 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/io/ThreadBasedOutputTracker.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 01c4315f2..dca2ecb7b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -7,13 +7,13 @@ import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; -import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; +import org.broadinstitute.sting.gatk.io.ThreadBasedOutputTracker; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import 
org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.threading.EfficiencyMonitoringThreadFactory; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor; import java.util.Collection; @@ -39,7 +39,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar /** * A thread local output tracker for managing output per-thread. */ - private ThreadLocalOutputTracker outputTracker = new ThreadLocalOutputTracker(); + private ThreadBasedOutputTracker outputTracker = new ThreadBasedOutputTracker(); private final Queue reduceTasks = new LinkedList(); @@ -93,11 +93,23 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar final int nThreadsToUse = threadAllocation.getNumDataThreads(); if ( threadAllocation.monitorThreadEfficiency() ) { - final EfficiencyMonitoringThreadFactory monitoringThreadFactory = new EfficiencyMonitoringThreadFactory(nThreadsToUse); - setThreadEfficiencyMonitor(monitoringThreadFactory); - this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, monitoringThreadFactory); - } else { - this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); + throw new UserException.BadArgumentValue("nt", "Cannot monitor thread efficiency with -nt, sorry"); + } + + this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, new UniqueThreadGroupThreadFactory()); + } + + /** + * Creates threads for HMS each with a unique thread group. Critical to + * track outputs via the ThreadBasedOutputTracker. 
+ */ + private static class UniqueThreadGroupThreadFactory implements ThreadFactory { + int counter = 0; + + @Override + public Thread newThread(Runnable r) { + final ThreadGroup group = new ThreadGroup("HMS-group-" + counter++); + return new Thread(group, r); } } @@ -253,6 +265,9 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar protected void mergeExistingOutput( boolean wait ) { long startTime = System.currentTimeMillis(); +// logger.warn("MergingExistingOutput"); +// printOutputMergeTasks(); + // Create a list of the merge tasks that will be performed in this run of the mergeExistingOutput(). Queue mergeTasksInSession = new LinkedList(); while( !outputMergeTasks.isEmpty() ) { @@ -266,8 +281,12 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar mergeTasksInSession.add(traverser); } +// logger.warn("Selected things to merge:"); +// printOutputMergeTasks(mergeTasksInSession); + // Actually run through, merging the tasks in the working queue. for( ShardTraverser traverser: mergeTasksInSession ) { + //logger.warn("*** Merging " + traverser.getIntervalsString()); if( !traverser.isComplete() ) traverser.waitForComplete(); @@ -312,11 +331,24 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar reduceTree.addEntry(traverseResult); outputMergeTasks.add(traverser); +// logger.warn("adding merge task"); +// printOutputMergeTasks(); + // No more data? Let the reduce tree know so it can finish processing what it's got. if (!isShardTraversePending()) reduceTree.complete(); } + private synchronized void printOutputMergeTasks() { + printOutputMergeTasks(outputMergeTasks); + } + + private synchronized void printOutputMergeTasks(final Queue tasks) { + logger.info("Output merge tasks " + tasks.size()); + for ( final ShardTraverser traverser : tasks ) + logger.info(String.format("\t%s: complete? 
%b", traverser.getIntervalsString(), traverser.isComplete())); + } + /** Pulls the next reduce from the queue and runs it. */ protected void queueNextTreeReduce( Walker walker ) { if (reduceTasks.size() == 0) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 09b18bfe1..5b94e0767 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -61,7 +61,7 @@ public class LinearMicroScheduler extends MicroScheduler { boolean done = walker.isDone(); int counter = 0; - final TraversalEngine traversalEngine = borrowTraversalEngine(); + final TraversalEngine traversalEngine = borrowTraversalEngine(this); for (Shard shard : shardStrategy ) { if ( done || shard == null ) // we ran out of shards that aren't owned break; @@ -97,7 +97,7 @@ public class LinearMicroScheduler extends MicroScheduler { Object result = accumulator.finishTraversal(); outputTracker.close(); - returnTraversalEngine(traversalEngine); + returnTraversalEngine(this, traversalEngine); cleanup(); executionIsDone(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 1555da494..5b1230c78 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -51,10 +51,7 @@ import javax.management.MBeanServer; import javax.management.ObjectName; import java.io.File; import java.lang.management.ManagementFactory; -import java.util.Collection; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -94,6 +91,11 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ final 
LinkedList availableTraversalEngines = new LinkedList(); + /** + * Engines that have been allocated to a key already. + */ + final HashMap allocatedTraversalEngines = new HashMap(); + /** * Counts the number of instances of the class that are currently alive. */ @@ -145,8 +147,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + "available processors on this machine %d", threadAllocation.getTotalNumThreads(), Runtime.getRuntime().availableProcessors())); - if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1) - throw new UserException("The GATK currently doesn't support running with both -nt > 1 and -nct > 1"); +// if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1) +// throw new UserException("The GATK currently doesn't support running with both -nt > 1 and -nct > 1"); } if ( threadAllocation.getNumDataThreads() > 1 ) { @@ -391,21 +393,37 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } /** - * Returns a traversal engine suitable for use in this thread. + * Returns a traversal engine suitable for use, associated with key * - * Pops the next available engine from the available ones maintained by this + * Key is an arbitrary object that is used to retrieve the same traversal + * engine over and over. This can be important in the case where the + * traversal engine has data associated with it in some other context, + * and we need to ensure that the context always sees the same traversal + * engine. This happens in the HierarchicalMicroScheduler, where you want + * the a thread executing traversals to retrieve the same engine each time, + * as outputs are tracked w.r.t. that engine. + * + * If no engine is associated with key yet, pops the next available engine + * from the available ones maintained by this * microscheduler. 
Note that it's a runtime error to pop a traversal engine * from this scheduler if there are none available. Callers that * once pop'd an engine for use must return it with returnTraversalEngine * + * @param key the key to associate with this engine * @return a non-null TraversalEngine suitable for execution in this scheduler */ @Ensures("result != null") - protected synchronized TraversalEngine borrowTraversalEngine() { - if ( availableTraversalEngines.isEmpty() ) - throw new IllegalStateException("no traversal engines were available"); - else { - return availableTraversalEngines.pop(); + protected synchronized TraversalEngine borrowTraversalEngine(final Object key) { + if ( key == null ) throw new IllegalArgumentException("key cannot be null"); + + final TraversalEngine engine = allocatedTraversalEngines.get(key); + if ( engine == null ) { + if ( availableTraversalEngines.isEmpty() ) + throw new IllegalStateException("no traversal engines were available"); + allocatedTraversalEngines.put(key, availableTraversalEngines.pop()); + return allocatedTraversalEngines.get(key); + } else { + return engine; } } @@ -413,14 +431,18 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * Return a borrowed traversal engine to this MicroScheduler, for later use * in another traversal execution * + * @param key the key used to id the engine, provided to the borrowTraversalEngine function * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. */ - protected synchronized void returnTraversalEngine(final TraversalEngine traversalEngine) { + protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) { if ( traversalEngine == null ) throw new IllegalArgumentException("Attempting to push a null traversal engine"); if ( ! 
allCreatedTraversalEngines.contains(traversalEngine) ) throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); + if ( ! allocatedTraversalEngines.containsKey(key) ) + throw new IllegalArgumentException("No traversal engine was never checked out with key " + key); - availableTraversalEngines.push(traversalEngine); + // note there's nothing to actually do here, but a function implementation + // might want to do something } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index e6f539614..6d165f76a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -4,9 +4,10 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; +import org.broadinstitute.sting.gatk.io.ThreadBasedOutputTracker; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.concurrent.Callable; @@ -29,7 +30,7 @@ public class ShardTraverser implements Callable { final private HierarchicalMicroScheduler microScheduler; final private Walker walker; final private Shard shard; - final private ThreadLocalOutputTracker outputTracker; + final private ThreadBasedOutputTracker outputTracker; private OutputMergeTask outputMergeTask; /** our log, which we want to capture anything from this class */ @@ -43,7 +44,7 @@ public class ShardTraverser implements 
Callable { public ShardTraverser( HierarchicalMicroScheduler microScheduler, Walker walker, Shard shard, - ThreadLocalOutputTracker outputTracker) { + ThreadBasedOutputTracker outputTracker) { this.microScheduler = microScheduler; this.walker = walker; this.shard = shard; @@ -51,13 +52,15 @@ public class ShardTraverser implements Callable { } public Object call() { - final TraversalEngine traversalEngine = microScheduler.borrowTraversalEngine(); + final Object traversalEngineKey = Thread.currentThread(); + final TraversalEngine traversalEngine = microScheduler.borrowTraversalEngine(traversalEngineKey); + try { final long startTime = System.currentTimeMillis(); - // this is CRITICAL -- initializes the thread-local output maps in the parent thread, - // so that any subthreads created by the traversal itself are shared... - outputTracker.getStorageAndInitializeIfNecessary(); + // this is CRITICAL -- initializes output maps in this master thread, + // so that any subthreads created by the traversal itself can access this map + outputTracker.initializeStorage(); Object accumulator = walker.reduceInit(); final WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(), @@ -85,12 +88,20 @@ public class ShardTraverser implements Callable { } finally { synchronized(this) { complete = true; - microScheduler.returnTraversalEngine(traversalEngine); + microScheduler.returnTraversalEngine(traversalEngineKey, traversalEngine); notifyAll(); } } } + /** + * Return a human readable string describing the intervals this traverser is operating on + * @return + */ + public String getIntervalsString() { + return Utils.join(",", shard.getGenomeLocs()); + } + /** * Has this traversal completed? * @return True if completed, false otherwise. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadBasedOutputTracker.java b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadBasedOutputTracker.java new file mode 100644 index 000000000..f26d0c954 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadBasedOutputTracker.java @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.io; + +import org.broadinstitute.sting.gatk.executive.OutputMergeTask; +import org.broadinstitute.sting.gatk.io.storage.Storage; +import org.broadinstitute.sting.gatk.io.storage.StorageFactory; +import org.broadinstitute.sting.gatk.io.stubs.Stub; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +/** + * An output tracker that can either track its output per-thread or directly. + * + * This output tracker doesn't use thread local values, but rather looks up the + * storage map via the thread's group. This is necessary in the case where + * there's a master thread that creates the output map, and spawns subthreads + * that actually do work. As long as those subthreads are spawned in the + * thread group of the master thread, this tracker will properly find the + * storage map associated with the master thread in the group, and return + * the map to all subthreads. + * + * @author mhanna, depristo + * @version 0.2 + */ +public class ThreadBasedOutputTracker extends OutputTracker { + /** + * A map from thread ID of the master thread to the storage map from + * Stub to Storage objects + */ + private Map> threadsToStorage = new HashMap>(); + + /** + * A total hack. If bypass = true, bypass thread local storage and write directly + * to the target file. Used to handle output during initialize() and onTraversalDone(). + */ + private boolean bypass = false; + public void bypassThreadLocalStorage(boolean bypass) { + this.bypass = bypass; + } + + /** + * Initialize the storage map for this thread. + * + * Checks if there's a thread local binding for this thread, and if + * not initializes the map for it. This map is then + * populated with stub -> storage bindings according to the + * superclasses' outputs map. 
+ * + * Must be called within the master thread to create a map associated with + * the master thread ID. + */ + public synchronized void initializeStorage() { + final long threadID = Thread.currentThread().getId(); + Map threadLocalOutputStreams = threadsToStorage.get(threadID); + + if( threadLocalOutputStreams == null ) { + threadLocalOutputStreams = new HashMap(); + threadsToStorage.put( threadID, threadLocalOutputStreams ); + } + + for ( final Stub stub : outputs.keySet() ) { + final Storage target = StorageFactory.createStorage(stub, createTempFile(stub)); + threadLocalOutputStreams.put(stub, target); + } + } + + @Override + public T getStorage( final Stub stub ) { + Storage target; + + if (bypass) { + target = outputs.get(stub); + if( target == null ) { + target = StorageFactory.createStorage(stub); + outputs.put(stub, target); + } + } + else { + final Map threadLocalOutputStreams = findStorage(Thread.currentThread()); + target = threadLocalOutputStreams.get(stub); + + // make sure something hasn't gone wrong, and we somehow find a map that doesn't include our stub + if ( target == null ) + throw new ReviewedStingException("target isn't supposed to be null for " + Thread.currentThread() + + " id " + Thread.currentThread().getId() + " map is " + threadLocalOutputStreams); + } + + return (T)target; + } + + + final Thread[] members = new Thread[1000]; // TODO -- dangerous -- fixme + private synchronized Map findStorage(final Thread thread) { + final Map map = threadsToStorage.get(thread.getId()); + if ( map != null ) { + return map; + } else { + final ThreadGroup tg = thread.getThreadGroup(); + final int nInfo = tg.enumerate(members); + if ( nInfo == members.length ) + throw new ReviewedStingException("too many threads in thread-group " + tg + " to safely get info. 
" + + "Maximum allowed threads is " + members.length); + + for ( int i = 0; i < nInfo; i++ ) { + final Map map2 = threadsToStorage.get(members[i].getId()); + if ( map2 != null ) + return map2; + } + + // something is terribly wrong, we have a storage lookup for a thread that doesn't have + // any map data associated with it! + throw new ReviewedStingException("Couldn't find storage map associated with thread " + thread + " id " + thread.getId()); + } + } + + /** + * Close down any existing temporary files which have been opened. + */ + public synchronized OutputMergeTask closeStorage() { + final Map threadLocalOutputStreams = findStorage(Thread.currentThread()); + + if( threadLocalOutputStreams == null || threadLocalOutputStreams.isEmpty() ) + return null; + + final OutputMergeTask outputMergeTask = new OutputMergeTask(); + for( Map.Entry entry: threadLocalOutputStreams.entrySet() ) { + final Stub stub = entry.getKey(); + final Storage storageEntry = entry.getValue(); + + storageEntry.close(); + outputMergeTask.addMergeOperation(getTargetStream(stub), storageEntry); + } + +// logger.info("Closing " + Thread.currentThread().getId() + " => " + threadLocalOutputStreams); + threadLocalOutputStreams.clear(); + + return outputMergeTask; + } + + /** + * Creates a temporary file for a stub of the given type. + * @param stub Stub for which to create a temporary file. + * @param Type of the stub to accept. + * @return A temp file, or throw an exception if the temp file cannot be created. 
+ */ + private File createTempFile( Stub stub ) { + try { + return File.createTempFile( stub.getClass().getName(), null ); + } catch( IOException ex ) { + throw new UserException.BadTmpDir("Unable to create temporary file for stub: " + stub.getClass().getName() ); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java deleted file mode 100644 index e1e42a9a1..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.io; - -import org.broadinstitute.sting.gatk.executive.OutputMergeTask; -import org.broadinstitute.sting.gatk.io.storage.Storage; -import org.broadinstitute.sting.gatk.io.storage.StorageFactory; -import org.broadinstitute.sting.gatk.io.stubs.Stub; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -/** - * An output tracker that can either track its output per-thread or directly, - * - * @author mhanna, depristo - * @version 0.2 - */ -public class ThreadLocalOutputTracker extends OutputTracker { - /** - * Thread-local storage for output streams. - * - * MUST BE A INHERITABLE THREAD LOCAL - * -- NanoScheduler creates subthreads, and these threads must inherit the binding from their parent - */ - private ThreadLocal> storage = new InheritableThreadLocal>(); - - /** - * A total hack. If bypass = true, bypass thread local storage and write directly - * to the target file. Used to handle output during initialize() and onTraversalDone(). - */ - private boolean bypass = false; - public void bypassThreadLocalStorage(boolean bypass) { - this.bypass = bypass; - } - - /** - * Initialize the storage map for this thread, if necessary. - * - * Checks if there's a thread local binding for this thread, and if - * not initializes it. - * - * Particularly useful in the case where we want to initialize the map in - * a parent thread but have it used available to all the children via - * the InheritedThreadLocal map. 
- * - * @return the storage - */ - public Map getStorageAndInitializeIfNecessary() { - Map threadLocalOutputStreams = storage.get(); - - if( threadLocalOutputStreams == null ) { - threadLocalOutputStreams = new HashMap(); - storage.set( threadLocalOutputStreams ); - } - - return threadLocalOutputStreams; - } - - public T getStorage( Stub stub ) { - Storage target; - - if(bypass) { - target = outputs.get(stub); - if( target == null ) { - target = StorageFactory.createStorage(stub); - outputs.put(stub, target); - } - } - else { - final Map threadLocalOutputStreams = getStorageAndInitializeIfNecessary(); - - target = threadLocalOutputStreams.get(stub); - if( target == null ) { - target = StorageFactory.createStorage(stub, createTempFile(stub)); - threadLocalOutputStreams.put(stub, target); - } - } - - return (T)target; - } - - /** - * Close down any existing temporary files which have been opened. - */ - public OutputMergeTask closeStorage() { - Map threadLocalOutputStreams = storage.get(); - - if( threadLocalOutputStreams == null || threadLocalOutputStreams.isEmpty() ) - return null; - - OutputMergeTask outputMergeTask = new OutputMergeTask(); - for( Map.Entry entry: threadLocalOutputStreams.entrySet() ) { - Stub stub = entry.getKey(); - Storage storageEntry = entry.getValue(); - - storageEntry.close(); - outputMergeTask.addMergeOperation(getTargetStream(stub),storageEntry); - } - - threadLocalOutputStreams.clear(); - - return outputMergeTask; - } - - /** - * Creates a temporary file for a stub of the given type. - * @param stub Stub for which to create a temporary file. - * @param Type of the stub to accept. - * @return A temp file, or throw an exception if the temp file cannot be created. 
- */ - private File createTempFile( Stub stub ) { - File tempFile = null; - - try { - tempFile = File.createTempFile( stub.getClass().getName(), null ); - //tempFile.deleteOnExit(); - } - catch( IOException ex ) { - throw new UserException.BadTmpDir("Unable to create temporary file for stub: " + stub.getClass().getName() ); - } - - return tempFile; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java index 28ea69f4c..c6438cfdb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java @@ -89,7 +89,7 @@ public class VariantContextWriterStorage implements Storage 1 && nct > 1 ) - continue; // TODO -- remove me when we support -nct and -nt together // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); tests.add(new Object[]{ "BOTH", "081d077786ac0af24e9f97259a55209c", nt, nct }); From 3e8d9928287b2f7614976a4b539a86baaf5f4c8d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 23 Sep 2012 18:13:44 -0400 Subject: [PATCH 24/35] Remove bad error test from MicroScheduler, as it's no longer applicable. 
--- .../sting/gatk/executive/MicroScheduler.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 5b1230c78..07d9df79a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -319,10 +319,11 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * pointers to the traversal engines */ public synchronized void shutdownTraversalEngines() { - if ( availableTraversalEngines.size() != allCreatedTraversalEngines.size() ) - throw new IllegalStateException("Shutting down TraversalEngineCreator but not all engines " + - "have been returned. Expected " + allCreatedTraversalEngines.size() + " but only " + availableTraversalEngines.size() - + " have been returned"); + // no longer applicable because engines are allocated to keys now +// if ( availableTraversalEngines.size() != allCreatedTraversalEngines.size() ) +// throw new IllegalStateException("Shutting down TraversalEngineCreator but not all engines " + +// "have been returned. Expected " + allCreatedTraversalEngines.size() + " but only " + availableTraversalEngines.size() +// + " have been returned"); for ( final TraversalEngine te : allCreatedTraversalEngines) te.shutdown(); From 9fd30d6f1c326bd0625a5b7fef24751dc1d03f80 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 23 Sep 2012 18:19:10 -0400 Subject: [PATCH 25/35] When writing the initial commit for nt + nct I realized this class was really just a ThreadGroupOutputTracker -- The code is cleaner and the logical more obvious now. 
--- .../executive/HierarchicalMicroScheduler.java | 6 ++-- .../sting/gatk/executive/ShardTraverser.java | 6 ++-- ...ker.java => ThreadGroupOutputTracker.java} | 28 ++++++------------- 3 files changed, 14 insertions(+), 26 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/io/{ThreadBasedOutputTracker.java => ThreadGroupOutputTracker.java} (86%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index dca2ecb7b..31f2a469c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -7,7 +7,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; -import org.broadinstitute.sting.gatk.io.ThreadBasedOutputTracker; +import org.broadinstitute.sting.gatk.io.ThreadGroupOutputTracker; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -39,7 +39,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar /** * A thread local output tracker for managing output per-thread. */ - private ThreadBasedOutputTracker outputTracker = new ThreadBasedOutputTracker(); + private ThreadGroupOutputTracker outputTracker = new ThreadGroupOutputTracker(); private final Queue reduceTasks = new LinkedList(); @@ -101,7 +101,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar /** * Creates threads for HMS each with a unique thread group. Critical to - * track outputs via the ThreadBasedOutputTracker. 
+ * track outputs via the ThreadGroupOutputTracker. */ private static class UniqueThreadGroupThreadFactory implements ThreadFactory { int counter = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 6d165f76a..d9a694846 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -4,7 +4,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.io.ThreadBasedOutputTracker; +import org.broadinstitute.sting.gatk.io.ThreadGroupOutputTracker; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.Utils; @@ -30,7 +30,7 @@ public class ShardTraverser implements Callable { final private HierarchicalMicroScheduler microScheduler; final private Walker walker; final private Shard shard; - final private ThreadBasedOutputTracker outputTracker; + final private ThreadGroupOutputTracker outputTracker; private OutputMergeTask outputMergeTask; /** our log, which we want to capture anything from this class */ @@ -44,7 +44,7 @@ public class ShardTraverser implements Callable { public ShardTraverser( HierarchicalMicroScheduler microScheduler, Walker walker, Shard shard, - ThreadBasedOutputTracker outputTracker) { + ThreadGroupOutputTracker outputTracker) { this.microScheduler = microScheduler; this.walker = walker; this.shard = shard; diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadBasedOutputTracker.java b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java similarity 
index 86% rename from public/java/src/org/broadinstitute/sting/gatk/io/ThreadBasedOutputTracker.java rename to public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java index f26d0c954..fdfe494a7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadBasedOutputTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java @@ -51,12 +51,12 @@ import java.util.Map; * @author mhanna, depristo * @version 0.2 */ -public class ThreadBasedOutputTracker extends OutputTracker { +public class ThreadGroupOutputTracker extends OutputTracker { /** * A map from thread ID of the master thread to the storage map from * Stub to Storage objects */ - private Map> threadsToStorage = new HashMap>(); + private Map> threadsToStorage = new HashMap>(); /** * A total hack. If bypass = true, bypass thread local storage and write directly @@ -79,12 +79,12 @@ public class ThreadBasedOutputTracker extends OutputTracker { * the master thread ID. */ public synchronized void initializeStorage() { - final long threadID = Thread.currentThread().getId(); - Map threadLocalOutputStreams = threadsToStorage.get(threadID); + final ThreadGroup group = Thread.currentThread().getThreadGroup(); + Map threadLocalOutputStreams = threadsToStorage.get(group); if( threadLocalOutputStreams == null ) { threadLocalOutputStreams = new HashMap(); - threadsToStorage.put( threadID, threadLocalOutputStreams ); + threadsToStorage.put( group, threadLocalOutputStreams ); } for ( final Stub stub : outputs.keySet() ) { @@ -118,27 +118,15 @@ public class ThreadBasedOutputTracker extends OutputTracker { } - final Thread[] members = new Thread[1000]; // TODO -- dangerous -- fixme private synchronized Map findStorage(final Thread thread) { - final Map map = threadsToStorage.get(thread.getId()); + final Map map = threadsToStorage.get(thread.getThreadGroup()); + if ( map != null ) { return map; } else { - final ThreadGroup tg = thread.getThreadGroup(); - final int 
nInfo = tg.enumerate(members); - if ( nInfo == members.length ) - throw new ReviewedStingException("too many threads in thread-group " + tg + " to safely get info. " + - "Maximum allowed threads is " + members.length); - - for ( int i = 0; i < nInfo; i++ ) { - final Map map2 = threadsToStorage.get(members[i].getId()); - if ( map2 != null ) - return map2; - } - // something is terribly wrong, we have a storage lookup for a thread that doesn't have // any map data associated with it! - throw new ReviewedStingException("Couldn't find storage map associated with thread " + thread + " id " + thread.getId()); + throw new ReviewedStingException("Couldn't find storage map associated with thread " + thread + " in group " + thread.getThreadGroup()); } } From 0b488cce669ac294a9d3212d5d19423ca256dc7a Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 24 Sep 2012 14:45:53 -0400 Subject: [PATCH 26/35] ExperimentalReadShardBalancer: close() exhausted iterators Fixes a truly awful SAMReaders resource leak reported by Eric -- thanks Eric! --- .../gatk/datasources/reads/ExperimentalReadShardBalancer.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java index 73719cbb0..4d1d2a533 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java @@ -89,6 +89,10 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): if ( currentFilePointerReadsIterator != null && ! 
currentFilePointerReadsIterator.hasNext() ) { + + // Close the old, exhausted chain of iterators to release resources + currentFilePointerReadsIterator.close(); + do { advanceFilePointer(); } while ( currentFilePointer != null && isEmpty(currentFilePointer.fileSpans) ); // skip empty file pointers From 3f44b3e01939e2a5f4ca33cdaf05548a64e5efd4 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 24 Sep 2012 15:38:07 -0400 Subject: [PATCH 27/35] Update DataProcessingPipelineTest MD5s --- .../sting/queue/pipeline/DataProcessingPipelineTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala index 3fb9e0efa..19f00ac62 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -41,7 +41,7 @@ class DataProcessingPipelineTest { " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf", " -test ", " -p " + projectName).mkString - spec.fileMD5s += testOut -> "60d39ae909fdd049920b54e0965b6d3c" + spec.fileMD5s += testOut -> "45d97df6d291695b92668e8a55c54cd0" PipelineTest.executeTest(spec) } @@ -60,7 +60,7 @@ class DataProcessingPipelineTest { " -bwa /home/unix/carneiro/bin/bwa", " -bwape ", " -p " + projectName).mkString - spec.fileMD5s += testOut -> "61ca3237afdfabf78ee27a5bb80dae59" + spec.fileMD5s += testOut -> "6e70efbe6bafc3fedd60bd406bd201db" PipelineTest.executeTest(spec) } From 11a71e0390c9fc96628976b794d737c1e25ef5e3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 24 Sep 2012 21:46:14 -0400 Subject: [PATCH 28/35] RR bug: when determining the most common base at a position, break ties by which base has the highest sum of base qualities. Otherwise, sites with 1 Q2 N and 1 Q30 C are ending up as Ns in the consensus. 
I think perhaps we don't even care about which base has the most observations - it should just be determined by which has the highest sum of base qualities - but I'm not sure that's what users would expect. --- .../compression/reducereads/BaseCounts.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 0e434b4af..53c36c3f9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -7,7 +7,7 @@ import java.util.EnumMap; import java.util.Map; /** - * An object to keep track of the number of occurences of each base and it's quality. + * An object to keep track of the number of occurrences of each base and its quality.
* * User: depristo * Date: 4/8/11 @@ -83,8 +83,6 @@ import java.util.Map; } } - - @Ensures("result >= 0") public int getCount(byte base) { return getCount(BaseIndex.byteToBase(base)); @@ -183,7 +181,7 @@ import java.util.Map; public BaseIndex baseIndexWithMostCounts() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; for (BaseIndex i : counts.keySet()) - if (counts.get(i) > counts.get(maxI)) + if (hasHigherCount(i, maxI)) maxI = i; return maxI; } @@ -192,17 +190,23 @@ import java.util.Map; public BaseIndex baseIndexWithMostCountsWithoutIndels() { BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS; for (BaseIndex index : counts.keySet()) - if (index.isNucleotide() && counts.get(index) > counts.get(mostCounts)) + if (index.isNucleotide() && hasHigherCount(index, mostCounts)) mostCounts = index; return mostCounts; } + private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) { + final int targetCount = counts.get(targetIndex); + final int testCount = counts.get(testIndex); + return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) ); + } + @Ensures("result >=0") public int totalCountWithoutIndels() { int sum = 0; - for (BaseIndex index : counts.keySet()) - if (index.isNucleotide()) - sum += counts.get(index); + for (Map.Entry entry : counts.entrySet()) + if (entry.getKey().isNucleotide()) + sum += entry.getValue(); return sum; } @@ -222,6 +226,6 @@ import java.util.Map; } public Object[] countsArray() { - return (Object []) counts.values().toArray(); + return counts.values().toArray(); } } From 55cdf4f9b77dae730eb6ffa2af4e07a48b462726 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 27 Sep 2012 00:13:32 -0400 Subject: [PATCH 33/35] Commit changes in Variants To Binary Ped to the stable repository to be available prior to next release. 
--- .../variantutils/VariantsToBinaryPed.java | 179 ++++++++++++------ .../variantcontext/GenotypeLikelihoods.java | 27 +++ .../VariantsToBinaryPedIntegrationTest.java | 117 ++++++++++++ .../GenotypeLikelihoodsUnitTest.java | 26 +++ 4 files changed, 296 insertions(+), 53 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 3fba8fa77..37fc96681 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -7,7 +7,9 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -15,6 +17,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -34,6 +37,28 @@ public 
class VariantsToBinaryPed extends RodWalker { @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + /** + * The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This + * is what Plink describes as a fam file. An example fam file is (note that there is no header): + * + * CEUTrio NA12878 NA12891 NA12892 2 -9 + * CEUTrio NA12891 UNKN1 UNKN2 2 -9 + * CEUTrio NA12892 UNKN3 UNKN4 1 -9 + * + * where the entries are (FamilyID IndividualID DadID MomID Sex Phenotype) + * + * An alternate format is a two-column key-value file + * + * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9 + * NA12891 fid=CEUTrio;sex=2;phenotype=-9 + * NA12892 fid=CEUTrio;sex=1;phenotype=-9 + * + * wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs. + * + * Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the + * command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the + * alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data. + */ @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file.
You may specify a .fam file " + "(in which case it will be copied to the file you provide as fam output).") File metaDataFile; @@ -76,47 +101,11 @@ public class VariantsToBinaryPed extends RodWalker { private List famOrder = new ArrayList(); public void initialize() { - vv.variantCollection = variantCollection; - vv.dbsnp = dbsnp; - vv.DO_NOT_VALIDATE_FILTERED = true; - vv.type = ValidateVariants.ValidationType.REF; + initializeValidator(); + writeBedHeader(); + Map> sampleMetaValues = parseMetaData(); // create temporary output streams and buffers - // write magic bits into the ped file - try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); - // ultimately, the bed will be in individual-major mode - } catch (IOException e) { - throw new ReviewedStingException("error writing to output file."); - } - // write to the fam file, the first six columns of the standard ped file - // first, load data from the input meta data file - Map> metaValues = new HashMap>(); - logger.debug("Reading in metadata..."); - try { - if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { - for ( String line : new XReadLines(metaDataFile) ) { - String[] famSplit = line.split("\\t"); - String sid = famSplit[1]; - outFam.printf("%s%n",line); - } - } else { - for ( String line : new XReadLines(metaDataFile) ) { - logger.debug(line); - String[] split = line.split("\\t"); - String sampleID = split[0]; - String keyVals = split[1]; - HashMap values = new HashMap(); - for ( String kvp : keyVals.split(";") ) { - String[] kvp_split = kvp.split("="); - values.put(kvp_split[0],kvp_split[1]); - } - metaValues.put(sampleID,values); - } - } - } catch (FileNotFoundException e) { - throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); - } // family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype int dummyID = 0; // increments for dummy parental and family IDs used // want to be especially careful to maintain order here @@ -126,21 +115,29 @@ public 
class VariantsToBinaryPed extends RodWalker { continue; } for ( String sample : header.getValue().getGenotypeSamples() ) { - Map mVals = metaValues.get(sample); - if ( mVals == null ) { - throw new UserException("No metadata provided for sample "+sample); + if ( ! metaDataFile.getAbsolutePath().endsWith(".fam") ) { + Map mVals = sampleMetaValues.get(sample); + if ( mVals == null ) { + throw new UserException("No metadata provided for sample "+sample); + } + if ( ! mVals.containsKey("phenotype") ) { + throw new UserException("No phenotype data provided for sample "+sample); + } + String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); + String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); + String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); + String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; + String pheno = mVals.get("phenotype"); + outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); + } else { + // even if a fam file is input, we can't diverge the bed file from the fam file, which + // could lead to a malformed plink trio. Fail fast if there's any extra sample in the VCF. + if ( ! sampleMetaValues.containsKey(sample) ) { + throw new UserException("No metadata provided for sample "+sample); + } } - if ( ! mVals.containsKey("phenotype") ) { - throw new UserException("No phenotype data provided for sample "+sample); - } - String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); - String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); - String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); - String sex = mVals.containsKey("sex") ? 
mVals.get("sex") : "3"; - String pheno = mVals.get("phenotype"); - outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); try { - File temp = File.createTempFile(sample, ".tmp"); + File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); printMap.put(sample,new PrintStream(temp)); tempFiles.put(sample,temp); } catch (IOException e) { @@ -216,6 +213,7 @@ public class VariantsToBinaryPed extends RodWalker { // reset the buffer for this sample genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); } + byteCount = 0; } genotypeCount = 0; } @@ -305,7 +303,7 @@ public class VariantsToBinaryPed extends RodWalker { private byte getFlippedEncoding(Genotype g, int offset) { byte b; - if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) { + if ( ! checkGQIsGood(g) ) { b = NO_CALL; } else if ( g.isHomRef() ) { b = HOM_VAR; @@ -320,6 +318,16 @@ public class VariantsToBinaryPed extends RodWalker { return (byte) (b << (2*offset)); } + private boolean checkGQIsGood(Genotype genotype) { + if ( genotype.hasGQ() ) { + return genotype.getGQ() >= minGenotypeQuality; + } else if ( genotype.hasLikelihoods() ) { + return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality; + } + + return false; + } + private static String getID(VariantContext v) { if ( v.hasID() ) { return v.getID(); @@ -337,4 +345,69 @@ public class VariantsToBinaryPed extends RodWalker { throw new UserException("Allele frequency appears to be neither String nor Double. 
Please check the header of your VCF."); } } + + private void initializeValidator() { + vv.variantCollection = variantCollection; + vv.dbsnp = dbsnp; + vv.DO_NOT_VALIDATE_FILTERED = true; + vv.type = ValidateVariants.ValidationType.REF; + } + + private void writeBedHeader() { + // write magic bits into the ped file + try { + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + // ultimately, the bed will be in individual-major mode + } catch (IOException e) { + throw new ReviewedStingException("error writing to output file."); + } + } + + private Map> parseMetaData() { + // write to the fam file, the first six columns of the standard ped file + // first, load data from the input meta data file + Map> metaValues = new HashMap>(); + logger.debug("Reading in metadata..."); + try { + if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { + for ( String line : new XReadLines(metaDataFile) ) { + String[] famSplit = line.split("\\s+"); + if ( famSplit.length != 6 ) { + throw new UserException("Line of the fam file is malformatted. Expected 6 entries. 
Line is "+line); + } + String sid = famSplit[1]; + String fid = famSplit[0]; + String mom = famSplit[2]; + String dad = famSplit[3]; + String sex = famSplit[4]; + String pheno = famSplit[5]; + HashMap values = new HashMap(); + values.put("mom",mom); + values.put("dad",dad); + values.put("fid",fid); + values.put("sex",sex); + values.put("phenotype",pheno); + metaValues.put(sid,values); + outFam.printf("%s%n",line); + } + } else { + for ( String line : new XReadLines(metaDataFile) ) { + logger.debug(line); + String[] split = line.split("\\s+"); + String sampleID = split[0]; + String keyVals = split[1]; + HashMap values = new HashMap(); + for ( String kvp : keyVals.split(";") ) { + String[] kvp_split = kvp.split("="); + values.put(kvp_split[0],kvp_split[1]); + } + metaValues.put(sampleID,values); + } + } + } catch (FileNotFoundException e) { + throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); + } + + return metaValues; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 7b4256b70..641eb5449 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.EnumMap; +import java.util.List; public class GenotypeLikelihoods { private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5; @@ -167,10 +168,36 @@ public class GenotypeLikelihoods { //Return the neg log10 Genotype Quality (GQ) for the given genotype //Returns Double.NEGATIVE_INFINITY in case of missing genotype + + /** + * This is really dangerous and returns completely wrong results for genotypes from a multi-allelic context. 
+ * Use getLog10GQ(Genotype,VariantContext) or getLog10GQ(Genotype,List) in place of it. + * + * If you **know** you're biallelic, use getGQLog10FromLikelihoods directly. + * @param genotype - actually a genotype type (no call, hom ref, het, hom var) + * @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best). + */ + @Deprecated public double getLog10GQ(GenotypeType genotype){ return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector()); } + @Requires({"genotypeAlleles != null","genotypeAlleles.size()==2","contextAlleles != null","contextAlleles.size() >= 1"}) + private double getLog10GQ(List genotypeAlleles,List contextAlleles) { + int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0)); + int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1)); + int plIndex = calculatePLindex(allele1Index,allele2Index); + return getGQLog10FromLikelihoods(plIndex,getAsVector()); + } + + public double getLog10GQ(Genotype genotype, List vcAlleles ) { + return getLog10GQ(genotype.getAlleles(),vcAlleles); + } + + public double getLog10GQ(Genotype genotype, VariantContext context) { + return getLog10GQ(genotype,context.getAlleles()); + } + public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){ if(likelihoods == null) return Double.NEGATIVE_INFINITY; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java new file mode 100644 index 000000000..a75da6cf9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -0,0 +1,117 @@ +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import 
org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/20/12 + * Time: 9:57 PM + * To change this template use File | Settings | File Templates. + */ +public class VariantsToBinaryPedIntegrationTest extends WalkerTest { + + public static final String VTBP_DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Validation_Data/VariantsToBinaryPed/"; + + public static String baseTestString(String inputVCF, String inputMetaData, int gq) { + return "-T VariantsToBinaryPed -R " + b37KGReference + + " -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) + + " -bim %s -fam %s -bed %s"; + + } + + @Test + public void testNA12878Alone() { + String testName = "testNA12878Alone"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","8e8bc0b5e69f22c54c0960f13c25d26c","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testNA12878AloneMetaData() { + String testName = "testNA12878AloneMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testCEUTrio() { + String testName = "testCEUTrio"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","900f22c6d49a6ba0774466e99592e51d","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } 
+ + @Test + public void testCEUTrioMetaData() { + String testName = "testCEUTrioMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.metadata.txt",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","2113d2cc0a059e35b1565196b7c5d98f","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } + + @Test + public void testMalformedFam() { + String testName = "testMalformedFam"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.malformed.fam",10), + 3, + UserException.class + ); + + executeTest(testName, spec); + } + + @Test + public void testFailFast() { + String testName = "testFailFast"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("HapMap.testFailFast.vcf", "HapMap_only_famids.fam",10), + 3, + UserException.class + ); + + executeTest(testName, spec); + } + + @Test + public void testFailFastMeta() { + String testName = "testFailFastMeta"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("HapMap.testFailFast.vcf", "HapMap_only_famids.metadata.txt",10), + 3, + UserException.class + ); + + executeTest(testName, spec); + + } +} + + diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 69f42e1f9..4ce32cee7 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -29,12 +29,15 @@ package org.broadinstitute.sting.utils.variantcontext; // the imports for unit testing. 
+import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.Test; +import java.util.Arrays; import java.util.EnumMap; +import java.util.List; /** @@ -44,6 +47,7 @@ public class GenotypeLikelihoodsUnitTest { double [] v = new double[]{-10.5, -1.25, -5.11}; final static String vGLString = "-10.50,-1.25,-5.11"; final static String vPLString = "93,0,39"; + double[] triAllelic = new double[]{-4.2,-2.0,-3.0,-1.6,0.0,-4.0}; //AA,AB,AC,BB,BC,CC @Test public void testFromVector2() { @@ -139,6 +143,28 @@ public class GenotypeLikelihoodsUnitTest { } } + // this test is completely broken, the method is wrong. + public void testGetQualFromLikelihoodsMultiAllelicBroken() { + GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); + double actualGQ = gl.getLog10GQ(GenotypeType.HET); + double expectedGQ = 1.6; + Assert.assertEquals(actualGQ,expectedGQ); + } + + public void testGetQualFromLikelihoodsMultiAllelic() { + GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); + Allele ref = Allele.create(BaseUtils.A,true); + Allele alt1 = Allele.create(BaseUtils.C); + Allele alt2 = Allele.create(BaseUtils.T); + List allAlleles = Arrays.asList(ref,alt1,alt2); + List gtAlleles = Arrays.asList(alt1,alt2); + GenotypeBuilder gtBuilder = new GenotypeBuilder(); + gtBuilder.alleles(gtAlleles); + double actualGQ = gl.getLog10GQ(gtBuilder.make(),allAlleles); + double expectedGQ = 1.6; + Assert.assertEquals(actualGQ,expectedGQ); + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { From e82946e5c95712e1e358168106e81aff903da02f Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 25 Sep 2012 11:37:17 -0400 Subject: [PATCH 34/35] ExperimentalReadShardBalancer: create one monolithic FilePointer 
per contig Merge all FilePointers for each contig into a single, merged, optimized FilePointer representing all regions to visit in all BAM files for a given contig. This helps us in several ways: -It allows us to create a single, persistent set of iterators for each contig, finally and definitively eliminating all Shard/FilePointer boundary issues for the new experimental ReadWalker downsampling -We no longer need to track low-level file positions in the sharding system (which was no longer possible anyway given the new experimental downsampling system) -We no longer revisit BAM file chunks that we've visited in the past -- all BAM file access is purely sequential -We no longer need to constantly recreate our full chain of read iterators There are also potential dangers: -We hold more BAM index data in memory at once. Given that we merge and optimize the index data during the merge, and only hold one contig's worth of data at a time, this does not appear to be a major issue. TODO: confirm this! -With a huge number of samples and intervals, the FilePointer merge operation might become expensive. With the latest implementation, this does not appear to be an issue even with a huge number of intervals (for one sample, at least), but if it turns out to be a problem for > 1 sample there are things we can do. 
Still TODO: unit tests for the new FilePointer.union() method --- .../reads/ExperimentalReadShardBalancer.java | 160 ++++++++++-------- .../gatk/datasources/reads/FilePointer.java | 94 +++++++++- 2 files changed, 179 insertions(+), 75 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java index 4d1d2a533..6c064cf86 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java @@ -25,15 +25,40 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.SAMFileSpan; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import java.util.*; /** - * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards + * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards. + * + * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig + * together into one monolithic FilePointer, create one persistent set of read iterators over + * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to + * fill read shards with reads. + * + * This strategy has several important advantages: + * + * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole + * contig will have regions that overlap with other FilePointers on the same contig, due + * to the limited granularity of BAM index data. 
By creating only one FilePointer per contig, + * we avoid having to track how much of each file region we've visited (as we did in the + * former implementation), we avoid expensive non-sequential access patterns in the files, + * and we avoid having to repeatedly re-create our iterator chain for every small region + * of interest. + * + * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single + * persistent set of read iterators (which include the downsampling iterator(s)) per contig, + * the downsampling process is never interrupted by FilePointer or Shard boundaries, and never + * loses crucial state information while downsampling within a contig. + * + * TODO: There is also at least one important disadvantage: + * + * 1. We load more BAM index data into memory at once, and this work is done upfront before processing + * the next contig, creating a delay before traversal of each contig. This delay may be + * compensated for by the gains listed in #1 above, and we may be no worse off overall in + * terms of total runtime, but we need to verify this empirically. * * @author David Roazen */ @@ -55,17 +80,16 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { /** * The file pointer currently being processed. */ - private FilePointer currentFilePointer = null; + private FilePointer currentContigFilePointer = null; /** - * Iterator over the reads from the current file pointer. The same iterator will be + * Iterator over the reads from the current contig's file pointer. 
The same iterator will be * used to fill all shards associated with a given file pointer */ - private PeekableIterator currentFilePointerReadsIterator = null; + private PeekableIterator currentContigReadsIterator = null; { - if ( filePointers.hasNext() ) - currentFilePointer = filePointers.next(); + createNextContigFilePointer(); advance(); } @@ -85,93 +109,87 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { nextShard = null; // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away - while ( nextShard == null && currentFilePointer != null ) { + while ( nextShard == null && currentContigFilePointer != null ) { // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): - if ( currentFilePointerReadsIterator != null && ! currentFilePointerReadsIterator.hasNext() ) { + if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) { // Close the old, exhausted chain of iterators to release resources - currentFilePointerReadsIterator.close(); + currentContigReadsIterator.close(); - do { - advanceFilePointer(); - } while ( currentFilePointer != null && isEmpty(currentFilePointer.fileSpans) ); // skip empty file pointers + // Advance to the FilePointer for the next contig + createNextContigFilePointer(); // We'll need to create a fresh iterator for this file pointer when we create the first // shard for it below. - currentFilePointerReadsIterator = null; + currentContigReadsIterator = null; } - // At this point if currentFilePointer is non-null we know it is also non-empty. Our - // currentFilePointerReadsIterator may be null or non-null depending on whether or not + // At this point our currentContigReadsIterator may be null or non-null depending on whether or not // this is our first shard for this file pointer. 
- if ( currentFilePointer != null ) { - Shard shard = new ReadShard(parser,readsDataSource,currentFilePointer.fileSpans,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + if ( currentContigFilePointer != null ) { + Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped); - // Create a new reads iterator only when we've just advanced to a new file pointer. It's - // essential that the iterators persist across all shards that share the same file pointer + // Create a new reads iterator only when we've just advanced to the file pointer for the next + // contig. It's essential that the iterators persist across all shards that share the same contig // to allow the downsampling to work properly. - if ( currentFilePointerReadsIterator == null ) { - currentFilePointerReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); + if ( currentContigReadsIterator == null ) { + currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); } - if ( currentFilePointerReadsIterator.hasNext() ) { - shard.fill(currentFilePointerReadsIterator); + if ( currentContigReadsIterator.hasNext() ) { + shard.fill(currentContigReadsIterator); nextShard = shard; } } } } - private void advanceFilePointer() { - FilePointer previousFilePointer = currentFilePointer; - currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; - - // TODO: This is a purely defensive measure to guard against the possibility of overlap - // TODO: between FilePointers. When overlap is detected, remove the overlapping regions from - // TODO: the newly-current FilePointer. - // TODO: If we later discover that overlap is theoretically impossible, this step becomes - // TODO: unnecessary and should be removed. 
- if ( currentFilePointer != null && previousFilePointer != null && - previousFilePointer.hasFileSpansOverlappingWith(currentFilePointer) ) { - - logger.debug(String.format("%s: found consecutive overlapping FilePointers [%s] and [%s]", getClass().getSimpleName(), previousFilePointer, currentFilePointer)); - - Map previousFileSpans = previousFilePointer.getFileSpans(); - Map trimmedFileSpans = new HashMap(currentFilePointer.getFileSpans().size()); - - for ( Map.Entry fileSpanEntry : currentFilePointer.getFileSpans().entrySet() ) { - // find the corresponding file span from the previous FilePointer - SAMFileSpan previousFileSpan = previousFileSpans.get(fileSpanEntry.getKey()); - - if ( previousFileSpan == null ) { - // no match, so no trimming required - trimmedFileSpans.put(fileSpanEntry.getKey(), fileSpanEntry.getValue()); - } - else { - // match, so remove any overlapping regions (regions before the start of the - // region immediately following the previous file span) - SAMFileSpan trimmedSpan = fileSpanEntry.getValue().removeContentsBefore(previousFileSpan.getContentsFollowing()); - trimmedFileSpans.put(fileSpanEntry.getKey(), trimmedSpan); - } - } - - // Replace the current file pointer with its trimmed equivalent - currentFilePointer = new FilePointer(trimmedFileSpans, currentFilePointer.locations); - } - } - /** - * Detects whether the list of file spans contain any read data. - * @param selectedSpans Mapping of readers to file spans. - * @return True if file spans are completely empty; false otherwise. + * Aggregate all FilePointers for the next contig together into one monolithic FilePointer + * to avoid boundary issues with visiting the same file regions more than once (since more + * granular FilePointers will have regions that overlap with other nearby FilePointers due + * to the nature of BAM indices). + * + * By creating one persistent set of iterators per contig we also avoid boundary artifacts + * in the engine-level downsampling. 
+ * + * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for + * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely + * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should + * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for + * TODO: locus traversals). */ - private boolean isEmpty(Map selectedSpans) { - for(SAMFileSpan fileSpan: selectedSpans.values()) { - if(!fileSpan.isEmpty()) - return false; + private void createNextContigFilePointer() { + currentContigFilePointer = null; + List nextContigFilePointers = new ArrayList(); + + logger.info("Loading BAM index data for next contig"); + + while ( filePointers.hasNext() ) { + // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the + // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge + if ( nextContigFilePointers.isEmpty() || + (! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped && + nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) || + (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { + + nextContigFilePointers.add(filePointers.next()); + } + else { + break; // next FilePointer is on a different contig or has different mapped/unmapped status, + // save it for next time + } + } + + if ( ! 
nextContigFilePointers.isEmpty() ) { + currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser); + } + + if ( currentContigFilePointer != null ) { + logger.info("Done loading BAM index data for next contig"); + logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer)); } - return true; } public void remove() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index b0fbc05bf..50f4e0273 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.GATKChunk; import net.sf.samtools.SAMFileSpan; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Utils; @@ -48,15 +50,19 @@ public class FilePointer { */ protected final boolean isRegionUnmapped; - public FilePointer(final GenomeLoc... locations) { - this.locations.addAll(Arrays.asList(locations)); + public FilePointer( List locations ) { + this.locations.addAll(locations); this.isRegionUnmapped = checkUnmappedStatus(); + validateLocations(); + } + + public FilePointer( final GenomeLoc... 
locations ) { + this(Arrays.asList(locations)); } public FilePointer( Map fileSpans, List locations ) { + this(locations); this.fileSpans.putAll(fileSpans); - this.locations.addAll(locations); - this.isRegionUnmapped = checkUnmappedStatus(); } private boolean checkUnmappedStatus() { @@ -74,6 +80,22 @@ public class FilePointer { return foundUnmapped; } + private void validateLocations() { + if ( isRegionUnmapped ) { + return; + } + + Integer previousContigIndex = null; + + for ( GenomeLoc location : locations ) { + if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) { + throw new ReviewedStingException("File pointers must contain intervals from at most one contig"); + } + + previousContigIndex = location.getContigIndex(); + } + } + /** * Returns an immutable view of this FilePointer's file spans * @@ -91,6 +113,16 @@ public class FilePointer { return Collections.unmodifiableList(locations); } + /** + * Returns the index of the contig into which this FilePointer points (a FilePointer can represent + * regions in at most one contig). + * + * @return the index of the contig into which this FilePointer points + */ + public int getContigIndex() { + return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; + } + @Override public boolean equals(final Object other) { if(!(other instanceof FilePointer)) @@ -121,11 +153,13 @@ public class FilePointer { public void addLocation(final GenomeLoc location) { this.locations.add(location); checkUnmappedStatus(); + validateLocations(); } public void addLocations( final List locations ) { this.locations.addAll(locations); checkUnmappedStatus(); + validateLocations(); } public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) { @@ -243,6 +277,58 @@ public class FilePointer { combined.addFileSpans(initialElement.getKey(),fileSpan); } + /** + * Efficiently generate the union of the n FilePointers passed in. 
Much more efficient than + * combining two FilePointers at a time using the combine() method above. + * + * IMPORTANT: the FilePointers to be unioned must either all represent regions on the + * same contig, or all be unmapped, since we cannot create FilePointers with a mix of + * contigs or with mixed mapped/unmapped regions. + * + * @param filePointers the FilePointers to union + * @param parser our GenomeLocParser + * @return the union of the FilePointers passed in + */ + public static FilePointer union( List filePointers, GenomeLocParser parser ) { + if ( filePointers == null || filePointers.isEmpty() ) { + return new FilePointer(); + } + + Map> fileChunks = new HashMap>(); + List locations = new ArrayList(); + + // First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections + for ( FilePointer filePointer : filePointers ) { + locations.addAll(filePointer.getLocations()); + + for ( Map.Entry fileSpanEntry : filePointer.getFileSpans().entrySet() ) { + GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue(); + + if ( fileChunks.containsKey(fileSpanEntry.getKey()) ) { + fileChunks.get(fileSpanEntry.getKey()).addAll(fileSpan.getGATKChunks()); + } + else { + fileChunks.put(fileSpanEntry.getKey(), fileSpan.getGATKChunks()); + } + } + } + + // Now sort and merge the intervals + List sortedMergedLocations = new ArrayList(); + sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, IntervalMergingRule.ALL)); + + // For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing + // the sorted, merged union of the chunks for that file + Map mergedFileSpans = new HashMap(fileChunks.size()); + for ( Map.Entry> fileChunksEntry : fileChunks.entrySet() ) { + List unmergedChunks = fileChunksEntry.getValue(); + mergedFileSpans.put(fileChunksEntry.getKey(), + (new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new 
GATKBAMFileSpan())); + } + + return new FilePointer(mergedFileSpans, sortedMergedLocations); + } + /** * Returns true if any of the file spans in this FilePointer overlap their counterparts in * the other FilePointer. "Overlap" is defined as having an overlapping extent (the region From e740977994595818c973dfc06ed92ab19861cd1a Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 27 Sep 2012 17:59:02 -0400 Subject: [PATCH 35/35] GATK Engine: do not merge FilePointers that span multiple contigs This affects both the non-experimental and experimental engine paths, and so may break tests, but this is a necessary change. --- .../sting/gatk/datasources/reads/IntervalSharder.java | 9 ++++++++- .../sting/gatk/datasources/reads/LocusShardBalancer.java | 6 ++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index f78693c27..cc0a371ea 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -73,8 +73,15 @@ public class IntervalSharder implements Iterator { */ public FilePointer next() { FilePointer current = wrappedIterator.next(); - while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) + + while ( wrappedIterator.hasNext() && + current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && + (current.getContigIndex() == wrappedIterator.peek().getContigIndex() || current.isRegionUnmapped) && + current.minus(wrappedIterator.peek()) == 0 ) { + current = current.combine(parser,wrappedIterator.next()); + } + return current; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java 
b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java index 585b63457..e1bf2d98e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java @@ -42,8 +42,10 @@ public class LocusShardBalancer extends ShardBalancer { public Shard next() { FilePointer current = filePointers.next(); - while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0) - current = current.combine(parser,filePointers.next()); + + // FilePointers have already been combined as necessary at the IntervalSharder level. No + // need to do so again here. + return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans); }