From ec05ecef60c5332c75b4815af1f2dd3a54610642 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 11 Jan 2013 18:05:45 -0500 Subject: [PATCH 01/34] getAdaptorBoundary returns an int, not an Integer, as this was taking 30% of the allocation effort for LIBS --- .../sting/utils/clipping/ReadClipper.java | 4 ++-- .../sting/utils/sam/ReadUtils.java | 16 ++++++++------- .../sting/utils/sam/ReadUtilsUnitTest.java | 20 +++++++++---------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 524c29d64..87526545d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -381,9 +381,9 @@ public class ReadClipper { * @return a new read without adaptor sequence */ private GATKSAMRecord hardClipAdaptorSequence () { - final Integer adaptorBoundary = ReadUtils.getAdaptorBoundary(read); + final int adaptorBoundary = ReadUtils.getAdaptorBoundary(read); - if (adaptorBoundary == null || !ReadUtils.isInsideRead(read, adaptorBoundary)) + if (adaptorBoundary == ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY || !ReadUtils.isInsideRead(read, adaptorBoundary)) return read; return read.getReadNegativeStrandFlag() ? 
hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index b61628d4d..b43b590df 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -169,8 +169,8 @@ public class ReadUtils { * @return whether or not the base is in the adaptor */ public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { - Integer adaptorBoundary = getAdaptorBoundary(read); - if (adaptorBoundary == null || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) + final int adaptorBoundary = getAdaptorBoundary(read); + if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) return false; return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; @@ -199,26 +199,28 @@ public class ReadUtils { * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) * * @param read the read being tested for the adaptor boundary - * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another contig. + * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. + * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig. 
*/ - public static Integer getAdaptorBoundary(final SAMRecord read) { + public static int getAdaptorBoundary(final SAMRecord read) { final int MAXIMUM_ADAPTOR_LENGTH = 8; final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) if (insertSize == 0 || read.getReadUnmappedFlag()) // no adaptors in reads with mates in another chromosome or unmapped pairs - return null; + return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; - Integer adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) + int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) if (read.getReadNegativeStrandFlag()) adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header) else adaptorBoundary = read.getAlignmentStart() + insertSize + 1; // case 2 (see header) if ( (adaptorBoundary < read.getAlignmentStart() - MAXIMUM_ADAPTOR_LENGTH) || (adaptorBoundary > read.getAlignmentEnd() + MAXIMUM_ADAPTOR_LENGTH) ) - adaptorBoundary = null; // we are being conservative by not allowing the adaptor boundary to go beyond what we belive is the maximum size of an adaptor + adaptorBoundary = CANNOT_COMPUTE_ADAPTOR_BOUNDARY; // we are being conservative by not allowing the adaptor boundary to go beyond what we believe is the maximum size of an adaptor return adaptorBoundary; } + public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE; /** * is the read a 454 read?
diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 71c7d1bb0..4194aa6d5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -40,7 +40,7 @@ public class ReadUtilsUnitTest extends BaseTest { final int mateStart = 1000; final int BEFORE = mateStart - 2; final int AFTER = mateStart + 2; - Integer myStart, boundary; + int myStart, boundary; GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); read.setMateAlignmentStart(mateStart); @@ -51,43 +51,43 @@ public class ReadUtilsUnitTest extends BaseTest { read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), myStart + fragmentSize + 1); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); // Test case 2: positive strand, second read myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), myStart + fragmentSize + 1); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); // Test case 3: negative strand, second read myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), mateStart - 1); + Assert.assertEquals(boundary, mateStart - 1); // Test case 4: negative strand, first read myStart = BEFORE; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertEquals(boundary.intValue(), mateStart - 1); + Assert.assertEquals(boundary, mateStart - 1); // Test case 5: mate is mapped to another chromosome (test both strands) 
read.setInferredInsertSize(0); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setInferredInsertSize(10); // Test case 6: read is unmapped read.setReadUnmappedFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setReadUnmappedFlag(false); // Test case 7: reads don't overlap and look like this: @@ -99,7 +99,7 @@ public class ReadUtilsUnitTest extends BaseTest { read.setInferredInsertSize(20); read.setReadNegativeStrandFlag(true); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); // second read: myStart = 1000; @@ -107,6 +107,6 @@ public class ReadUtilsUnitTest extends BaseTest { read.setMateAlignmentStart(980); read.setReadNegativeStrandFlag(false); boundary = ReadUtils.getAdaptorBoundary(read); - Assert.assertNull(boundary); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); } } From 83fcc06e28b5d4d85e84183465f7c118536688f5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 12 Jan 2013 12:41:13 -0500 Subject: [PATCH 02/34] LIBS optimizations and performance tools -- Made LIBSPerformance a full featured CommandLineProgram, and it can be used to assess the LIBS performance by reading a provided BAM -- ReadStateManager now provides a clean interface to iterate in sample order the per-sample read states, allowing us to avoid many map.get calls -- Moved updateReadStates to ReadStateManager -- Removed the unnecessary wrapping of an iterator in ReadStateManager -- readStatesBySample is 
now a LinkedHashMap so that iteration occurs in LIBS sample order, allowing us to avoid many unnecessary calls to map.get iterating over samples. Now those are just map native iterations -- Restructured collectPendingReads for simplicity, removing redundant and consolidating common range checks. The new piece of code is much clearer and avoids several unnecessary function calls --- .../locusiterator/AlignmentStateMachine.java | 10 ++ .../locusiterator/LocusIteratorByState.java | 40 ++--- .../utils/locusiterator/ReadStateManager.java | 144 ++++++++++-------- .../ReadStateManagerUnitTest.java | 2 +- 4 files changed, 99 insertions(+), 97 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java index 32e56866b..50bc9e25b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java @@ -113,6 +113,16 @@ public class AlignmentStateMachine { return read; } + /** + * Get the reference index of the underlying read + * + * @return the reference index of the read + */ + @Ensures("result == getRead().getReferenceIndex()") + public int getReferenceIndex() { + return getRead().getReferenceIndex(); + } + /** * Is this the left edge state? I.e., one that is before or after the current read?
* @return true if this state is an edge state, false otherwise diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java index 01c9e564e..9499bfa35 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java @@ -34,8 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.pileup.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -234,17 +233,16 @@ public class LocusIteratorByState extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - // TODO: How can you determine here whether the current pileup has been downsampled? 
- boolean hasBeenSampled = false; - - for (final String sample : samples) { - final Iterator iterator = readStates.iterator(sample); - final List pile = new ArrayList(readStates.size(sample)); + for (final Map.Entry sampleStatePair : readStates ) { + final String sample = sampleStatePair.getKey(); + final ReadStateManager.PerSampleReadStateManager readState = sampleStatePair.getValue(); + final Iterator iterator = readState.iterator(); + final List pile = new ArrayList(readState.size()); while (iterator.hasNext()) { // state object with the read/offset information final AlignmentStateMachine state = iterator.next(); - final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); + final GATKSAMRecord read = state.getRead(); final CigarOperator op = state.getCigarOperator(); if (op == CigarOperator.N) // N's are never added to any pileup @@ -263,29 +261,9 @@ public class LocusIteratorByState extends LocusIterator { fullPileup.put(sample, new ReadBackedPileupImpl(location, pile)); } - updateReadStates(); // critical - must be called after we get the current state offsets and location + readStates.updateReadStates(); // critical - must be called after we get the current state offsets and location if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done - nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); - } - } - - /** - * Advances all fo the read states by one bp. After this call the read states are reflective - * of the next pileup. - */ - private void updateReadStates() { - for (final String sample : samples) { - Iterator it = readStates.iterator(sample); - while (it.hasNext()) { - AlignmentStateMachine state = it.next(); - CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. 
Keeping the read state that returned null upon stepForwardOnGenome() is safe - * as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. - it.remove(); // we've stepped off the end of the object - } - } + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), false); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 2dcf01d72..0a8d3a108 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -48,11 +49,18 @@ import java.util.*; * Date: 1/5/13 * Time: 2:02 PM */ -class ReadStateManager { +final class ReadStateManager implements Iterable> { private final List samples; private final PeekableIterator iterator; private final SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); + + /** + * A mapping from sample name -> the per sample read state manager that manages + * + * IT IS CRITICAL THAT THIS BE A LINKED HASH MAP, SO THAT THE ITERATION OF THE MAP OCCURS IN THE SAME + * ORDER AS THE ORIGINAL SAMPLES + */ + private final Map readStatesBySample = new LinkedHashMap(); private LinkedList submittedReads; private final boolean keepSubmittedReads; @@ -70,6 +78,7 @@ class ReadStateManager { this.submittedReads = new LinkedList(); for (final String sample :
samples) { + // because this is a linked hash map the order of iteration will be in sample order readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo)); } @@ -77,29 +86,16 @@ } /** - * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented - * for this iterator; if present, total read states will be decremented. + * Returns an iterator over all the sample -> per-sample read state managers with each sample in this read state manager. - * - * @param sample The sample. - * @return Iterator over the reads associated with that sample. + * The order of iteration is the same as the order of the samples provided upon construction to this + * ReadStateManager. + * + * @return Iterator over sample + per sample read state manager pairs for this read state manager. */ - public Iterator iterator(final String sample) { - // TODO -- why is this wrapped? - return new Iterator() { - private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public AlignmentStateMachine next() { - return wrappedIterator.next(); - } - - public void remove() { - wrappedIterator.remove(); - } - }; + @Override + public Iterator> iterator() { - return readStatesBySample.entrySet().iterator(); } public boolean isEmpty() { @@ -126,10 +122,9 @@ } public AlignmentStateMachine getFirst() { - for (final String sample : samples) { - PerSampleReadStateManager reads = readStatesBySample.get(sample); - if (!reads.isEmpty()) - return reads.peek(); + for ( final PerSampleReadStateManager manager : readStatesBySample.values() ) { + if ( !
manager.isEmpty() ) + return manager.peek(); } return null; } @@ -138,51 +133,65 @@ class ReadStateManager { return totalReadStates > 0 || iterator.hasNext(); } - // fast testing of position - /** - * TODO -- this function needs to be optimized - * - * Notes: - * -- the only place where it's called is in a block where we know isEmpty is false - * -- getFirst() is quite expensive, and it seems that we could cache this value in the outer - * block, and then pass this in as an argument - * - * @param read - * @return + * Advances all of the read states by one bp. After this call the read states are reflective + * of the next pileup. */ - private boolean readIsPastCurrentPosition(GATKSAMRecord read) { - if (isEmpty()) - return false; - else { - final AlignmentStateMachine state = getFirst(); - final GATKSAMRecord ourRead = state.getRead(); - return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); + public void updateReadStates() { + for (final PerSampleReadStateManager readStateManager : readStatesBySample.values() ) { + final Iterator it = readStateManager.iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } } } + /** + * Does read start at the same position as described by currentContigIndex and currentAlignmentStart?
+ * + * @param read the read we want to test + * @param currentContigIndex the contig index (from the read's getReferenceIndex) of the reads in this state manager + * @param currentAlignmentStart the alignment start of the left-most position on the + * genome of the reads in this read state manager + * @return true if read has contig index and start equal to the current ones + */ + private boolean readStartsAtCurrentPosition(final GATKSAMRecord read, final int currentContigIndex, final int currentAlignmentStart) { + return read.getAlignmentStart() == currentAlignmentStart && read.getReferenceIndex() == currentContigIndex; + } + + /** + * Pull all of the reads off the iterator that overlap the left-most position among all + * reads in this ReadStateManager + */ public void collectPendingReads() { if (!iterator.hasNext()) return; - // the next record in the stream, peeked as to not remove it from the stream + // determine the left-most boundary that determines which reads to keep in this new pileup + final int firstContigIndex; + final int firstAlignmentStart; if ( isEmpty() ) { - final int firstContigIndex = iterator.peek().getReferenceIndex(); - final int firstAlignmentStart = iterator.peek().getAlignmentStart(); - while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { - submitRead(iterator.next()); - } + // there are no reads here, so our next state is the next read in the stream + firstContigIndex = iterator.peek().getReferenceIndex(); + firstAlignmentStart = iterator.peek().getAlignmentStart(); } else { - // Fast fail in the case that the read is past the current position.
- if (readIsPastCurrentPosition(iterator.peek())) - return; - - while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { - submitRead(iterator.next()); - } + // there's a read in the system, so it's our targeted first read + final AlignmentStateMachine firstState = getFirst(); + firstContigIndex = firstState.getReferenceIndex(); + // note this isn't the alignment start of the read, but rather the alignment start position + firstAlignmentStart = firstState.getGenomePosition(); } - samplePartitioner.doneSubmittingReads(); + while ( iterator.hasNext() && readStartsAtCurrentPosition(iterator.peek(), firstContigIndex, firstAlignmentStart) ) { + submitRead(iterator.next()); + } for (final String sample : samples) { final Collection newReads = samplePartitioner.getReadsForSample(sample); @@ -271,11 +280,11 @@ class ReadStateManager { if (reads.isEmpty()) return; - Collection newReadStates = new LinkedList(); + final LinkedList newReadStates = new LinkedList(); - for (GATKSAMRecord read : reads) { - AlignmentStateMachine state = new AlignmentStateMachine(read); - if ( state.stepForwardOnGenome() != null ) + for (final GATKSAMRecord read : reads) { + final AlignmentStateMachine state = new AlignmentStateMachine(read); + if ( state.stepForwardOnGenome() != null ) // todo -- should be an assertion not a skip // explicitly filter out reads that are all insertions / soft clips newReadStates.add(state); } @@ -283,6 +292,7 @@ class ReadStateManager { readStates.addStatesAtNextAlignmentStart(newReadStates); } + // TODO -- refactor into separate class with pointer to ReadStateManager for updates to the total counts protected class PerSampleReadStateManager implements Iterable { private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; @@ -295,12 +305,16 @@ class ReadStateManager { : null; } - public void addStatesAtNextAlignmentStart(Collection states) { + /** + * Assumes it can just keep the states linked lists 
without making a copy + * @param states + */ + public void addStatesAtNextAlignmentStart(LinkedList states) { if ( states.isEmpty() ) { return; } - readStatesByAlignmentStart.add(new LinkedList(states)); + readStatesByAlignmentStart.add(states); thisSampleReadStates += states.size(); totalReadStates += states.size(); diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java index 1db0605c7..76b324d85 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java @@ -71,7 +71,7 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { makeReads(); for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { - perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + perSampleReadStateManager.addStatesAtNextAlignmentStart(new LinkedList(stackRecordStates)); } // read state manager should have the right number of reads From 19288b007d77c597f75bf9ce639df9ebf6601709 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 12 Jan 2013 13:39:19 -0500 Subject: [PATCH 03/34] LIBS bugfix: kept reads now only (correctly) includes reads that at least passed the reservoir -- Added unit tests to ensure this behavior is correct --- .../utils/locusiterator/ReadStateManager.java | 12 ++- .../LocusIteratorByStateUnitTest.java | 93 +++++++++++++------ 2 files changed, 72 insertions(+), 33 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 0a8d3a108..955dbcef7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -195,7 +195,15 @@ final class ReadStateManager implements Iterable newReads = samplePartitioner.getReadsForSample(sample); - PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + +// // if we're keeping reads, take the (potentially downsampled) list of new reads for this sample +// // and add to the list of reads. Note this may reorder the list of reads someone (it groups them +// // by sample, but it cannot change their absolute position on the genome as they all must +// // start at the current location + if ( keepSubmittedReads ) + submittedReads.addAll(newReads); + + final PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); addReadsToSample(statesBySample, newReads); } @@ -208,8 +216,6 @@ final class ReadStateManager implements Iterable tests = new LinkedList(); - for ( final boolean doSampling : Arrays.asList(true, false) ) { - for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) { + for ( final int downsampleTo : Arrays.asList(-1, 1, 2, 5, 10, 30)) { + for ( final int nReadsPerLocus : Arrays.asList(1, 10, 60) ) { for ( final int nLoci : Arrays.asList(1, 10, 25) ) { for ( final int nSamples : Arrays.asList(1, 2, 10) ) { for ( final boolean keepReads : Arrays.asList(true, false) ) { for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true, false) ) { -// for ( final int nReadsPerLocus : Arrays.asList(1) ) { -// for ( final int nLoci : Arrays.asList(1) ) { -// for ( final int nSamples : Arrays.asList(1) ) { -// for ( final boolean keepReads : Arrays.asList(true) ) { -// for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { - tests.add(new Object[]{nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, doSampling}); +// for ( final int downsampleTo : Arrays.asList(1)) { +// for ( final int nReadsPerLocus : Arrays.asList(10) ) { +// for ( final int nLoci : Arrays.asList(25) ) { +// for ( 
final int nSamples : Arrays.asList(1) ) { +// for ( final boolean keepReads : Arrays.asList(true) ) { +// for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { + tests.add(new Object[]{nReadsPerLocus, nLoci, nSamples, + keepReads, grabReadsAfterEachCycle, + downsampleTo}); } } } @@ -432,14 +436,15 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { return tests.toArray(new Object[][]{}); } - @Test(enabled = true && ! DEBUG, dataProvider = "LIBSKeepSubmittedReads") - public void testLIBSKeepSubmittedReads(final int nReadsPerLocus, - final int nLoci, - final int nSamples, - final boolean keepReads, - final boolean grabReadsAfterEachCycle, - final boolean downsample) { - logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); + //@Test(enabled = true && ! DEBUG, dataProvider = "LIBS_ComplexPileupTests") + @Test(enabled = true && ! DEBUG, dataProvider = "LIBS_ComplexPileupTests") + public void testLIBS_ComplexPileupTests(final int nReadsPerLocus, + final int nLoci, + final int nSamples, + final boolean keepReads, + final boolean grabReadsAfterEachCycle, + final int downsampleTo) { + //logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); final int readLength = 10; final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); @@ -453,10 +458,9 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { header.addReadGroup(rg); } - final int maxCoveragePerSampleAtLocus = nReadsPerLocus * readLength / 2; - final int maxDownsampledCoverage = Math.max(maxCoveragePerSampleAtLocus / 2, 1); + final boolean downsample = downsampleTo != -1; final DownsamplingMethod downsampler = downsample - ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, maxDownsampledCoverage, null, false) + ? 
new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) : new DownsamplingMethod(DownsampleType.NONE, null, null, false); final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), @@ -472,6 +476,8 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { final AlignmentContext alignmentContext = li.next(); final ReadBackedPileup p = alignmentContext.getBasePileup(); + AssertWellOrderedPileup(p); + if ( downsample ) { // just not a safe test //Assert.assertTrue(p.getNumberOfElements() <= maxDownsampledCoverage * nSamples, "Too many reads at locus after downsampling"); @@ -480,22 +486,29 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { Assert.assertTrue(p.getNumberOfElements() >= minPileupSize); } + // the number of reads starting here + int nReadsStartingHere = 0; + for ( final GATKSAMRecord read : p.getReads() ) + if ( read.getAlignmentStart() == alignmentContext.getPosition() ) + nReadsStartingHere++; + + // we can have no more than maxDownsampledCoverage per sample + final int maxCoveragePerLocus = downsample ? downsampleTo : nReadsPerLocus; + Assert.assertTrue(nReadsStartingHere <= maxCoveragePerLocus * nSamples); + seenSoFar.addAll(p.getReads()); if ( keepReads && grabReadsAfterEachCycle ) { final List locusReads = li.transferReadsFromAllPreviousPileups(); - // the number of reads starting here - int nReadsStartingHere = 0; - for ( final GATKSAMRecord read : p.getReads() ) - if ( read.getAlignmentStart() == alignmentContext.getPosition() ) - nReadsStartingHere++; - if ( downsample ) + if ( downsample ) { // with downsampling we might have some reads here that were downsampled away - // in the pileup + // in the pileup. 
We want to ensure that no more than the max coverage per sample is added Assert.assertTrue(locusReads.size() >= nReadsStartingHere); - else + Assert.assertTrue(locusReads.size() <= maxCoveragePerLocus * nSamples); + } else { Assert.assertEquals(locusReads.size(), nReadsStartingHere); + } keptReads.addAll(locusReads); // check that all reads we've seen so far are in our keptReads @@ -543,6 +556,26 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { for ( final GATKSAMRecord read : seenSoFar ) { Assert.assertTrue(keptReads.contains(read), "A read that appeared in a pileup wasn't found in the kept reads: " + read); } + + if ( ! downsample ) { + // check that every read in the list of keep reads occurred at least once in one of the pileups + for ( final GATKSAMRecord keptRead : keptReads ) { + Assert.assertTrue(seenSoFar.contains(keptRead), "There's a read " + keptRead + " in our keptReads list that never appeared in any pileup"); + } + } + } + } + + private void AssertWellOrderedPileup(final ReadBackedPileup pileup) { + if ( ! 
pileup.isEmpty() ) { + int leftMostPos = -1; + + for ( final PileupElement pe : pileup ) { + Assert.assertTrue(pileup.getLocation().getContig().equals(pe.getRead().getReferenceName()), "ReadBackedPileup contains an element " + pe + " that's on a different contig than the pileup itself"); + Assert.assertTrue(pe.getRead().getAlignmentStart() >= leftMostPos, + "ReadBackedPileup contains an element " + pe + " whose read's alignment start " + pe.getRead().getAlignmentStart() + + " occurs before the leftmost position we've seen previously " + leftMostPos); + } } } } From a4334a67e088d9cd221dadc011edd1478dc7b28f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 12 Jan 2013 19:22:36 -0500 Subject: [PATCH 04/34] SamplePartitioner optimizations and bugfixes -- Use a linked hash map instead of a hash map since we want to iterate through the map fairly often -- Ensure that we call doneSubmittingReads before getting reads for samples. This function call fell out before and since it wasn't enforced I only noticed the problem while writing comments -- Don't make unnecessary calls to contains for map. 
Just use get() and check that the result is null -- Use a LinkedList in PassThroughDownsampler, since this is faster for add() than the existing ArrayList, and we weren't using random access to the resulting list --- .../downsampling/PassThroughDownsampler.java | 14 +- .../utils/locusiterator/ReadStateManager.java | 10 +- .../locusiterator/SamplePartitioner.java | 124 +++++++++++++++--- 3 files changed, 122 insertions(+), 26 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java index 600834012..b06d5f5b4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -27,8 +27,8 @@ package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMRecord; -import java.util.ArrayList; import java.util.Collection; +import java.util.LinkedList; import java.util.List; /** @@ -41,7 +41,7 @@ import java.util.List; */ public class PassThroughDownsampler implements ReadsDownsampler { - private ArrayList selectedReads; + private LinkedList selectedReads; public PassThroughDownsampler() { clear(); @@ -59,9 +59,13 @@ public class PassThroughDownsampler implements ReadsDownsam } public boolean hasFinalizedItems() { - return selectedReads.size() > 0; + return ! selectedReads.isEmpty(); } + /** + * Note that this list is a linked list and so doesn't support fast random access + * @return + */ public List consumeFinalizedItems() { // pass by reference rather than make a copy, for speed List downsampledItems = selectedReads; @@ -74,7 +78,7 @@ public class PassThroughDownsampler implements ReadsDownsam } public T peekFinalized() { - return selectedReads.isEmpty() ? 
null : selectedReads.getFirst(); } public T peekPending() { @@ -90,7 +94,7 @@ public class PassThroughDownsampler implements ReadsDownsam } public void clear() { - selectedReads = new ArrayList(); + selectedReads = new LinkedList(); } public void reset() { diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 955dbcef7..b5dbe2ddb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -193,13 +193,15 @@ final class ReadStateManager implements Iterable newReads = samplePartitioner.getReadsForSample(sample); -// // if we're keeping reads, take the (potentially downsampled) list of new reads for this sample -// // and add to the list of reads. Note this may reorder the list of reads someone (it groups them -// // by sample, but it cannot change their absolute position on the genome as they all must -// // start at the current location + // if we're keeping reads, take the (potentially downsampled) list of new reads for this sample + // and add to the list of reads. 
Note this may reorder the list of reads somewhat (it groups them + // by sample, but it cannot change their absolute position on the genome as they all must + // start at the current location if ( keepSubmittedReads ) submittedReads.addAll(newReads); diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java index 1653c6a92..7dada292b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.locusiterator; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.PassThroughDownsampler; @@ -33,49 +35,137 @@ import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; import java.util.*; /** - * Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler. + * Divides reads by sample and (if requested) does a preliminary downsampling pass + * with a ReservoirDownsampler. * * Note: stores reads by sample ID string, not by sample object */ class SamplePartitioner { - private Map> readsBySample; + /** + * Map from sample name (as a string) to a downsampler of reads for that sample + */ + final private Map> readsBySample; + + /** + * Are we in a state where we're done submitting reads and have semi-finalized the + * underlying per sample downsampler? 
+ */ + boolean doneSubmittingReads = false; + + /** + * Create a new SamplePartitioner capable of splitting reads up into buckets of reads for + * each sample in samples, and perform a preliminary downsampling of these reads + * (separately for each sample) if downsampling is requested in LIBSDownsamplingInfo + * + * Note that samples must be comprehensive, in that all reads every submitted to this + * partitioner must come from one of the samples provided here. If not, submitRead + * will throw an exception. Duplicates in the list of samples will be ignored + * + * @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage? + * @param samples the complete list of samples we're going to partition reads into + */ + @Ensures({ + "readsBySample != null", + "! readsBySample.isEmpty()", + "readsBySample.size() == new HashSet(samples).size()" + }) public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List samples) { - readsBySample = new HashMap>(samples.size()); - for ( String sample : samples ) { + if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null"); + if ( samples == null || samples.isEmpty() ) throw new IllegalArgumentException("samples must be a non-null, non-empty list but got " + samples); + + readsBySample = new LinkedHashMap>(samples.size()); + for ( final String sample : samples ) { readsBySample.put(sample, createDownsampler(LIBSDownsamplingInfo)); } } + /** + * Create a new, ready to use downsampler based on the parameters in LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the parameters to use in creating the downsampler + * @return a downsampler appropriate for LIBSDownsamplingInfo. If no downsampling is requested, + * uses the PassThroughDownsampler, which does nothing at all. 
+ */ + @Requires("LIBSDownsamplingInfo != null") + @Ensures("result != null") private Downsampler createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { return LIBSDownsamplingInfo.isPerformDownsampling() ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage()) : new PassThroughDownsampler(); } - public void submitRead(T read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).submit(read); + /** + * Offer this read to the partitioner, putting it into the bucket of reads for the sample + * of read (obtained via the read's read group). + * + * If the read group is missing, uses the special "null" read group + * + * @throws IllegalStateException if the sample of read wasn't present in the original + * set of samples provided to this SamplePartitioner at construction + * + * @param read the read to add to the sample's list of reads + */ + @Requires("read != null") + @Ensures("doneSubmittingReads == false") + public void submitRead(final T read) { + final String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) + throw new IllegalStateException("Offered read with sample name " + sampleName + " to SamplePartitioner " + + "but this sample wasn't provided as one of possible samples at construction"); + + downsampler.submit(read); + doneSubmittingReads = false; } + /** + * Tell this partitioner that all reads in this cycle have been submitted, so that we + * can finalize whatever downsampling is required by each sample. + * + * Note that we *must* call this function before getReadsForSample, or else that + * function will exception out. 
+ */ + @Ensures("doneSubmittingReads == true") public void doneSubmittingReads() { - for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { - perSampleReads.getValue().signalEndOfInput(); + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.signalEndOfInput(); } + doneSubmittingReads = true; } - public Collection getReadsForSample(String sampleName) { - if ( ! readsBySample.containsKey(sampleName) ) - throw new NoSuchElementException("Sample name not found"); + /** + * Get the final collection of reads for this sample for this cycle + * + * The cycle is defined as all of the reads that occur between + * the first call to submitRead until doneSubmittingReads is called. At that + * point additional downsampling may occur (depending on construction arguments) + * and that set of reads is returned here. + * + * Note that this function can only be called once per cycle, as underlying + * collection of reads is cleared. + * + * @param sampleName the sample we want reads for, must be present in the original samples + * @return a non-null collection of reads for sample in this cycle + */ + @Ensures("result != null") + public Collection getReadsForSample(final String sampleName) { + if ( ! doneSubmittingReads ) throw new IllegalStateException("getReadsForSample called before doneSubmittingReads was called"); - return readsBySample.get(sampleName).consumeFinalizedItems(); + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) throw new NoSuchElementException("Sample name not found"); + + return downsampler.consumeFinalizedItems(); } + /** + * Resets this SamplePartitioner, indicating that we're starting a new + * cycle of adding reads to each underlying downsampler. 
+ */ + @Ensures("doneSubmittingReads == false") public void reset() { - for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { - perSampleReads.getValue().clear(); - perSampleReads.getValue().reset(); + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.clear(); + downsampler.reset(); } + doneSubmittingReads = false; } } From 5c2799554aca87f3a5a0d95c609baff574f5e261 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 12:23:51 -0500 Subject: [PATCH 05/34] Refactor updateReadStates into PerSampleReadStateManager, add tracking of downsampling rate --- .../utils/locusiterator/LIBSPerformance.java | 4 +- .../utils/locusiterator/ReadStateManager.java | 70 ++++++++++++++----- 2 files changed, 55 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java index 0985ed196..2d074f420 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java @@ -63,6 +63,8 @@ public class LIBSPerformance extends CommandLineProgram { @Argument(fullName = "L", shortName = "L", doc = "Query location", required = false) public String location = null; + @Argument(fullName = "dt", shortName = "dt", doc = "Enable downsampling", required = false) + public boolean downsample = false; @Override public int execute() throws IOException { @@ -86,7 +88,7 @@ public class LIBSPerformance extends CommandLineProgram { for ( final SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups() ) samples.add(rg.getSample()); - final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(false, -1); + final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(downsample, 250); final LocusIteratorByState libs = new LocusIteratorByState( diff --git 
a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index b5dbe2ddb..3276291ef 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -29,6 +29,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.CigarOperator; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.downsampling.Downsampler; import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -50,6 +51,8 @@ import java.util.*; * Time: 2:02 PM */ final class ReadStateManager implements Iterable> { + private final static Logger logger = Logger.getLogger(ReadStateManager.class); + private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; private final List samples; private final PeekableIterator iterator; private final SamplePartitioner samplePartitioner; @@ -138,18 +141,8 @@ final class ReadStateManager implements Iterable it = readStateManager.iterator(); - while (it.hasNext()) { - final AlignmentStateMachine state = it.next(); - final CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. 
- it.remove(); // we've stepped off the end of the object - } - } + for (final PerSampleReadStateManager perSampleReadStateManager : readStatesBySample.values() ) { + perSampleReadStateManager.updateReadStates(); } } @@ -301,13 +294,17 @@ final class ReadStateManager implements Iterable { + protected final class PerSampleReadStateManager implements Iterable { private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; - private int thisSampleReadStates = 0; + private final int downsamplingTarget; + private int nSitesNeedingDownsampling = 0; + private int nSites = 0; + public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { + this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1; this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() ? new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) : null; @@ -326,7 +323,8 @@ final class ReadStateManager implements Iterable downsamplingTarget; + if ( downsampling ) { + nSitesNeedingDownsampling++; + message = "Downsampling"; + } + + if ( downsampling || nSites % 10000 == 0 ) + logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", + message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + } + } + public boolean isEmpty() { return readStatesByAlignmentStart.isEmpty(); } @@ -351,11 +371,25 @@ final class ReadStateManager implements Iterable it = iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. 
Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } + } + public Iterator iterator() { return new Iterator() { - private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates = null; - private Iterator currentPositionReadStatesIterator = null; + private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates; + private Iterator currentPositionReadStatesIterator; public boolean hasNext() { return alignmentStartIterator.hasNext() || From 5a5422e4f8220ecde133490eeef6b58fa3084397 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 13:02:17 -0500 Subject: [PATCH 06/34] Refactor PerSampleReadStates into a separate class -- No longer update the total counts in each per-sample state manager, but instead return delta counts that are updated by the overall ReadStateManager -- One step on the way to improving the underlying representation of the data in PerSampleReadStateManager -- Make LocusIteratorByState final --- .../locusiterator/LocusIteratorByState.java | 6 +- .../PerSampleReadStateManager.java | 203 ++++++++++++++++++ .../utils/locusiterator/ReadStateManager.java | 138 +----------- .../LocusIteratorByStateUnitTest.java | 5 +- ...=> PerSampleReadStateManagerUnitTest.java} | 11 +- 5 files changed, 214 insertions(+), 149 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java rename public/java/test/org/broadinstitute/sting/utils/locusiterator/{ReadStateManagerUnitTest.java => PerSampleReadStateManagerUnitTest.java} (92%) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java 
b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java index 9499bfa35..e7b75f1f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java @@ -65,7 +65,7 @@ import java.util.*; * occurs, if requested. This allows users of LIBS to see both a ReadBackedPileup view of the data as well as * a stream of unique, sorted reads */ -public class LocusIteratorByState extends LocusIterator { +public final class LocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ @@ -233,9 +233,9 @@ public class LocusIteratorByState extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - for (final Map.Entry sampleStatePair : readStates ) { + for (final Map.Entry sampleStatePair : readStates ) { final String sample = sampleStatePair.getKey(); - final ReadStateManager.PerSampleReadStateManager readState = sampleStatePair.getValue(); + final PerSampleReadStateManager readState = sampleStatePair.getValue(); final Iterator iterator = readState.iterator(); final List pile = new ArrayList(readState.size()); diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java new file mode 100644 index 000000000..c2a47bbdb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, 
sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.locusiterator; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.CigarOperator; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.downsampling.Downsampler; +import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * ReadStateManager for a single sample + * + * User: depristo + * Date: 1/13/13 + * Time: 12:28 PM + */ +final class PerSampleReadStateManager implements Iterable { + private final static Logger logger = Logger.getLogger(ReadStateManager.class); + private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; + + private List> readStatesByAlignmentStart = new LinkedList>(); + private final Downsampler> levelingDownsampler; + private int thisSampleReadStates = 0; + + private final int downsamplingTarget; + private int nSitesNeedingDownsampling = 0; + private int nSites = 0; + + public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { + this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? 
LIBSDownsamplingInfo.getToCoverage() : -1; + this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() + ? new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) + : null; + } + + /** + * Assumes it can just keep the states linked lists without making a copy + * @param states the new states to add to this manager + * @return The change in the number of states, after including states and potentially downsampling + */ + @Requires("states != null") + @Ensures("result >= 0") + public int addStatesAtNextAlignmentStart(LinkedList states) { + if ( states.isEmpty() ) { + return 0; + } + + readStatesByAlignmentStart.add(states); + int nStatesAdded = states.size(); + + if ( isDownsampling() ) { + captureDownsamplingStats(); + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } + + thisSampleReadStates += nStatesAdded; + return nStatesAdded; + } + + private boolean isDownsampling() { + return levelingDownsampler != null; + } + + private AlignmentStateMachine getFirst() { + if (readStatesByAlignmentStart.isEmpty()) + return null; + else + return readStatesByAlignmentStart.get(0).getFirst(); + } + + @Requires("isDownsampling()") + private void captureDownsamplingStats() { + if ( CAPTURE_DOWNSAMPLING_STATS ) { + nSites++; + final int loc = getFirst().getGenomePosition(); + String message = "Pass through"; + final boolean downsampling = thisSampleReadStates > downsamplingTarget; + if ( downsampling ) { + nSitesNeedingDownsampling++; + message = "Downsampling"; + } + + if ( downsampling || nSites % 10000 == 0 ) + logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", + message, loc, 
thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + } + } + + /** + * Is there at least one alignment for this sample in this manager? + * @return true if there's at least one alignment, false otherwise + */ + public boolean isEmpty() { + return readStatesByAlignmentStart.isEmpty(); + } + + public AlignmentStateMachine peek() { + return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + } + + /** + * Get the number of read states currently in this manager + * @return the number of read states + */ + @Ensures("result >= 0") + public int size() { + return thisSampleReadStates; + } + + /** + * Advances all read states forward by one element, removing states that are + * no longer aligned to the current position. + * @return the number of states we've removed after advancing + */ + public int updateReadStates() { + int nRemoved = 0; + final Iterator it = iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. 
+ it.remove(); // we've stepped off the end of the object + nRemoved++; + } + } + + return nRemoved; + } + + // todo -- reimplement + public Iterator iterator() { + return new Iterator() { + private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates; + private Iterator currentPositionReadStatesIterator; + + @Override + public boolean hasNext() { + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + } + + @Override + public AlignmentStateMachine next() { + if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); + } + + @Override + public void remove() { + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } + } + }; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 3276291ef..4011875a6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -28,10 +28,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.downsampling.Downsampler; -import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import 
java.util.*; @@ -50,9 +47,7 @@ import java.util.*; * Date: 1/5/13 * Time: 2:02 PM */ -final class ReadStateManager implements Iterable> { - private final static Logger logger = Logger.getLogger(ReadStateManager.class); - private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; +final class ReadStateManager implements Iterable> { private final List samples; private final PeekableIterator iterator; private final SamplePartitioner samplePartitioner; @@ -97,7 +92,7 @@ final class ReadStateManager implements Iterable> iterator() { + public Iterator> iterator() { return readStatesBySample.entrySet().iterator(); } @@ -142,7 +137,7 @@ final class ReadStateManager implements Iterable { - private List> readStatesByAlignmentStart = new LinkedList>(); - private final Downsampler> levelingDownsampler; - private int thisSampleReadStates = 0; - - private final int downsamplingTarget; - private int nSitesNeedingDownsampling = 0; - private int nSites = 0; - - public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { - this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1; - this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() - ? 
new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) - : null; - } - - /** - * Assumes it can just keep the states linked lists without making a copy - * @param states - */ - public void addStatesAtNextAlignmentStart(LinkedList states) { - if ( states.isEmpty() ) { - return; - } - - readStatesByAlignmentStart.add(states); - thisSampleReadStates += states.size(); - totalReadStates += states.size(); - - if ( isDownsampling() ) { - captureDownsamplingStats(); - levelingDownsampler.submit(readStatesByAlignmentStart); - levelingDownsampler.signalEndOfInput(); - - thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - - // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); - levelingDownsampler.reset(); - } - } - - private boolean isDownsampling() { - return levelingDownsampler != null; - } - - @Requires("isDownsampling()") - private void captureDownsamplingStats() { - if ( CAPTURE_DOWNSAMPLING_STATS ) { - nSites++; - final int loc = getFirst().getGenomePosition(); - String message = "Pass through"; - final boolean downsampling = thisSampleReadStates > downsamplingTarget; - if ( downsampling ) { - nSitesNeedingDownsampling++; - message = "Downsampling"; - } - - if ( downsampling || nSites % 10000 == 0 ) - logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", - message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); - } - } - - public boolean isEmpty() { - return readStatesByAlignmentStart.isEmpty(); - } - - public AlignmentStateMachine peek() { - return isEmpty() ? 
null : readStatesByAlignmentStart.get(0).peek(); - } - - public int size() { - return thisSampleReadStates; - } - - public void updateReadStates() { - final Iterator it = iterator(); - while (it.hasNext()) { - final AlignmentStateMachine state = it.next(); - final CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. - it.remove(); // we've stepped off the end of the object - } - } - } - - public Iterator iterator() { - return new Iterator() { - private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates; - private Iterator currentPositionReadStatesIterator; - - public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); - } - - public AlignmentStateMachine next() { - if ( currentPositionReadStatesIterator == null || ! 
currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); - } - - public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - totalReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } - } - }; - } + totalReadStates += readStates.addStatesAtNextAlignmentStart(newReadStates); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 727023b83..7ae2d97a1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -418,8 +418,8 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { for ( final boolean keepReads : Arrays.asList(true, false) ) { for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true, false) ) { // for ( final int downsampleTo : Arrays.asList(1)) { -// for ( final int nReadsPerLocus : Arrays.asList(10) ) { -// for ( final int nLoci : Arrays.asList(25) ) { +// for ( final int nReadsPerLocus : Arrays.asList(1) ) { +// for ( final int nLoci : Arrays.asList(1) ) { // for ( final int nSamples : Arrays.asList(1) ) { // for ( final boolean keepReads : Arrays.asList(true) ) { // for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { @@ -436,7 +436,6 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { return tests.toArray(new Object[][]{}); } - //@Test(enabled = true && ! DEBUG, dataProvider = "LIBS_ComplexPileupTests") @Test(enabled = true && ! 
DEBUG, dataProvider = "LIBS_ComplexPileupTests") public void testLIBS_ComplexPileupTests(final int nReadsPerLocus, final int nLoci, diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java similarity index 92% rename from public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java index 76b324d85..b9f2fb29a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/ReadStateManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java @@ -38,11 +38,7 @@ import java.util.*; /** * testing of the new (non-legacy) version of LocusIteratorByState */ -public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { - /////////////////////////////////////// - // Read State Manager Tests // - /////////////////////////////////////// - +public class PerSampleReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { private class PerSampleReadStateManagerTest extends TestDataProvider { private List readCountsPerAlignmentStart; private List reads; @@ -63,10 +59,7 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest { } public void run() { - final List samples = LocusIteratorByState.sampleListForSAMWithoutReadGroups(); - final Iterator iterator = new LinkedList().iterator(); - ReadStateManager readStateManager = new ReadStateManager(iterator, samples, LIBSDownsamplingInfo.NO_DOWNSAMPLING, false); - ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = readStateManager.new PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING); + PerSampleReadStateManager perSampleReadStateManager = new 
PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING); makeReads(); From c7f0ca8ac53e320d2762da917158be51a9b2d8ae Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 14:36:25 -0500 Subject: [PATCH 07/34] Optimization for LIBS: PerSampleReadStateManager now uses a simple LinkedList of AlignmentStateMachine -- Instead of storing a list of list of alignment starts, which is expensive to manipulate, we instead store a linear list of alignment starts. Not grouped as previously. This enables us to simplify iteration and update operations, making them much faster -- Critically, the downsampler still requires this list of list. We convert back and forth between these two representations as required, which is very rarely for normal data sets (WGS NA12878 on chr20 is 0.2%, 4x WGS is even less). --- .../PerSampleReadStateManager.java | 170 ++++++++++++------ .../utils/locusiterator/ReadStateManager.java | 2 +- 2 files changed, 115 insertions(+), 57 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java index c2a47bbdb..3f3bc706f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManager.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; @@ -43,18 +44,42 @@ import java.util.List; * Date: 1/13/13 * Time: 12:28 PM */ +@Invariant({ + "readStartsAreWellOrdered()", + "! 
isDownsampling() || downsamplingTarget > 0", + "nSites >= 0", + "nSitesNeedingDownsampling >= 0", + "nSitesNeedingDownsampling <= nSites" +}) final class PerSampleReadStateManager implements Iterable { private final static Logger logger = Logger.getLogger(ReadStateManager.class); - private final static boolean CAPTURE_DOWNSAMPLING_STATS = true; + private final static boolean CAPTURE_DOWNSAMPLING_STATS = false; + + /** + * A list (potentially empty) of alignment state machines. + * + * The state machines must be ordered by the alignment start of their underlying reads, with the + * lowest alignment starts on the left, and the largest on the right + */ + private LinkedList readStatesByAlignmentStart = new LinkedList(); - private List> readStatesByAlignmentStart = new LinkedList>(); private final Downsampler> levelingDownsampler; - private int thisSampleReadStates = 0; - private final int downsamplingTarget; + + /** + * The number of sites where downsampling has been invoked + */ private int nSitesNeedingDownsampling = 0; + + /** + * The number of sites we've visited + */ private int nSites = 0; + /** + * Create a new PerSampleReadStateManager with downsampling parameters as requested by LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the downsampling params we want to use + */ public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? LIBSDownsamplingInfo.getToCoverage() : -1; this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() @@ -62,55 +87,118 @@ final class PerSampleReadStateManager implements Iterable : null; } + /** + * Group the underlying readStatesByAlignmentStart into a list of list of alignment state machines, + * where each list contains machines with a unique genome site. The outer list is ordered + * by alignment start. 
+ * + * For example, if the flat list has alignment starts [10, 10, 11, 12, 12, 13] then + * the resulting grouping will be [[10, 10], [11], [12, 12], [13]]. + * + * @return a non-null list of lists + */ + @Ensures("result != null") + private List> groupByAlignmentStart() { + final LinkedList> grouped = new LinkedList>(); + + AlignmentStateMachine last = null; + for ( final AlignmentStateMachine stateMachine : readStatesByAlignmentStart ) { + if ( last == null || stateMachine.getGenomeOffset() != last.getGenomeOffset() ) { + // we've advanced to a place where the state machine has a different state, + // so start a new list + grouped.add(new LinkedList()); + last = stateMachine; + } + grouped.getLast().add(stateMachine); + } + + return grouped; + } + + /** + * Flattens the grouped list of list of alignment state machines into a single list in order + * @return a non-null list contains the state machines + */ + @Ensures("result != null") + private LinkedList flattenByAlignmentStart(final List> grouped) { + final LinkedList flat = new LinkedList(); + for ( final List l : grouped ) + flat.addAll(l); + return flat; + } + + /** + * Test that the reads are ordered by their alignment starts + * @return true if well ordered, false otherwise + */ + private boolean readStartsAreWellOrdered() { + int lastStart = -1; + for ( final AlignmentStateMachine machine : readStatesByAlignmentStart ) { + if ( lastStart > machine.getRead().getAlignmentStart() ) + return false; + lastStart = machine.getRead().getAlignmentStart(); + } + return true; + } + /** * Assumes it can just keep the states linked lists without making a copy * @param states the new states to add to this manager - * @return The change in the number of states, after including states and potentially downsampling + * @return The change in the number of states, after including states and potentially downsampling. 
Note + * that this return result might be negative, if downsampling is enabled, as we might drop + * more sites than have been added by the downsampler */ @Requires("states != null") - @Ensures("result >= 0") - public int addStatesAtNextAlignmentStart(LinkedList states) { + public int addStatesAtNextAlignmentStart(final LinkedList states) { if ( states.isEmpty() ) { return 0; } - readStatesByAlignmentStart.add(states); + readStatesByAlignmentStart.addAll(states); int nStatesAdded = states.size(); - if ( isDownsampling() ) { + if ( isDownsampling() && readStatesByAlignmentStart.size() > downsamplingTarget ) { + // only go into the downsampling branch if we are downsampling and the coverage > the target captureDownsamplingStats(); - levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.submit(groupByAlignmentStart()); levelingDownsampler.signalEndOfInput(); nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems(); // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems()); levelingDownsampler.reset(); } - thisSampleReadStates += nStatesAdded; return nStatesAdded; } + /** + * Is downsampling enabled for this manager? + * @return true if we are downsampling, false otherwise + */ private boolean isDownsampling() { return levelingDownsampler != null; } - private AlignmentStateMachine getFirst() { - if (readStatesByAlignmentStart.isEmpty()) - return null; - else - return readStatesByAlignmentStart.get(0).getFirst(); + /** + * Get the leftmost alignment state machine, or null if the read states is empty + * @return a potentially null AlignmentStateMachine + */ + public AlignmentStateMachine getFirst() { + return isEmpty() ? 
null : readStatesByAlignmentStart.getFirst(); } + /** + * Capture some statistics about the behavior of the downsampling, but only if CAPTURE_DOWNSAMPLING_STATS is true + */ @Requires("isDownsampling()") private void captureDownsamplingStats() { if ( CAPTURE_DOWNSAMPLING_STATS ) { nSites++; final int loc = getFirst().getGenomePosition(); String message = "Pass through"; - final boolean downsampling = thisSampleReadStates > downsamplingTarget; + final boolean downsampling = size() > downsamplingTarget; if ( downsampling ) { nSitesNeedingDownsampling++; message = "Downsampling"; @@ -118,7 +206,7 @@ final class PerSampleReadStateManager implements Iterable if ( downsampling || nSites % 10000 == 0 ) logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", - message, loc, thisSampleReadStates, downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + message, loc, size(), downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); } } @@ -130,17 +218,13 @@ final class PerSampleReadStateManager implements Iterable return readStatesByAlignmentStart.isEmpty(); } - public AlignmentStateMachine peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); - } - /** * Get the number of read states currently in this manager * @return the number of read states */ @Ensures("result >= 0") public int size() { - return thisSampleReadStates; + return readStatesByAlignmentStart.size(); } /** @@ -166,38 +250,12 @@ final class PerSampleReadStateManager implements Iterable return nRemoved; } - // todo -- reimplement + /** + * Iterate over the AlignmentStateMachine in this manager in alignment start order. 
+ * @return a valid iterator + */ + @Ensures("result != null") public Iterator iterator() { - return new Iterator() { - private final Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates; - private Iterator currentPositionReadStatesIterator; - - @Override - public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); - } - - @Override - public AlignmentStateMachine next() { - if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); - } - - @Override - public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } - } - }; + return readStatesByAlignmentStart.iterator(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 4011875a6..09ec3b264 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -122,7 +122,7 @@ final class ReadStateManager implements Iterable Date: Sun, 13 Jan 2013 20:43:10 -0500 Subject: [PATCH 08/34] ReservoirDownsampler optimizations -- Add an option to not allocate always ArrayLists of targetSampleSize, but rather the previous size + MARGIN. 
This helps for LIBS as most of the time we don't need nearly so much space as we allow -- consumeFinalizedItems returns an empty list if the reservior is empty, which it often true for our BAM files with low coverage -- Allow empty sample lists for SamplePartitioner as these are used by the RefTraversals and other non-read based traversals Make the reservoir downsampler use a linked list, rather than a fixed sized array list, in the expectFewOverflows case --- .../downsampling/ReservoirDownsampler.java | 76 +++++++++++++++---- .../locusiterator/SamplePartitioner.java | 9 ++- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java index 0d7a0dd14..4331fd723 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -29,9 +29,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.util.*; /** * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with @@ -42,10 +40,25 @@ import java.util.List; * @author David Roazen */ public class ReservoirDownsampler implements ReadsDownsampler { + private final int targetSampleSize; - private ArrayList reservoir; + /** + * if true, this downsampler will be optimized for the case + * where most of the time we won't fill up anything like the + * targetSampleSize elements. If this is false, we will allocate + * internal buffers to targetSampleSize initially, which minimizes + * the cost of allocation if we often use targetSampleSize or more + * elements. 
+ */ + private final boolean expectFewOverflows; - private int targetSampleSize; + /** + * At times this can be a linked list or an array list, depending on how we're accessing the + * data and whether or not we're expecting few overflows + */ + private List reservoir; + + private boolean isLinkedList; private int totalReadsSeen; @@ -56,17 +69,35 @@ public class ReservoirDownsampler implements ReadsDownsampl * * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained * after downsampling will be min(totalReads, targetSampleSize) + * @param expectFewOverflows if true, this downsampler will be optimized for the case + * where most of the time we won't fill up anything like the + * targetSampleSize elements. If this is false, we will allocate + * internal buffers to targetSampleSize initially, which minimizes + * the cost of allocation if we often use targetSampleSize or more + * elements. */ - public ReservoirDownsampler ( int targetSampleSize ) { + public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows) { if ( targetSampleSize <= 0 ) { throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); } this.targetSampleSize = targetSampleSize; + this.expectFewOverflows = expectFewOverflows; clear(); reset(); } + /** + * Construct a ReservoirDownsampler + * + * @param targetSampleSize Size of the reservoir used by this downsampler. 
Number of items retained + * after downsampling will be min(totalReads, targetSampleSize) + */ + public ReservoirDownsampler ( int targetSampleSize ) { + this(targetSampleSize, false); + } + + public void submit ( T newRead ) { totalReadsSeen++; @@ -74,7 +105,12 @@ public class ReservoirDownsampler implements ReadsDownsampl reservoir.add(newRead); } else { - int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); + if ( isLinkedList ) { + reservoir = new ArrayList(reservoir); + isLinkedList = false; + } + + final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen); if ( randomSlot < targetSampleSize ) { reservoir.set(randomSlot, newRead); } @@ -93,10 +129,15 @@ public class ReservoirDownsampler implements ReadsDownsampl } public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - List downsampledItems = reservoir; - clear(); - return downsampledItems; + if ( reservoir.isEmpty() ) { + // if there's nothing here, don't both allocating a new list completely + return Collections.emptyList(); + } else { + // pass by reference rather than make a copy, for speed + List downsampledItems = reservoir; + clear(); + return downsampledItems; + } } public boolean hasPendingItems() { @@ -119,9 +160,18 @@ public class ReservoirDownsampler implements ReadsDownsampl // NO-OP } + /** + * Clear the data structures used to hold information + */ public void clear() { - reservoir = new ArrayList(targetSampleSize); - totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below + // if we aren't expecting many overflows, allocate a linked list not an arraylist + reservoir = expectFewOverflows ? 
new LinkedList() : new ArrayList(targetSampleSize); + + // it's a linked list if we allocate one + isLinkedList = expectFewOverflows; + + // an internal stat used by the downsampling process, so not cleared by reset() below + totalReadsSeen = 0; } public void reset() { diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java index 7dada292b..9bb474e4d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/SamplePartitioner.java @@ -62,16 +62,17 @@ class SamplePartitioner { * will throw an exception. Duplicates in the list of samples will be ignored * * @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage? - * @param samples the complete list of samples we're going to partition reads into + * @param samples the complete list of samples we're going to partition reads into. Can be + * empty, but in that case this code cannot function properly if you + * attempt to add data to it. */ @Ensures({ "readsBySample != null", - "! 
readsBySample.isEmpty()", "readsBySample.size() == new HashSet(samples).size()" }) public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List samples) { if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null"); - if ( samples == null || samples.isEmpty() ) throw new IllegalArgumentException("samples must be a non-null, non-empty list but got " + samples); + if ( samples == null ) throw new IllegalArgumentException("samples must be a non-null list"); readsBySample = new LinkedHashMap>(samples.size()); for ( final String sample : samples ) { @@ -89,7 +90,7 @@ class SamplePartitioner { @Ensures("result != null") private Downsampler createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { return LIBSDownsamplingInfo.isPerformDownsampling() - ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage()) + ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage(), true) : new PassThroughDownsampler(); } From b8b2b9b2de6270e1aead4f17ecf01b27d7f123f7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 13 Jan 2013 20:44:28 -0500 Subject: [PATCH 09/34] ManagingReferenceOrderedView optimization: don't allow a fresh RefMetaDataTracker in the frequent case where there's no reference meta data --- .../providers/ManagingReferenceOrderedView.java | 14 +++++++++----- .../sting/gatk/refdata/RefMetaDataTracker.java | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java index 7d3cac33d..09b72f5eb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java @@ -77,13 +77,17 @@ public class 
ManagingReferenceOrderedView implements ReferenceOrderedView { * @return A tracker containing information about this locus. */ public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { - List bindings = states.isEmpty() ? Collections.emptyList() : new ArrayList(states.size()); + if ( states.isEmpty() ) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + List bindings = new ArrayList(states.size()); - for ( ReferenceOrderedDataState state: states ) - // todo -- warning, I removed the reference to the name from states - bindings.add( state.iterator.seekForward(loc) ); + for ( ReferenceOrderedDataState state: states ) + // todo -- warning, I removed the reference to the name from states + bindings.add( state.iterator.seekForward(loc) ); - return new RefMetaDataTracker(bindings); + return new RefMetaDataTracker(bindings); + } } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 9cb38b840..5a1b015fe 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -61,6 +61,7 @@ public class RefMetaDataTracker { final Map bindings; final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); + public final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); // ------------------------------------------------------------------------------------------ // From 39bc9e999d8215486708a77a8cff31b9084f7dca Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 14 Jan 2013 08:34:42 -0500 Subject: [PATCH 10/34] Add a test to LocusIteratorByState to ensure that we aren't holding reads anywhere -- Run an iterator with 100Ks of reads, each carrying MBs of byte[] data, through LIBS, all starting at the same position. 
Will crash with an out-of-memory error if we're holding reads anywhere in the system. -- Is there a better way to test this behavior? --- .../LocusIteratorByStateUnitTest.java | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 7ae2d97a1..37494903c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.locusiterator; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -577,4 +578,94 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { } } } + + // --------------------------------------------------------------------------- + // make sure that downsampling isn't holding onto a bazillion reads + // + @DataProvider(name = "LIBS_NotHoldingTooManyReads") + public Object[][] makeLIBS_NotHoldingTooManyReads() { + final List tests = new LinkedList(); + + for ( final int downsampleTo : Arrays.asList(1, 10)) { + for ( final int nReadsPerLocus : Arrays.asList(100, 1000, 10000, 100000) ) { + for ( final int payloadInBytes : Arrays.asList(0, 1024, 1024*1024) ) { + tests.add(new Object[]{nReadsPerLocus, downsampleTo, payloadInBytes}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! 
DEBUG, dataProvider = "LIBS_NotHoldingTooManyReads") +// @Test(enabled = true, dataProvider = "LIBS_NotHoldingTooManyReads", timeOut = 100000) + public void testLIBS_NotHoldingTooManyReads(final int nReadsPerLocus, final int downsampleTo, final int payloadInBytes) { + logger.warn(String.format("testLIBS_NotHoldingTooManyReads %d %d %d", nReadsPerLocus, downsampleTo, payloadInBytes)); + final int readLength = 10; + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); + final int nSamples = 1; + final List samples = new ArrayList(nSamples); + for ( int i = 0; i < nSamples; i++ ) { + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); + final String sample = "sample" + i; + samples.add(sample); + rg.setSample(sample); + rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); + header.addReadGroup(rg); + } + + final boolean downsample = downsampleTo != -1; + final DownsamplingMethod downsampler = downsample + ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) + : new DownsamplingMethod(DownsampleType.NONE, null, null, false); + + // final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); + + final WeakReadTrackingIterator iterator = new WeakReadTrackingIterator(nReadsPerLocus, readLength, payloadInBytes, header); + + li = new LocusIteratorByState(iterator, + createTestReadProperties(downsampler, false), + genomeLocParser, + samples); + + while ( li.hasNext() ) { + final AlignmentContext next = li.next(); + Assert.assertTrue(next.getBasePileup().getNumberOfElements() <= downsampleTo, "Too many elements in pileup " + next); + // TODO -- assert that there are <= X reads in memory after GC for some X + } + } + + private static class WeakReadTrackingIterator implements Iterator { + final int nReads, readLength, payloadInBytes; + int readI = 0; + final SAMFileHeader header; + + private WeakReadTrackingIterator(int nReads, int readLength, final 
int payloadInBytes, final SAMFileHeader header) { + this.nReads = nReads; + this.readLength = readLength; + this.header = header; + this.payloadInBytes = payloadInBytes; + } + + @Override public boolean hasNext() { return readI < nReads; } + @Override public void remove() { throw new UnsupportedOperationException("no remove"); } + + @Override + public GATKSAMRecord next() { + readI++; + return makeRead(); + } + + private GATKSAMRecord makeRead() { + final SAMReadGroupRecord rg = header.getReadGroups().get(0); + final String readName = String.format("%s.%d.%s", "read", readI, rg.getId()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, 1, readLength); + read.setReadGroup(new GATKSAMReadGroupRecord(rg)); + if ( payloadInBytes > 0 ) + // add a payload byte array to push memory use per read even higher + read.setAttribute("PL", new byte[payloadInBytes]); + return read; + } + } } From 94800771e3c48e39fcef5280e5c31919031e8066 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 15 Jan 2013 10:19:18 -0500 Subject: [PATCH 11/34] 1. Initial implementation of bam writing for the HaplotypeCaller with -bam argument; currently only assembled haplotypes are emitted. 2. Framework is set up in the VariantAnnotator for the HaplotypeCaller to be able to call in to annotate dbSNP plus comp RODs. Until the HC uses meta data though, this won't work. 
--- .../annotator/VariantAnnotatorEngine.java | 27 ++++++--- .../haplotypecaller/HaplotypeCaller.java | 57 ++++++++++++++++++- .../HaplotypeCallerIntegrationTest.java | 5 ++ .../broadinstitute/sting/utils/Haplotype.java | 12 ++++ 4 files changed, 92 insertions(+), 9 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 99dadea54..f03a25c04 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -214,10 +215,10 @@ public class VariantAnnotatorEngine { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences - vc = annotateDBs(tracker, ref, vc, infoAnnotations); + vc = annotateDBs(tracker, ref.getLocus(), vc, infoAnnotations); // annotate expressions where available - annotateExpressions(tracker, ref, infoAnnotations); + annotateExpressions(tracker, ref.getLocus(), infoAnnotations); // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { @@ -254,10 +255,22 @@ public class VariantAnnotatorEngine { return builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make(); } - private VariantContext 
annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { + public VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc) { + final Map newInfoAnnotations = new HashMap(0); + vc = annotateDBs(tracker, loc, vc, newInfoAnnotations); + + if ( !newInfoAnnotations.isEmpty() ) { + final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(newInfoAnnotations); + vc = builder.make(); + } + + return vc; + } + + private VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc, final Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { - final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), loc), vc.getType()); // add the ID if appropriate if ( rsID != null ) { @@ -273,7 +286,7 @@ public class VariantAnnotatorEngine { } } else { boolean overlapsComp = false; - for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { + for ( VariantContext comp : tracker.getValues(dbSet.getKey(), loc) ) { if ( !comp.isFiltered() && ( !requireStrictAlleleMatch || comp.getAlleles().equals(vc.getAlleles()) ) ) { overlapsComp = true; break; @@ -287,9 +300,9 @@ public class VariantAnnotatorEngine { return vc; } - private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map infoAnnotations) { + private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map infoAnnotations) { for ( VAExpression expression : requestedExpressions ) { - Collection VCs = tracker.getValues(expression.binding, ref.getLocus()); + Collection VCs = tracker.getValues(expression.binding, loc); if ( VCs.size() == 0 ) continue; diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 439a9b3b8..00db62bff 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,6 +47,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; +import com.sun.corba.se.impl.logging.UtilSystemException; +import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -57,6 +59,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -67,6 +70,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; @@ -142,6 +146,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Output(fullName="graphOutput", 
shortName="graph", doc="File to which debug assembly graph information should be written", required = false) protected PrintStream graphWriter = null; + /** + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. Note that the output here + * does not include uninformative reads so that not every input read is emitted to the bam. + */ + @Hidden + @Output(fullName="bamOutput", shortName="bam", doc="File to which assembled haplotypes should be written", required = false) + protected StingSAMFileWriter bamWriter = null; + private SAMFileHeader bamHeader = null; + private long uniqueNameCounter = 1; + private final String readGroupId = "ArtificialHaplotype"; + /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. */ @@ -242,6 +257,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // the genotyping engine private GenotypingEngine genotypingEngine = null; + private VariantAnnotatorEngine annotationEngine = null; + // fasta reference reader to supplement the edges of the reference sequence private CachingIndexedFastaSequenceFile referenceReader; @@ -286,7 +303,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header - final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); Set headerInfo = new HashSet(); @@ -320,6 +337,21 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, 
graphWriter, minKmer ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); + + if ( bamWriter != null ) { + // prepare the bam header + bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); + final List readGroups = new ArrayList(1); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + bamWriter.writeHeader(bamHeader); + bamWriter.setPresorted(true); + } } //--------------------------------------------------------------------------------------------------------------- @@ -408,7 +440,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- @Override - public Integer map( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { + public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { if ( justDetermineActiveRegions ) // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return 1; @@ -461,9 +493,30 @@ public class HaplotypeCaller extends ActiveRegionWalker implem activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { + annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call); vcfWriter.add( call ); } + if ( bamWriter != null ) { + Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() ); + final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); + for ( 
Haplotype haplotype : haplotypes ) { + // TODO -- clean up this code + final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(haplotype.getCigar()); + record.setMappingQuality(bestHaplotypes.contains(haplotype) ? 60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(activeRegion.getReferenceLoc().getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), readGroupId); + record.setFlags(16); + bamWriter.addAlignment(record); + } + } + if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } return 1; // One active region was processed during this map call diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8f5e275e6..e39975ea0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -75,6 +75,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } + @Test(enabled = false) + public void testHaplotypeCallerSingleSampleWithDbsnp() { + HCTest(NA12878_BAM, "-D " + b37dbSNP132, ""); + } + @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", diff --git 
a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index efe9460cb..2706f2f99 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -191,6 +191,10 @@ public class Haplotype { public static class HaplotypeBaseComparator implements Comparator, Serializable { @Override public int compare( final Haplotype hap1, final Haplotype hap2 ) { + return compareHaplotypeBases(hap1, hap2); + } + + public static int compareHaplotypeBases(final Haplotype hap1, final Haplotype hap2) { final byte[] arr1 = hap1.getBases(); final byte[] arr2 = hap2.getBases(); // compares byte arrays using lexical ordering @@ -203,6 +207,14 @@ public class Haplotype { } } + public static class HaplotypePositionComparator implements Comparator, Serializable { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + final int comp = hap1.getAlignmentStartHapwrtRef() - hap2.getAlignmentStartHapwrtRef(); + return comp == 0 ? 
HaplotypeBaseComparator.compareHaplotypeBases(hap1, hap2) : comp; + } + } + public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, From 3c37ea014b91a57e55e56b0ac93033f7e3597ac8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 15 Jan 2013 10:24:45 -0500 Subject: [PATCH 12/34] Retire original TraverseActiveRegion, leaving only the new optimized version -- Required some updates to MD5s, which was unexpected, and will be sorted out later with more detailed unit tests --- .../HaplotypeCallerIntegrationTest.java | 12 +- .../sting/gatk/GenomeAnalysisEngine.java | 2 +- .../arguments/GATKArgumentCollection.java | 5 - .../sting/gatk/executive/MicroScheduler.java | 7 +- .../traversals/TraverseActiveRegions.java | 214 ++++++- .../TraverseActiveRegionsOptimized.java | 253 --------- .../TraverseActiveRegionsOriginal.java | 262 --------- ...TraverseActiveRegionsOriginalUnitTest.java | 523 ------------------ ...ava => TraverseActiveRegionsUnitTest.java} | 6 +- 9 files changed, 214 insertions(+), 1070 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java rename public/java/test/org/broadinstitute/sting/gatk/traversals/{TraverseActiveRegionsOptimizedUnitTest.java => TraverseActiveRegionsUnitTest.java} (99%) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8f5e275e6..780934c03 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -67,18 +67,18 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); + HCTest(CEUTRIO_BAM, "", "1e2671557b01ad0497557097282965fc"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); + HCTest(NA12878_BAM, "", "2bd237a7e1e63eebe755dbe7963e430a"); } @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "c679ae7f04bdfda896b5c046d35e043c"); + "a938cdd7262968597fc8eb6c1c0a69f1"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "1a034b7eb572e1b6f659d6e5d57b3e76"); + "d590c8d6d5e58d685401b65a23846893"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -129,7 +129,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "50a26224b9e863ee47a0619eb54a0323"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I 
%s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4439496472eb1e2f5c91b30ba525be37")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index a5926aeae..f9d6955c0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -842,7 +842,7 @@ public class GenomeAnalysisEngine { if (argCollection.keepProgramRecords) removeProgramRecords = false; - final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker && argCollection.newART; + final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; return new SAMDataSource( samReaderIDs, diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b6f0d5f90..ab09064dd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -448,10 +448,5 @@ public class GATKArgumentCollection { @Hidden public boolean generateShadowBCF = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - @Hidden - @Argument(fullName="newART", shortName = "newART", doc = "use the new ART traversal", required=false) - public boolean newART = false; - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java 
b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index c127899f6..371cce778 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -245,12 +245,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } else if (walker instanceof ReadPairWalker) { return new TraverseReadPairs(); } else if (walker instanceof ActiveRegionWalker) { - if ( engine.getArguments().newART ) { - // todo -- create optimized traversal - return new TraverseActiveRegionsOptimized(); - } else { - return new TraverseActiveRegionsOriginal(); - } + return new TraverseActiveRegions(); } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 45dbb6dc8..03aaf95f2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; @@ -43,8 +44,7 @@ import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import 
java.util.LinkedList; -import java.util.List; +import java.util.*; /** * Created with IntelliJ IDEA. @@ -53,7 +53,7 @@ import java.util.List; * Time: 4:45 PM * To change this template use File | Settings | File Templates. */ -public abstract class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { +public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { protected final static boolean DEBUG = false; // set by the tranversal @@ -66,14 +66,6 @@ public abstract class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); - abstract protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker); - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public abstract T endTraversal(final Walker walker, T sum); - protected int getActiveRegionExtension() { return activeRegionExtension; } @@ -160,4 +152,204 @@ public abstract class TraverseActiveRegions extends TraversalEngine myReads = new LinkedList(); + private Shard lastShard = null; + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); + + final HashSet maybeDuplicatedReads = new HashSet(); + // TODO -- there's got to be a better way to know this + if ( lastShard != dataProvider.getShard() ) { + maybeDuplicatedReads.addAll(myReads); + logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); + if ( DEBUG ) logger.warn("Clearing myReads"); + } + lastShard = dataProvider.getShard(); + + final LocusView locusView = new AllLocusView(dataProvider); + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); 
+ + final List activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); + for( final GATKSAMRecord read : reads ) { + notifyOfCurrentPosition(read); + // most of the time maybeDuplicatedReads is empty + // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the + // TODO -- potential list of duplicates we can clear the hashset + if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { + if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); + } else { + if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); + myReads.add((GATKSAMRecord)read); + } + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions); + } + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. 
Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); + } + + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." 
); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, false); + + return sum; + } + + private GenomeLoc startOfLiveRegion = null; + + protected void notifyOfCurrentPosition(final GATKSAMRecord read) { + notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read)); + } + + protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { + if ( startOfLiveRegion == null ) + startOfLiveRegion = currentLocation; + else + startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); + } + + protected GenomeLoc getStartOfLiveRegion() { + return startOfLiveRegion; + } + + protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { + return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) + || ! region.onSameContig(getStartOfLiveRegion()); + } + + private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); + } + } + + private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + // Since we've traversed sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) { + final ActiveRegion activeRegion = workQueue.remove(); + if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion()); + sum = processActiveRegion( activeRegion, sum, walker ); + } else { + break; + } + } + + return sum; + } + + @Override + public String toString() { + return "TraverseActiveRegions"; + } + + private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) { + return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true); + } + + protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { + final Iterator liveReads = myReads.iterator(); + while ( liveReads.hasNext() ) { + boolean killed = false; + final GATKSAMRecord read = liveReads.next(); + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + activeRegion.add(read); + + if ( ! walker.wantsNonPrimaryReads() ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); + liveReads.remove(); + killed = true; + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + + if ( ! 
killed && readIsDead(read, readLoc, activeRegion) ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); + liveReads.remove(); + } + } + + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + final M x = walker.map(activeRegion, null); + return walker.reduce( x, sum ); + } + + + /** + * Special function called in LinearMicroScheduler to empty out the work queue. + * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal(final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, true); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java deleted file mode 100644 index 809c7ea6a..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: rpoplin - * Date: 12/9/11 - */ - -public class TraverseActiveRegionsOptimized extends TraverseActiveRegions { - private LinkedList myReads = new LinkedList(); - private Shard lastShard = null; - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final HashSet maybeDuplicatedReads = new HashSet(); - // TODO -- there's got to be a better way to know this - if ( lastShard != dataProvider.getShard() ) { - maybeDuplicatedReads.addAll(myReads); - logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); - if ( DEBUG ) logger.warn("Clearing myReads"); - } - lastShard = dataProvider.getShard(); - - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); - for( final GATKSAMRecord read : reads ) { - notifyOfCurrentPosition(read); - // most of the time maybeDuplicatedReads is 
empty - // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the - // TODO -- potential list of duplicates we can clear the hashset - if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { - if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); - } else { - if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); - myReads.add((GATKSAMRecord)read); - } - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - updateCumulativeMetrics(dataProvider.getShard()); - - if ( ! 
profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) ); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, false); - - return sum; - } - - private GenomeLoc startOfLiveRegion = null; - - protected void notifyOfCurrentPosition(final GATKSAMRecord read) { - notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read)); - } - - protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { - if ( startOfLiveRegion == null ) - startOfLiveRegion = currentLocation; - else - startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); - } - - protected GenomeLoc getStartOfLiveRegion() { - return startOfLiveRegion; - } - - protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { - return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) - || ! 
region.onSameContig(getStartOfLiveRegion()); - } - - private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); - } - } - - private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) { - final ActiveRegion activeRegion = workQueue.remove(); - if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion()); - sum = processActiveRegion( activeRegion, sum, walker ); - } else { - break; - } - } - - return sum; - } - - @Override - public String toString() { - return "TraverseActiveRegionsOptimized"; - } - - private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) { - return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true); - } - - @Override - protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { - final Iterator liveReads = myReads.iterator(); - while ( liveReads.hasNext() ) { - boolean killed = false; - final GATKSAMRecord read = liveReads.next(); - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - activeRegion.add(read); - - if ( ! 
walker.wantsNonPrimaryReads() ) { - if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); - liveReads.remove(); - killed = true; - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - - if ( ! killed && readIsDead(read, readLoc, activeRegion) ) { - if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); - liveReads.remove(); - } - } - - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map(activeRegion, null); - return walker.reduce( x, sum ); - } - - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - @Override - public T endTraversal(final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, true); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java deleted file mode 100644 index 0786bc800..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the 
Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.traversals; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: rpoplin - * Date: 12/9/11 - */ - -public class TraverseActiveRegionsOriginal extends TraverseActiveRegions { - private final LinkedHashSet myReads = new LinkedHashSet(); - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); - final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - - int minStart = Integer.MAX_VALUE; - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - // TODO -- this whole HashSet logic should be changed to a linked list of reads with - // TODO -- subsequent pass over them to find the ones overlapping the active regions - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); - } - - // If this is the last pileup for this shard calculate the minimum alignment 
start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - updateCumulativeMetrics(dataProvider.getShard()); - - if ( ! 
profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); - - return sum; - } - - /** - * Take the individual isActive calls and integrate them into contiguous active regions and - * add these blocks of work to the work queue - * band-pass filter the list of isActive probabilities and turn into active regions - * - * @param profile - * @param activeRegions - * @param activeRegionExtension - * @param maxRegionSize - * @return - */ - private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, - final List activeRegions, - final int activeRegionExtension, - final int maxRegionSize) { - if ( profile.isEmpty() ) - throw new IllegalStateException("trying to incorporate an empty active profile " + profile); - - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); - return new ActivityProfile( engine.getGenomeLocParser(), 
profile.hasPresetRegions() ); - } - - // -------------------------------------------------------------------------------- - // - // code to handle processing active regions - // - // -------------------------------------------------------------------------------- - - private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); - } - } - - private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, sum, walker ); - } else { - break; - } - } - - return sum; - } - - @Override - protected T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { - final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : myReads ) { - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - ActiveRegion bestRegion = activeRegion; - for( final ActiveRegion otherRegionToTest : workQueue ) { 
- if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); - bestRegion = otherRegionToTest; - } - } - bestRegion.add( read ); - - // The read is also added to all other regions in which it overlaps but marked as non-primary - if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(activeRegion) ) { - activeRegion.add( read ); - } - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( !bestRegion.equals(otherRegionToTest) ) { - // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } - } - } - } - placedReads.add( read ); - // check for non-primary vs. extended - } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - activeRegion.add( read ); - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - } - myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region - // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. - - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map( activeRegion, null ); - return walker.reduce( x, sum ); - } - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. 
- * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public T endTraversal( final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker) walker, sum, Integer.MAX_VALUE, null); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java deleted file mode 100644 index 35a0931df..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.traversals; - -import com.google.java.contract.PreconditionError; -import net.sf.samtools.*; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - - -/** - 
* Created with IntelliJ IDEA. - * User: depristo - * Date: 1/10/13 - * Time: 8:03 PM - * To change this template use File | Settings | File Templates. - */ -public class TraverseActiveRegionsOriginalUnitTest extends BaseTest { - - private class DummyActiveRegionWalker extends ActiveRegionWalker { - private final double prob; - private EnumSet states = super.desiredReadStates(); - - protected List isActiveCalls = new ArrayList(); - protected Map mappedActiveRegions = new HashMap(); - - public DummyActiveRegionWalker() { - this.prob = 1.0; - } - - public DummyActiveRegionWalker(double constProb) { - this.prob = constProb; - } - - public DummyActiveRegionWalker(EnumSet wantStates) { - this.prob = 1.0; - this.states = wantStates; - } - - @Override - public EnumSet desiredReadStates() { - return states; - } - - @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - isActiveCalls.add(ref.getLocus()); - return new ActivityProfileResult(ref.getLocus(), prob); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - - private final TraverseActiveRegions t = new TraverseActiveRegionsOriginal(); - - private IndexedFastaSequenceFile reference; - private SAMSequenceDictionary dictionary; - private GenomeLocParser genomeLocParser; - - private List intervals; - - private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; - private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; - - @BeforeClass - private void init() throws FileNotFoundException { - reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); - dictionary = reference.getSequenceDictionary(); - genomeLocParser = new 
GenomeLocParser(dictionary); - - // TODO: reads with indels - // TODO: reads which span many regions - // TODO: reads which are partially between intervals (in/outside extension) - // TODO: duplicate reads - // TODO: read at the end of a contig - // TODO: reads which are completely outside intervals but within extension - // TODO: test the extension itself - // TODO: unmapped reads - - intervals = new ArrayList(); - intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); - intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); - intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); - - List reads = new ArrayList(); - reads.add(buildSAMRecord("simple", "1", 100, 200)); - reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); - reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); - reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); - reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); - reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000)); - reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); - reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); - reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); - reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); - reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); - reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); - reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); - - createBAM(reads); - } - - private void createBAM(List reads) { - File outFile = new File(testBAM); - 
outFile.deleteOnExit(); - File indexFile = new File(testBAI); - indexFile.deleteOnExit(); - - SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile); - for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { - out.addAlignment(read); - } - out.close(); - } - - @Test - public void testAllBasesSeen() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - List activeIntervals = getIsActiveIntervals(walker, intervals); - // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call - verifyEqualIntervals(intervals, activeIntervals); - } - - private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { - List activeIntervals = new ArrayList(); - for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) { - t.traverse(walker, dataProvider, 0); - activeIntervals.addAll(walker.isActiveCalls); - } - - return activeIntervals; - } - - @Test (expectedExceptions = PreconditionError.class) - public void testIsActiveRangeLow () { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); - getActiveRegions(walker, intervals).values(); - } - - @Test (expectedExceptions = PreconditionError.class) - public void testIsActiveRangeHigh () { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); - getActiveRegions(walker, intervals).values(); - } - - @Test - public void testActiveRegionCoverage() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - Collection activeRegions = getActiveRegions(walker, intervals).values(); - verifyActiveRegionCoverage(intervals, activeRegions); - } - - private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { - List intervalStarts = new ArrayList(); - List intervalStops = new ArrayList(); - - for (GenomeLoc interval : intervals) { - intervalStarts.add(interval.getStartLocation()); - 
intervalStops.add(interval.getStopLocation()); - } - - Map baseRegionMap = new HashMap(); - - for (ActiveRegion activeRegion : activeRegions) { - for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { - // Contract: Regions do not overlap - Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); - baseRegionMap.put(activeLoc, activeRegion); - } - - GenomeLoc start = activeRegion.getLocation().getStartLocation(); - if (intervalStarts.contains(start)) - intervalStarts.remove(start); - - GenomeLoc stop = activeRegion.getLocation().getStopLocation(); - if (intervalStops.contains(stop)) - intervalStops.remove(stop); - } - - for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { - // Contract: Each location in the interval(s) is in exactly one region - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); - baseRegionMap.remove(baseLoc); - } - - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); - - // Contract: All explicit interval boundaries must also be region boundaries - Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); - Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); - } - - @Test - public void testActiveRegionExtensionOnContig() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - Collection activeRegions = getActiveRegions(walker, intervals).values(); - for (ActiveRegion activeRegion : activeRegions) { - GenomeLoc loc = activeRegion.getExtendedLoc(); - - // Contract: active region extensions must stay on the contig - 
Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); - int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); - Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); - } - } - - @Test - public void testPrimaryReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_unequal", "extended_and_np", "boundary_1_pre"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", 
"boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testNonPrimaryReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = 
activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testExtendedReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // 
shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testUnmappedReads() { - // TODO - } - - private void verifyReadMapping(ActiveRegion region, String... 
reads) { - Collection wantReads = new ArrayList(Arrays.asList(reads)); - for (SAMRecord read : region.getReads()) { - String regionReadName = read.getReadName(); - Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " assigned to active region " + region); - wantReads.remove(regionReadName); - } - - Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region); - } - - private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) - t.traverse(walker, dataProvider, 0); - - t.endTraversal(walker, 0); - - return walker.mappedActiveRegions; - } - - private Collection toSingleBaseLocs(GenomeLoc interval) { - List bases = new ArrayList(); - if (interval.size() == 1) - bases.add(interval); - else { - for (int location = interval.getStart(); location <= interval.getStop(); location++) - bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); - } - - return bases; - } - - private Collection toSingleBaseLocs(List intervals) { - Set bases = new TreeSet(); // for sorting and uniqueness - for (GenomeLoc interval : intervals) - bases.addAll(toSingleBaseLocs(interval)); - - return bases; - } - - private void verifyEqualIntervals(List aIntervals, List bIntervals) { - Collection aBases = toSingleBaseLocs(aIntervals); - Collection bBases = toSingleBaseLocs(bIntervals); - - Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); - - Iterator aIter = aBases.iterator(); - Iterator bIter = bBases.iterator(); - while (aIter.hasNext() && bIter.hasNext()) { - GenomeLoc aLoc = aIter.next(); - GenomeLoc bLoc = bIter.next(); - Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. 
" + bLoc); - } - } - - // copied from LocusViewTemplate - protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { - SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); - header.setSequenceDictionary(dictionary); - header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - GATKSAMRecord record = new GATKSAMRecord(header); - - record.setReadName(readName); - record.setReferenceIndex(dictionary.getSequenceIndex(contig)); - record.setAlignmentStart(alignmentStart); - - Cigar cigar = new Cigar(); - int len = alignmentEnd - alignmentStart + 1; - cigar.add(new CigarElement(len, CigarOperator.M)); - record.setCigar(cigar); - record.setReadString(new String(new char[len]).replace("\0", "A")); - record.setBaseQualities(new byte[len]); - - return record; - } - - private List createDataProviders(final Walker walker, List intervals, String bamFile) { - GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setGenomeLocParser(genomeLocParser); - t.initialize(engine, walker); - - Collection samFiles = new ArrayList(); - SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); - samFiles.add(readerID); - - SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser); - - List providers = new ArrayList(); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { - providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); - } - } - - return providers; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 038cd2853..c4dadbcce 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -76,7 +76,7 @@ import java.util.*; * Test the Active Region Traversal Contract * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract */ -public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { +public class TraverseActiveRegionsUnitTest extends BaseTest { private final static boolean ENFORCE_CONTRACTS = false; private final static boolean DEBUG = false; @@ -131,7 +131,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { @DataProvider(name = "TraversalEngineProvider") public Object[][] makeTraversals() { final List traversals = new LinkedList(); - traversals.add(new Object[]{new TraverseActiveRegionsOptimized()}); + traversals.add(new Object[]{new TraverseActiveRegions()}); return traversals.toArray(new Object[][]{}); } @@ -537,7 +537,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, t instanceof TraverseActiveRegionsOptimized); + false, (byte)30, false, true); List providers = new ArrayList(); for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { From d3baa4b8cac086fb433b2e408e1165713b9ff0b7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 15 Jan 2013 11:36:20 -0500 Subject: [PATCH 13/34] Have Haplotype extend the Allele class. 
This way, we don't need to create a new Allele for every read/Haplotype pair to be placed in the PerReadAlleleLikelihoodMap (very inefficient). Also, now we can easily get the Haplotype associated with the best allele for a given read. --- .../haplotypecaller/HaplotypeCaller.java | 87 ++++++++++++------- .../LikelihoodCalculationEngine.java | 3 +- .../broadinstitute/sting/utils/Haplotype.java | 64 +++++++------- .../variant/variantcontext/Allele.java | 4 +- 4 files changed, 93 insertions(+), 65 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 00db62bff..04da91f65 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import com.sun.corba.se.impl.logging.UtilSystemException; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -155,7 +154,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected StingSAMFileWriter bamWriter = null; private SAMFileHeader bamHeader = null; private long uniqueNameCounter = 1; - private final String readGroupId = "ArtificialHaplotype"; + private final static String readGroupId = "ArtificialHaplotype"; /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. 
@@ -338,20 +337,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); - if ( bamWriter != null ) { - // prepare the bam header - bamHeader = new SAMFileHeader(); - bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); - final List readGroups = new ArrayList(1); - final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); - rg.setSample("HC"); - rg.setSequencingCenter("BI"); - readGroups.add(rg); - bamHeader.setReadGroups(readGroups); - bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - bamWriter.writeHeader(bamHeader); - bamWriter.setPresorted(true); - } + if ( bamWriter != null ) + setupBamWriter(); } //--------------------------------------------------------------------------------------------------------------- @@ -461,8 +448,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! 
finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails - final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader)); // Create the reference haplotype which is the bases from the reference that make up the active region - referenceHaplotype.setIsReference(true); + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); //int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion )); final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype ); @@ -498,22 +484,19 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } if ( bamWriter != null ) { + // sort the haplotypes in coordinate order and then write them to the bam Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() ); final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); - for ( Haplotype haplotype : haplotypes ) { - // TODO -- clean up this code - final GATKSAMRecord record = new GATKSAMRecord(bamHeader); - record.setReadBases(haplotype.getBases()); - record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); - record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); - record.setCigar(haplotype.getCigar()); - record.setMappingQuality(bestHaplotypes.contains(haplotype) ? 
60 : 0); - record.setReadName("HC" + uniqueNameCounter++); - record.setReadUnmappedFlag(false); - record.setReferenceIndex(activeRegion.getReferenceLoc().getContigIndex()); - record.setAttribute(SAMTag.RG.toString(), readGroupId); - record.setFlags(16); - bamWriter.addAlignment(record); + for ( Haplotype haplotype : haplotypes ) + writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); + + // now, output the interesting reads for each sample + for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + if ( bestAllele != Allele.NO_CALL ) + writeReadAgainstHaplotype(entry.getKey(), (Haplotype)bestAllele); + } } } @@ -608,6 +591,46 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } return returnMap; + } + + private void setupBamWriter() { + // prepare the bam header + bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + + // include the original read groups plus a new artificial one for the haplotypes + final List readGroups = new ArrayList(getToolkit().getSAMFileHeader().getReadGroups()); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + + bamWriter.writeHeader(bamHeader); + bamWriter.setPresorted(true); + } + + private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { + final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + 
record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(haplotype.getCigar()); + record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(paddedRefLoc.getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), readGroupId); + record.setFlags(16); + bamWriter.addAlignment(record); + } + + private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype) { + + + + } /* diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 8b844817d..e05ad85a9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -138,6 +138,7 @@ public class LikelihoodCalculationEngine { readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated + // TODO -- why is Q18 hard-coded here??? readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); } @@ -151,7 +152,7 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 
0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()), + perReadAlleleLikelihoodMap.add(read, haplotype, pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 2706f2f99..4830bf053 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -37,59 +37,71 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; import java.util.*; -public class Haplotype { - protected final byte[] bases; +public class Haplotype extends Allele { protected final double[] quals; private GenomeLoc genomeLocation = null; private HashMap eventMap = null; - private boolean isRef = false; private Cigar cigar; private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; private Event artificialEvent = null; + /** + * Main constructor + * + * @param bases bases + * @param quals quals + * @param isRef is reference allele? 
+ */ + public Haplotype( final byte[] bases, final double[] quals, final boolean isRef ) { + super(bases.clone(), isRef); + this.quals = quals.clone(); + } + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * * @param bases bases * @param qual qual */ - public Haplotype( final byte[] bases, final int qual ) { - this.bases = bases.clone(); + public Haplotype( final byte[] bases, final int qual, final boolean isRef ) { + super(bases.clone(), isRef); quals = new double[bases.length]; Arrays.fill(quals, (double)qual); } + public Haplotype( final byte[] bases, final int qual ) { + this(bases, qual, false); + } + + public Haplotype( final byte[] bases, final boolean isRef ) { + this(bases, 0, isRef); + } + public Haplotype( final byte[] bases, final double[] quals ) { - this.bases = bases.clone(); - this.quals = quals.clone(); + this(bases, quals, false); } public Haplotype( final byte[] bases ) { - this(bases, 0); + this(bases, 0, false); } protected Haplotype( final byte[] bases, final Event artificialEvent ) { - this(bases, 0); + this(bases, 0, false); this.artificialEvent = artificialEvent; } public Haplotype( final byte[] bases, final GenomeLoc loc ) { - this(bases); + this(bases, 0, false); this.genomeLocation = loc; } @Override public boolean equals( Object h ) { - return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases); + return h instanceof Haplotype && super.equals(h); } - @Override - public int hashCode() { - return Arrays.hashCode(bases); - } - public HashMap getEventMap() { return eventMap; } @@ -98,17 +110,9 @@ public class Haplotype { this.eventMap = eventMap; } - public boolean isReference() { - return isRef; - } - - public void setIsReference( boolean isRef ) { - this.isRef = isRef; - } - public double getQualitySum() { double s = 0; - for (int k=0; k < bases.length; k++) { + for (int k=0; k < quals.length; k++) { s += quals[k]; } return s; @@ -116,14 +120,14 @@ public class 
Haplotype { @Override public String toString() { - return new String(bases); + return getDisplayString(); } public double[] getQuals() { return quals.clone(); } public byte[] getBases() { - return bases.clone(); + return super.getBases().clone(); } public long getStartPosition() { @@ -178,13 +182,13 @@ public class Haplotype { public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); - if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype + if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= getBases().length ) { // desired change falls inside deletion so don't bother creating a new haplotype return null; } byte[] newHaplotypeBases = new byte[]{}; - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), haplotypeInsertLocation + refAllele.length(), getBases().length)); // bases after the variant return new Haplotype(newHaplotypeBases, new Event(refAllele, 
altAllele, genomicInsertLocation)); } diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java index 33bca1a8a..0a0b4d0b7 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java @@ -111,7 +111,7 @@ public class Allele implements Comparable { /** A generic static NO_CALL allele for use */ // no public way to create an allele - private Allele(byte[] bases, boolean isRef) { + protected Allele(byte[] bases, boolean isRef) { // null alleles are no longer allowed if ( wouldBeNullAllele(bases) ) { throw new IllegalArgumentException("Null alleles are not supported"); @@ -140,7 +140,7 @@ public class Allele implements Comparable { throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'"); } - private Allele(String bases, boolean isRef) { + protected Allele(String bases, boolean isRef) { this(bases.getBytes(), isRef); } From 0d282a7750df16b154f87cc83e391188c70e1dab Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 00:12:02 -0500 Subject: [PATCH 15/34] Bam writing from HaplotypeCaller seems to be working on all my test cases. Note that it's a hidden debugging option for now. Please let me know if you notice any bad behavior with it. 
--- .../haplotypecaller/HaplotypeCaller.java | 60 +++++++++++++++++-- .../LikelihoodCalculationEngine.java | 4 -- .../broadinstitute/sting/utils/Haplotype.java | 8 --- 3 files changed, 54 insertions(+), 18 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 04da91f65..4da2e1179 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -484,18 +484,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } if ( bamWriter != null ) { - // sort the haplotypes in coordinate order and then write them to the bam - Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() ); + // write the haplotypes to the bam final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); for ( Haplotype haplotype : haplotypes ) writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); - // now, output the interesting reads for each sample + // next, output the interesting reads for each sample aligned against the appropriate haplotype for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), (Haplotype)bestAllele); + writeReadAgainstHaplotype(entry.getKey(), (Haplotype) bestAllele, paddedRefLoc.getStart()); } } } @@ -607,8 +606,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem readGroups.add(rg); bamHeader.setReadGroups(readGroups); + bamWriter.setPresorted(false); bamWriter.writeHeader(bamHeader); - 
bamWriter.setPresorted(true); } private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { @@ -626,11 +625,60 @@ public class HaplotypeCaller extends ActiveRegionWalker implem bamWriter.addAlignment(record); } - private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype) { + private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype, final int referenceStart) { + final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), read.getReadBases(), 5.0, -10.0, -22.0, -1.2); + final int readStartOnHaplotype = swPairwiseAlignment.getAlignmentStart2wrt1(); + final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; + read.setAlignmentStart(readStartOnReference); + final Cigar cigar = generateReadCigarFromHaplotype(read, readStartOnHaplotype, haplotype.getCigar()); + read.setCigar(cigar); + bamWriter.addAlignment(read); + } + private Cigar generateReadCigarFromHaplotype(final GATKSAMRecord read, final int readStartOnHaplotype, final Cigar haplotypeCigar) { + + int currentReadPos = 0; + int currentHapPos = 0; + final List readCigarElements = new ArrayList(); + + for ( final CigarElement cigarElement : haplotypeCigar.getCigarElements() ) { + + if ( cigarElement.getOperator() == CigarOperator.D ) { + if ( currentReadPos > 0 ) + readCigarElements.add(cigarElement); + } else if ( cigarElement.getOperator() == CigarOperator.M || cigarElement.getOperator() == CigarOperator.I ) { + + final int elementLength = cigarElement.getLength(); + final int nextReadPos = currentReadPos + elementLength; + final int nextHapPos = currentHapPos + elementLength; + + // do we want this element? + if ( currentReadPos > 0 ) { + // do we want the entire element? 
+ if ( nextReadPos < read.getReadLength() ) { + readCigarElements.add(cigarElement); + currentReadPos = nextReadPos; + } + // otherwise, we can finish up and return the cigar + else { + readCigarElements.add(new CigarElement(read.getReadLength() - currentReadPos, cigarElement.getOperator())); + return new Cigar(readCigarElements); + } + } + // do we want part of the element to start? + else if ( currentReadPos == 0 && nextHapPos > readStartOnHaplotype ) { + currentReadPos = Math.min(nextHapPos - readStartOnHaplotype, read.getReadLength()); + readCigarElements.add(new CigarElement(currentReadPos, cigarElement.getOperator())); + } + + currentHapPos = nextHapPos; + } + } + + return new Cigar(readCigarElements); } /* diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index e05ad85a9..aafdbf126 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -145,10 +145,6 @@ public class LikelihoodCalculationEngine { for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { final Haplotype haplotype = haplotypes.get(jjj); - // TODO -- need to test against a reference/position with non-standard bases - //if ( !Allele.acceptableAlleleBases(haplotype.getBases(), false) ) - // continue; - final int haplotypeStart = ( previousHaplotypeSeen == null ? 
0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 4830bf053..8c40b9972 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -211,14 +211,6 @@ public class Haplotype extends Allele { } } - public static class HaplotypePositionComparator implements Comparator, Serializable { - @Override - public int compare( final Haplotype hap1, final Haplotype hap2 ) { - final int comp = hap1.getAlignmentStartHapwrtRef() - hap2.getAlignmentStartHapwrtRef(); - return comp == 0 ? HaplotypeBaseComparator.compareHaplotypeBases(hap1, hap2) : comp; - } - } - public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, From 392b5cbcdfd5200f04d0b26f9e73d16399e17769 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 10:22:43 -0500 Subject: [PATCH 16/34] The CachingIndexedFastaSequenceFile now automatically converts IUPAC bases to Ns and errors out on other non-standard bases. This way walkers won't see anything except the standard bases plus Ns in the reference. Added option to turn off this feature (to maintain backwards compatibility). As part of this commit I cleaned up the BaseUtils code by adding a Base enum and removing all of the static indexes for each of the bases. This uncovered a bug in the way the DepthOfCoverage walker counts deletions (it was counting Ns instead!) that isn't covered by tests. Fortunately that walker is being deprecated soon... 
--- .../gatk/walkers/annotator/GCContent.java | 4 +- .../walkers/coverage/DepthOfCoverage.java | 2 +- .../coverage/DepthOfCoverageStats.java | 2 +- .../validation/ValidationAmplicons.java | 2 +- .../ConcordanceMetricsUnitTest.java | 48 ++++----- .../gatk/walkers/coverage/CoverageUtils.java | 6 +- .../walkers/coverage/GCContentByInterval.java | 2 +- .../CachingIndexedFastaSequenceFile.java | 44 +++++--- .../sting/utils/pileup/PileupElement.java | 3 +- .../sting/utils/sam/AlignmentUtils.java | 9 +- .../variant/utils/BaseUtils.java | 102 ++++++++++++------ .../variant/utils/BaseUtilsUnitTest.java | 15 +++ .../GenotypeLikelihoodsUnitTest.java | 6 +- 13 files changed, 153 insertions(+), 92 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 3bb3d7d5a..2b3290595 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -95,9 +95,9 @@ public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnota for ( byte base : ref.getBases() ) { int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex ) + if ( baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal() ) gc++; - else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex ) + else if ( baseIndex == BaseUtils.Base.A.ordinal() || baseIndex == BaseUtils.Base.T.ordinal() ) at++; else ; // ignore diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 1e4c55e0d..b10daab58 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -938,7 +938,7 @@ public class DepthOfCoverage extends LocusWalker { if ( lowerCaseSNPs ) { sequence.append(Character.toLowerCase((char) ref.getBase())); } else { - sequence.append((char) BaseUtils.N); + sequence.append((char) BaseUtils.Base.N.base); } rawSequence.append(Character.toUpperCase((char) ref.getBase())); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java index 28f128dd3..6db44efd5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -111,8 +111,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData1() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_C)); @@ -160,9 +160,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData2() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = 
GenotypeBuilder.create("test1_sample2", Arrays.asList(reference_A,alt_T)); @@ -213,10 +213,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData3() { - Allele reference_ACT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C,BaseUtils.T},true); - Allele alt_AC = Allele.create(new byte[]{BaseUtils.A,BaseUtils.C}); - Allele alt_A = Allele.create(BaseUtils.A); - Allele alt_ATT = Allele.create(new byte[]{BaseUtils.A,BaseUtils.T,BaseUtils.T}); + Allele reference_ACT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base,BaseUtils.Base.T.base},true); + Allele alt_AC = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.C.base}); + Allele alt_A = Allele.create(BaseUtils.Base.A.base); + Allele alt_ATT = Allele.create(new byte[]{BaseUtils.Base.A.base,BaseUtils.Base.T.base,BaseUtils.Base.T.base}); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_ACT,alt_ATT)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(alt_A,alt_A)); @@ -267,9 +267,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData4() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", Arrays.asList(Allele.NO_CALL,Allele.NO_CALL)); @@ -316,9 +316,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private Pair getData5() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); - Allele alt_T = Allele.create(BaseUtils.T); + Allele reference_A = 
Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); Genotype sam_1_1_eval = GenotypeBuilder.create("test1_sample1", Arrays.asList(reference_A,reference_A)); Genotype sam_1_2_eval = GenotypeBuilder.create("test1_sample2", new ArrayList(0)); @@ -368,8 +368,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { private List> getData6() { - Allele reference_A = Allele.create(BaseUtils.A,true); - Allele alt_C = Allele.create(BaseUtils.C); + Allele reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_C = Allele.create(BaseUtils.Base.C.base); // site 1 - @@ -396,8 +396,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite1 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - reference_A = Allele.create(BaseUtils.A,true); - Allele alt_T = Allele.create(BaseUtils.T); + reference_A = Allele.create(BaseUtils.Base.A.base,true); + Allele alt_T = Allele.create(BaseUtils.Base.T.base); // site 2 - // sample 1: no-call/hom-ref @@ -421,7 +421,7 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Pair testDataSite2 = new Pair(eval_1_builder.make(),truth_1_builder.make()); - Allele alt_G = Allele.create(BaseUtils.G); + Allele alt_G = Allele.create(BaseUtils.Base.G.base); // site 3 - // sample 1: alleles do not match @@ -605,10 +605,10 @@ public class ConcordanceMetricsUnitTest extends BaseTest { public List> getData7() { - Allele ref1 = Allele.create(BaseUtils.T,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.G); - Allele alt3 = Allele.create(BaseUtils.A); + Allele ref1 = Allele.create(BaseUtils.Base.T.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.G.base); + Allele alt3 = Allele.create(BaseUtils.Base.A.base); GenomeLoc loc1 = genomeLocParser.createGenomeLoc("chr1",1,1); VariantContextBuilder site1Eval = new 
VariantContextBuilder(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index 573291d06..fe2eee2a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -217,9 +217,9 @@ public class CoverageUtils { private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount(); - } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { - counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount(); + counts[BaseUtils.Base.D.ordinal()] += e.getRepresentativeCount(); + } else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) { + counts[BaseUtils.Base.N.ordinal()] += e.getRepresentativeCount(); } else { try { counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java index 9cd1be2d9..668d3fd5f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java @@ -86,7 +86,7 @@ public class GCContentByInterval extends LocusWalker { if (tracker == null) return null; int baseIndex = ref.getBaseIndex(); - return (baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex) ? 1L : 0L; + return (baseIndex == BaseUtils.Base.G.ordinal() || baseIndex == BaseUtils.Base.C.ordinal()) ? 
1L : 0L; } public Long reduce(Long toAdd, Long runningCount) { diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 3d43d5d4d..88eaa8910 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -33,6 +33,7 @@ import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.util.StringUtil; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.utils.BaseUtils; import java.io.File; import java.io.FileNotFoundException; @@ -41,9 +42,10 @@ import java.util.Arrays; /** * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * - * Thread-safe! Uses a thread-local cache + * Thread-safe! Uses a thread-local cache. * - * Automatically upper-cases the bases coming in, unless they the flag preserveCase is explicitly set + * Automatically upper-cases the bases coming in, unless the flag preserveCase is explicitly set. + * Automatically converts IUPAC bases to Ns, unless the flag preserveIUPAC is explicitly set. 
*/ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); @@ -64,10 +66,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { private final long cacheMissBackup; /** - * If true, we will preserve the case of the original base in the genome, not + * If true, we will preserve the case of the original base in the genome */ private final boolean preserveCase; + /** + * If true, we will preserve the IUPAC bases in the genome + */ + private final boolean preserveIUPAC; + // information about checking efficiency long cacheHits = 0; long cacheMisses = 0; @@ -97,13 +104,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param index the index of the fasta file, used for efficient random access * @param cacheSize the size in bp of the cache we will use for this reader * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + * @param preserveIUPAC If true, we will keep the IUPAC bases in the FASTA, otherwise they are converted to Ns */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) { super(fasta, index); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; + this.preserveIUPAC = preserveIUPAC; } /** @@ -122,19 +131,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 
1000, 1); this.preserveCase = preserveCase; + preserveIUPAC = false; } -// /** -// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. -// * -// * @param fasta The file to open. -// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. -// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. -// */ -// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { -// this(fasta, index, DEFAULT_CACHE_SIZE); -// } - /** * Same as general constructor but allows one to override the default cacheSize * @@ -145,7 +144,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size in bp of the cache we will use for this reader */ public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { - this(fasta, index, cacheSize, false); + this(fasta, index, cacheSize, false, false); } /** @@ -240,6 +239,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { return ! isPreservingCase(); } + /** + * Is this CachingIndexedFastaReader keeping the IUPAC bases in the fasta, or is it turning them into Ns? + * + * @return true if the IUPAC bases coming from this reader are not modified + */ + public boolean isPreservingIUPAC() { + return preserveIUPAC; + } + /** * Gets the subsequence of the contig in the range [start,stop] * @@ -261,8 +269,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); + if ( ! 
preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true); } else { - // todo -- potential optimization is to check if contig.name == contig, as this in generally will be true + // todo -- potential optimization is to check if contig.name == contig, as this in general will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); if (stop > contigInfo.getSequenceLength()) @@ -276,6 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { // convert all of the bases in the sequence to upper case if we aren't preserving cases if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true); } else { cacheHits++; } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index c0e18f227..5a5358208 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -52,7 +51,7 @@ public class PileupElement implements Comparable { private final static EnumSet ON_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); - public static final byte DELETION_BASE = BaseUtils.D; + public static final byte DELETION_BASE = BaseUtils.Base.D.base; public static final byte DELETION_QUAL = (byte) 16; public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; public static final byte 
C_FOLLOWED_BY_INSERTION_BASE = (byte) 88; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 0907a0239..b7a813ec2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -31,7 +31,6 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -402,13 +401,13 @@ public class AlignmentUtils { switch (ce.getOperator()) { case I: if (alignPos > 0) { - if (alignment[alignPos - 1] == BaseUtils.A) { + if (alignment[alignPos - 1] == BaseUtils.Base.A.base) { alignment[alignPos - 1] = PileupElement.A_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.C) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.C.base) { alignment[alignPos - 1] = PileupElement.C_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.T) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.T.base) { alignment[alignPos - 1] = PileupElement.T_FOLLOWED_BY_INSERTION_BASE; - } else if (alignment[alignPos - 1] == BaseUtils.G) { + } else if (alignment[alignPos - 1] == BaseUtils.Base.G.base) { alignment[alignPos - 1] = PileupElement.G_FOLLOWED_BY_INSERTION_BASE; } } diff --git a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java index 819041a3e..7a37e8de5 100644 --- a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java +++ 
b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java @@ -26,6 +26,7 @@ package org.broadinstitute.variant.utils; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.Random; @@ -34,42 +35,66 @@ import java.util.Random; * BaseUtils contains some basic utilities for manipulating nucleotides. */ public class BaseUtils { - public final static byte A = (byte) 'A'; - public final static byte C = (byte) 'C'; - public final static byte G = (byte) 'G'; - public final static byte T = (byte) 'T'; - public final static byte N = (byte) 'N'; - public final static byte D = (byte) 'D'; + public enum Base { + A ((byte)'A'), + C ((byte)'C'), + G ((byte)'G'), + T ((byte)'T'), + N ((byte)'N'), + D ((byte)'D'); - // - // todo -- we need a generalized base abstraction using the Base enum. - // + public byte base; + + private Base(final byte base) { + this.base = base; + } + } + + // todo -- add this to the generalized base abstraction using the Base enum. public final static byte[] BASES = {'A', 'C', 'G', 'T'}; public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; static private final int[] baseIndexMap = new int[256]; static { Arrays.fill(baseIndexMap, -1); - baseIndexMap['A'] = 0; - baseIndexMap['a'] = 0; - baseIndexMap['*'] = 0; // the wildcard character counts as an A - baseIndexMap['C'] = 1; - baseIndexMap['c'] = 1; - baseIndexMap['G'] = 2; - baseIndexMap['g'] = 2; - baseIndexMap['T'] = 3; - baseIndexMap['t'] = 3; + baseIndexMap['A'] = Base.A.ordinal(); + baseIndexMap['a'] = Base.A.ordinal(); + baseIndexMap['*'] = Base.A.ordinal(); // the wildcard character counts as an A + baseIndexMap['C'] = Base.C.ordinal(); + baseIndexMap['c'] = Base.C.ordinal(); + baseIndexMap['G'] = Base.G.ordinal(); + baseIndexMap['g'] = Base.G.ordinal(); + baseIndexMap['T'] = Base.T.ordinal(); + baseIndexMap['t'] = Base.T.ordinal(); } - // todo -- fix me (enums?) 
- public static final byte DELETION_INDEX = 4; - public static final byte NO_CALL_INDEX = 5; // (this is 'N') - - public static final int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A'); - public static final int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C'); - public static final int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G'); - public static final int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T'); + static private final int[] baseIndexWithIupacMap = baseIndexMap.clone(); + static { + baseIndexWithIupacMap['*'] = -1; // the wildcard character is bad + baseIndexWithIupacMap['N'] = Base.N.ordinal(); + baseIndexWithIupacMap['n'] = Base.N.ordinal(); + baseIndexWithIupacMap['R'] = Base.N.ordinal(); + baseIndexWithIupacMap['r'] = Base.N.ordinal(); + baseIndexWithIupacMap['Y'] = Base.N.ordinal(); + baseIndexWithIupacMap['y'] = Base.N.ordinal(); + baseIndexWithIupacMap['M'] = Base.N.ordinal(); + baseIndexWithIupacMap['m'] = Base.N.ordinal(); + baseIndexWithIupacMap['K'] = Base.N.ordinal(); + baseIndexWithIupacMap['k'] = Base.N.ordinal(); + baseIndexWithIupacMap['W'] = Base.N.ordinal(); + baseIndexWithIupacMap['w'] = Base.N.ordinal(); + baseIndexWithIupacMap['S'] = Base.N.ordinal(); + baseIndexWithIupacMap['s'] = Base.N.ordinal(); + baseIndexWithIupacMap['B'] = Base.N.ordinal(); + baseIndexWithIupacMap['b'] = Base.N.ordinal(); + baseIndexWithIupacMap['D'] = Base.N.ordinal(); + baseIndexWithIupacMap['d'] = Base.N.ordinal(); + baseIndexWithIupacMap['H'] = Base.N.ordinal(); + baseIndexWithIupacMap['h'] = Base.N.ordinal(); + baseIndexWithIupacMap['V'] = Base.N.ordinal(); + baseIndexWithIupacMap['v'] = Base.N.ordinal(); + } // Use a fixed random seed to allow for deterministic results when using random bases private static final Random randomNumberGen = new Random(47382911L); @@ -96,10 +121,10 @@ public class BaseUtils { } public static boolean isTransition(byte base1, byte base2) { - int b1 = simpleBaseToBaseIndex(base1); - int b2 = 
simpleBaseToBaseIndex(base2); - return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 || - b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1; + final int b1 = simpleBaseToBaseIndex(base1); + final int b2 = simpleBaseToBaseIndex(base2); + return b1 == Base.A.ordinal() && b2 == Base.G.ordinal() || b1 == Base.G.ordinal() && b2 == Base.A.ordinal() || + b1 == Base.C.ordinal() && b2 == Base.T.ordinal() || b1 == Base.T.ordinal() && b2 == Base.C.ordinal(); } public static boolean isTransversion(byte base1, byte base2) { @@ -141,6 +166,19 @@ public class BaseUtils { return base >= 'A' && base <= 'Z'; } + public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase) { + final int length = bases.length; + for ( int i = 0; i < length; i++ ) { + final int baseIndex = baseIndexWithIupacMap[bases[i]]; + if ( baseIndex == Base.N.ordinal() ) { + bases[i] = 'N'; + } else if ( errorOnBadReferenceBase && baseIndex == -1 ) { + throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); + } + } + return bases; + } + /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -231,10 +269,10 @@ public class BaseUtils { switch (base) { case 'd': case 'D': - return DELETION_INDEX; + return Base.D.ordinal(); case 'n': case 'N': - return NO_CALL_INDEX; + return Base.N.ordinal(); default: return simpleBaseToBaseIndex(base); diff --git a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java index 372d13a7a..4f918f718 100644 --- a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java @@ -50,6 +50,21 @@ public class BaseUtilsUnitTest extends BaseTest { Assert.assertTrue(MathUtils.compareDoubles(fraction, expected) == 0); } + @Test + public void testConvertIUPACtoN() { + + 
checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false), new byte[]{'A', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false), new byte[]{'N', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false), new byte[]{'A', 'N', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false), new byte[]{'A', 'A', 'N'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false), new byte[]{'N', 'N', 'N'}); + } + + private void checkBytesAreEqual(final byte[] b1, final byte[] b2) { + for ( int i = 0; i < b1.length; i++ ) + Assert.assertEquals(b1[i], b2[i]); + } + @Test public void testTransitionTransversion() { logger.warn("Executing testTransitionTransversion"); diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java index 49720d1f6..03d6f457f 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -154,9 +154,9 @@ public class GenotypeLikelihoodsUnitTest { public void testGetQualFromLikelihoodsMultiAllelic() { GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); - Allele ref = Allele.create(BaseUtils.A,true); - Allele alt1 = Allele.create(BaseUtils.C); - Allele alt2 = Allele.create(BaseUtils.T); + Allele ref = Allele.create(BaseUtils.Base.A.base,true); + Allele alt1 = Allele.create(BaseUtils.Base.C.base); + Allele alt2 = Allele.create(BaseUtils.Base.T.base); List allAlleles = Arrays.asList(ref,alt1,alt2); List gtAlleles = Arrays.asList(alt1,alt2); GenotypeBuilder gtBuilder = new GenotypeBuilder(); From 445735a4a53f22a5ddd214b95f8e0f4eed8bd593 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 11:10:13 
-0500 Subject: [PATCH 17/34] There was no reason to be sharing the Haplotype infrastructure between the HaplotypeCaller and the HaplotypeScore annotation since they were really looking for different things. Separated them out, adding efficiencies for the HaplotypeScore version. --- .../walkers/annotator/HaplotypeScore.java | 57 +++++++++++++++---- .../SimpleDeBruijnAssembler.java | 2 +- .../broadinstitute/sting/utils/Haplotype.java | 49 ++-------------- 3 files changed, 52 insertions(+), 56 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index fe4075117..af6304297 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.utils.BaseUtils; -import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -217,14 +216,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); List hlist = new ArrayList(); - hlist.add(new Haplotype(haplotype1.getBases(), 60)); + hlist.add(new Haplotype(haplotype1.getBases(), (byte)60)); for (int k = 1; k < haplotypesToCompute; k++) { Haplotype haplotype2 = consensusHaplotypeQueue.poll(); if (haplotype2 == null) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found - hlist.add(new 
Haplotype(haplotype2.getBases(), 20)); + hlist.add(new Haplotype(haplotype2.getBases(), (byte)20)); } return hlist; } else @@ -236,8 +235,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final byte[] haplotypeBases = new byte[contextSize]; Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD); - final double[] baseQualities = new double[contextSize]; - Arrays.fill(baseQualities, 0.0); + final byte[] baseQualities = new byte[contextSize]; + Arrays.fill(baseQualities, (byte)0); byte[] readBases = read.getReadBases(); readBases = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readBases); // Adjust the read bases based on the Cigar string @@ -267,7 +266,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot readQuals[baseOffset] = (byte) 0; } // quals less than 5 are used as codes and don't have actual probabilistic meaning behind them haplotypeBases[i] = readBases[baseOffset]; - baseQualities[i] = (double) readQuals[baseOffset]; + baseQualities[i] = readQuals[baseOffset]; } return new Haplotype(haplotypeBases, baseQualities); @@ -286,10 +285,10 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final int length = a.length; final byte[] consensusChars = new byte[length]; - final double[] consensusQuals = new double[length]; + final byte[] consensusQuals = new byte[length]; - final double[] qualsA = haplotypeA.getQuals(); - final double[] qualsB = haplotypeB.getQuals(); + final byte[] qualsA = haplotypeA.getQuals(); + final byte[] qualsB = haplotypeB.getQuals(); for (int i = 0; i < length; i++) { chA = a[i]; @@ -300,7 +299,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if ((chA == wc) && (chB == wc)) { consensusChars[i] = wc; - consensusQuals[i] = 0.0; + consensusQuals[i] = 0; } else if ((chA == wc)) { consensusChars[i] = chB; consensusQuals[i] = qualsB[i]; @@ -309,7 +308,7 @@ public class HaplotypeScore extends 
InfoFieldAnnotation implements StandardAnnot consensusQuals[i] = qualsA[i]; } else { consensusChars[i] = chA; - consensusQuals[i] = qualsA[i] + qualsB[i]; + consensusQuals[i] = (byte)((int)qualsA[i] + (int)qualsB[i]); } } @@ -433,7 +432,6 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } - public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } @@ -441,4 +439,39 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); } + + private static class Haplotype { + private final byte[] bases; + private final byte[] quals; + private int qualitySum = -1; + + public Haplotype( final byte[] bases, final byte[] quals ) { + this.bases = bases; + this.quals = quals; + } + + public Haplotype( final byte[] bases, final byte qual ) { + this.bases = bases; + quals = new byte[bases.length]; + Arrays.fill(quals, qual); + } + + public double getQualitySum() { + if ( qualitySum == -1 ) { + qualitySum = 0; + for ( final byte qual : quals ) { + qualitySum += (int)qual; + } + } + return qualitySum; + } + + public byte[] getQuals() { + return quals.clone(); + } + + public byte[] getBases() { + return bases.clone(); + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index e1a94eee7..e16994fa4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -338,7 +338,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final DefaultDirectedGraph graph : graphs ) { for ( 
final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { - final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); + final Haplotype h = new Haplotype( path.getBases( graph ) ); if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ) ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 8c40b9972..66aed1173 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -37,8 +37,8 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; import java.util.*; -public class Haplotype extends Allele { - protected final double[] quals; +public class Haplotype extends Allele { + private GenomeLoc genomeLocation = null; private HashMap eventMap = null; private Cigar cigar; @@ -51,49 +51,23 @@ public class Haplotype extends Allele { * Main constructor * * @param bases bases - * @param quals quals * @param isRef is reference allele? 
*/ - public Haplotype( final byte[] bases, final double[] quals, final boolean isRef ) { - super(bases.clone(), isRef); - this.quals = quals.clone(); - } - - /** - * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual - * - * @param bases bases - * @param qual qual - */ - public Haplotype( final byte[] bases, final int qual, final boolean isRef ) { - super(bases.clone(), isRef); - quals = new double[bases.length]; - Arrays.fill(quals, (double)qual); - } - - public Haplotype( final byte[] bases, final int qual ) { - this(bases, qual, false); - } - public Haplotype( final byte[] bases, final boolean isRef ) { - this(bases, 0, isRef); - } - - public Haplotype( final byte[] bases, final double[] quals ) { - this(bases, quals, false); + super(bases.clone(), isRef); } public Haplotype( final byte[] bases ) { - this(bases, 0, false); + this(bases, false); } protected Haplotype( final byte[] bases, final Event artificialEvent ) { - this(bases, 0, false); + this(bases, false); this.artificialEvent = artificialEvent; } public Haplotype( final byte[] bases, final GenomeLoc loc ) { - this(bases, 0, false); + this(bases, false); this.genomeLocation = loc; } @@ -110,22 +84,11 @@ public class Haplotype extends Allele { this.eventMap = eventMap; } - public double getQualitySum() { - double s = 0; - for (int k=0; k < quals.length; k++) { - s += quals[k]; - } - return s; - } - @Override public String toString() { return getDisplayString(); } - public double[] getQuals() { - return quals.clone(); - } public byte[] getBases() { return super.getBases().clone(); } From 4ffb43079f020e1a1ed3dc2fffc02a1bf660e01d Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Wed, 16 Jan 2013 12:43:15 -0500 Subject: [PATCH 18/34] Re-committing the following changes from Dec 18: Refactored interval specific arguments out of GATKArgumentCollection into InvtervalArgumentCollection such that it can be used in other CommandLinePrograms. 
Updated SelectHeaders to print out full interval arguments. Added RemoteFile.createUrl(Date expiration) to enable creation of presigned URLs for download over http: or file:. --- .../walkers/variantutils/SelectHeaders.java | 51 +++++++++++--- .../IntervalArgumentCollection.java | 70 +++++++++++++++++++ .../sting/gatk/GenomeAnalysisEngine.java | 38 +--------- .../arguments/GATKArgumentCollection.java | 45 +----------- .../sting/utils/interval/IntervalUtils.java | 42 +++++++++++ .../broadinstitute/variant/vcf/VCFHeader.java | 4 ++ .../utils/interval/IntervalUtilsUnitTest.java | 4 +- .../sting/queue/util/RemoteFile.scala | 3 + 8 files changed, 166 insertions(+), 91 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java index 81a17b6ae..38fa060cc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -57,6 +57,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -180,18 +182,47 @@ public class SelectHeaders extends RodWalker implements TreeRe headerLines = new LinkedHashSet(getSelectedHeaders(headerLines)); // Optionally add in the intervals. 
- if (includeIntervals && getToolkit().getArguments().intervals != null) { - for (IntervalBinding intervalBinding : getToolkit().getArguments().intervals) { - String source = intervalBinding.getSource(); - if (source == null) - continue; - File file = new File(source); - if (file.exists()) { - headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); - } else { - headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + if (includeIntervals) { + IntervalArgumentCollection intervalArguments = getToolkit().getArguments().intervalArguments; + if (intervalArguments.intervals != null) { + for (IntervalBinding intervalBinding : intervalArguments.intervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + } } } + + if (intervalArguments.excludeIntervals != null) { + for (IntervalBinding intervalBinding : intervalArguments.excludeIntervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, source)); + } + } + } + + if (intervalArguments.intervalMerging != IntervalMergingRule.ALL) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_MERGING_KEY, String.valueOf(intervalArguments.intervalMerging))); + } + + if (intervalArguments.intervalSetRule != IntervalSetRule.UNION) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_SET_RULE_KEY, String.valueOf(intervalArguments.intervalSetRule))); + } + + if (intervalArguments.intervalPadding != 0) { + 
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_PADDING_KEY, String.valueOf(intervalArguments.intervalPadding))); + } } TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java new file mode 100644 index 000000000..3f76ae652 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; + +import java.util.List; + +public class IntervalArgumentCollection { + /** + * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. + * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). + * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). + * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. + */ + @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> intervals = null; + + /** + * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. + * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). + * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). + */ + @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> excludeIntervals = null; + + /** + * How should the intervals specified by multiple -L or -XL arguments be combined? 
Using this argument one can, for example, traverse over all of the positions + * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). + */ + @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) + public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; + + /** + * Should abutting (but not overlapping) intervals be treated as separate intervals? + */ + @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) + public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; + + /** + * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. + */ + @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) + public int intervalPadding = 0; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f9d6955c0..9b801be7d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -55,7 +55,6 @@ import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -361,7 +360,6 @@ 
public class GenomeAnalysisEngine { * Returns a list of active, initialized read transformers * * @param walker the walker we need to apply read transformers too - * @return a non-null list of read transformers */ public void initializeReadTransformers(final Walker walker) { final List activeTransformers = new ArrayList(); @@ -672,41 +670,7 @@ public class GenomeAnalysisEngine { * Setup the intervals to be processed */ protected void initializeIntervals() { - // return if no interval arguments at all - if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) - return; - - // Note that the use of '-L all' is no longer supported. - - // if include argument isn't given, create new set of all possible intervals - - final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( - this.referenceDataSource, - argCollection.intervals, - argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, - argCollection.excludeIntervals); - - final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); - final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); - - // if no exclude arguments, can return parseIntervalArguments directly - if ( excludeSortedSet == null ) - intervals = includeSortedSet; - - // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets - else { - intervals = includeSortedSet.subtractRegions(excludeSortedSet); - - // logging messages only printed when exclude (-XL) arguments are given - final long toPruneSize = includeSortedSet.coveredSize(); - final long toExcludeSize = excludeSortedSet.coveredSize(); - final long intervalSize = intervals.coveredSize(); - logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); - logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", - toPruneSize - intervalSize, (toPruneSize - 
intervalSize) / (0.01 * toPruneSize))); - } - - logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize())); + intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index ab09064dd..62ca38ad2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -26,11 +26,7 @@ package org.broadinstitute.sting.gatk.arguments; import net.sf.samtools.SAMFileReader; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.IntervalBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; @@ -38,8 +34,6 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; import java.io.File; import java.util.ArrayList; @@ -100,41 +94,8 @@ public class GATKArgumentCollection { @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); - /** - * Using this option one can instruct the GATK engine to 
traverse over only part of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). - * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). - * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. - */ - @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> intervals = null; - - /** - * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). - * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). - */ - @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> excludeIntervals = null; - - /** - * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions - * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). 
- */ - @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) - public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; - - /** - * Should abutting (but not overlapping) intervals be treated as separate intervals? - */ - @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) - public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; - - /** - * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. - */ - @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) - public int intervalPadding = 0; + @ArgumentCollection + public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index c647a7b80..7374dda14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -32,6 +32,7 @@ import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.IntervalArgumentCollection; import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; @@ -534,6 
+535,47 @@ public class IntervalUtils { } } + public static GenomeLocSortedSet parseIntervalArguments(final ReferenceDataSource referenceDataSource, IntervalArgumentCollection argCollection) { + GenomeLocSortedSet intervals = null; + + // return if no interval arguments at all + if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) + return intervals; + + // Note that the use of '-L all' is no longer supported. + + // if include argument isn't given, create new set of all possible intervals + + final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( + referenceDataSource, + argCollection.intervals, + argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, + argCollection.excludeIntervals); + + final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); + + // if no exclude arguments, can return parseIntervalArguments directly + if ( excludeSortedSet == null ) + intervals = includeSortedSet; + + // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets + else { + intervals = includeSortedSet.subtractRegions(excludeSortedSet); + + // logging messages only printed when exclude (-XL) arguments are given + final long toPruneSize = includeSortedSet.coveredSize(); + final long toExcludeSize = excludeSortedSet.coveredSize(); + final long intervalSize = intervals.coveredSize(); + logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); + logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", + toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); + } + + logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize())); + return intervals; + } + public static Pair parseIntervalBindingsPair( final 
ReferenceDataSource referenceDataSource, final List> intervals, diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java index 583a01417..9bdb86a48 100644 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java @@ -73,6 +73,10 @@ public class VCFHeader { public static final String REFERENCE_KEY = "reference"; public static final String CONTIG_KEY = "contig"; public static final String INTERVALS_KEY = "intervals"; + public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals"; + public static final String INTERVAL_MERGING_KEY = "interval_merging"; + public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule"; + public static final String INTERVAL_PADDING_KEY = "interval_padding"; // were the input samples sorted originally (or are we sorting them)? private boolean samplesWereAlreadySorted = true; diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 35f9d4137..2be2745de 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1068,7 +1068,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List> intervalArgs = new ArrayList>(1); intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); } @Test(expectedExceptions=UserException.class, 
dataProvider="invalidIntervalTestData") @@ -1081,7 +1081,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List> intervalArgs = new ArrayList>(1); intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); } private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala index 28be82136..23a99b586 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala @@ -27,6 +27,8 @@ package org.broadinstitute.sting.queue.util import java.io.File import org.broadinstitute.sting.utils.io.FileExtension +import java.util.Date +import java.net.URL /** * An extension of java.io.File that can be pulled from or pushed to a remote location. @@ -35,5 +37,6 @@ trait RemoteFile extends File with FileExtension { def pullToLocal() def pushToRemote() def deleteRemote() + def createUrl(expiration: Date): URL def remoteDescription: String } From d18dbcbac103c0ce8f0480e04efcdd00a50f3394 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 14:55:33 -0500 Subject: [PATCH 19/34] Added tests for changing IUPAC bases to Ns, for failing on bad ref bases, and for the HaplotypeCaller not failing when running over a region with an IUPAC base. Out of curiosity, why does Picard's IndexedFastaSequenceFile allow one to query for start position 0? 
When doing so, that base is a line feed (-1 offset to the first base in the contig) which is an illegal base (and which caused me no end of trouble)... --- .../HaplotypeCallerIntegrationTest.java | 9 +++++ .../CachingIndexedFastaSequenceFile.java | 14 +++---- .../variant/utils/BaseUtils.java | 6 ++- ...chingIndexedFastaSequenceFileUnitTest.java | 39 +++++++++++++++++-- 4 files changed, 55 insertions(+), 13 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index d95da6b7f..6183fc411 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -50,6 +50,7 @@ import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; import java.util.Arrays; +import java.util.Collections; public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String REF = b37KGReference; @@ -156,6 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { executeTest("HCTestStructuralIndels: ", spec); } + @Test + public void HCTestDoesNotFailOnBadRefBase() { + // don't care about the output - just want to make sure it doesn't fail + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); + executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing reduced reads diff --git 
a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 88eaa8910..a749625cd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -125,13 +125,13 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ - public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException { + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) throws FileNotFoundException { super(fasta); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; - preserveIUPAC = false; + this.preserveIUPAC = preserveIUPAC; } /** @@ -168,7 +168,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException { - this(fasta, DEFAULT_CACHE_SIZE, preserveCase); + this(fasta, DEFAULT_CACHE_SIZE, preserveCase, false); } /** @@ -181,7 +181,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must 
be >= 0 */ public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { - this(fasta, cacheSize, false); + this(fasta, cacheSize, false, false); } /** @@ -261,7 +261,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * all of the bases in the ReferenceSequence returned by this method will be upper cased. */ @Override - public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) { + public ReferenceSequence getSubsequenceAt( final String contig, long start, final long stop ) { final ReferenceSequence result; final Cache myCache = cache.get(); @@ -269,7 +269,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); - if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1); } else { // todo -- potential optimization is to check if contig.name == contig, as this in general will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); @@ -285,7 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { // convert all of the bases in the sequence to upper case if we aren't preserving cases if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); - if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true); + if ( ! 
preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true, myCache.start == 0); } else { cacheHits++; } diff --git a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java index 7a37e8de5..a6ac2ca53 100644 --- a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java @@ -166,9 +166,11 @@ public class BaseUtils { return base >= 'A' && base <= 'Z'; } - public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase) { + public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase, final boolean ignoreConversionOfFirstByte) { final int length = bases.length; - for ( int i = 0; i < length; i++ ) { + final int start = ignoreConversionOfFirstByte ? 1 : 0; + + for ( int i = start; i < length; i++ ) { final int baseIndex = baseIndexWithIupacMap[bases[i]]; if ( baseIndex == Base.N.ordinal() ) { bases[i] = 'N'; diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index c67e52f2e..0c1b5b069 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -32,8 +32,10 @@ package org.broadinstitute.sting.utils.fasta; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import org.apache.commons.lang.StringUtils; import org.apache.log4j.Priority; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.DataProvider; import 
org.testng.annotations.Test; @@ -49,7 +51,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; /** - * Basic unit test for GenomeLoc + * Basic unit test for CachingIndexedFastaSequenceFile */ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta"); @@ -80,7 +82,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0); logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", @@ -122,7 +124,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -167,7 +169,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @Test(dataProvider = "ParallelFastaTest", enabled = true && ! 
DEBUG, timeOut = 60000) public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false); logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt)); for ( int iterations = 0; iterations < 1; iterations++ ) { @@ -230,4 +232,33 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { else return new String(reader.getSubsequenceAt(contig, start, stop).getBases()); } + + @Test(enabled = true) + public void testIupacChanges() throws FileNotFoundException, InterruptedException { + final String testFasta = privateTestDir + "iupacFASTA.fasta"; + final CachingIndexedFastaSequenceFile iupacPreserving = new CachingIndexedFastaSequenceFile(new File(testFasta), CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true); + final CachingIndexedFastaSequenceFile makeNs = new CachingIndexedFastaSequenceFile(new File(testFasta)); + + int preservingNs = 0; + int changingNs = 0; + for ( SAMSequenceRecord contig : iupacPreserving.getSequenceDictionary().getSequences() ) { + final String sPreserving = fetchBaseString(iupacPreserving, contig.getSequenceName(), 0, 15000); + preservingNs += StringUtils.countMatches(sPreserving, "N"); + + final String sChanging = fetchBaseString(makeNs, contig.getSequenceName(), 0, 15000); + changingNs += StringUtils.countMatches(sChanging, "N"); + } + + Assert.assertEquals(changingNs, preservingNs + 4); + } + + @Test(enabled = true, expectedExceptions = {UserException.class}) + public void testFailOnBadBase() throws FileNotFoundException, InterruptedException { + final String testFasta = 
privateTestDir + "problematicFASTA.fasta"; + final CachingIndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(new File(testFasta)); + + for ( SAMSequenceRecord contig : fasta.getSequenceDictionary().getSequences() ) { + fetchBaseString(fasta, contig.getSequenceName(), -1, -1); + } + } } From ec1cfe67329c43afe916be4816af3b5af23f27c3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 15:05:49 -0500 Subject: [PATCH 20/34] Oops, forgot to add 1 of my files --- .../variant/utils/BaseUtilsUnitTest.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java index 4f918f718..37627204f 100644 --- a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java @@ -53,11 +53,11 @@ public class BaseUtilsUnitTest extends BaseTest { @Test public void testConvertIUPACtoN() { - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false), new byte[]{'A', 'A', 'A'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false), new byte[]{'N', 'A', 'A'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false), new byte[]{'A', 'N', 'A'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false), new byte[]{'A', 'A', 'N'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false), new byte[]{'N', 'N', 'N'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false, false), new byte[]{'A', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false, false), new byte[]{'N', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false, false), new byte[]{'A', 'N', 'A'}); + 
checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false, false), new byte[]{'A', 'A', 'N'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false, false), new byte[]{'N', 'N', 'N'}); } private void checkBytesAreEqual(final byte[] b1, final byte[] b2) { From 4d0e7b50ec967897a5400befb329177bb0256c69 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 15 Jan 2013 16:45:45 -0500 Subject: [PATCH 21/34] ArtificialBAMBuilder utility class for creating streams of GATKSAMRecords with a variety of properties -- Allows us to make a stream of reads or an index BAM file with read having the following properties (coming from n samples, of fixed read length and aligned to the genome with M operator, having N reads per alignment start, skipping N bases between each alignment start, starting at a given alignment start) -- This stream can be handed back to the caller immediately, or written to an indexed BAM file -- Update LocusIteratorByStateUnitTest to use this functionality (which was refactored from LIBS unit tests and ArtificialSAMUtils) --- .../sting/utils/sam/ArtificialBAMBuilder.java | 176 ++++++++++++++++++ .../sting/utils/sam/ArtificialSAMUtils.java | 29 --- .../LocusIteratorByStateUnitTest.java | 22 +-- .../sam/ArtificialBAMBuilderUnitTest.java | 122 ++++++++++++ 4 files changed, 305 insertions(+), 44 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java new file mode 100644 index 000000000..651d759e0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, 
free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.NGSPlatform; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Easy to use creator of artificial BAM files for testing + * + * Allows us to make a stream of reads or an index BAM file with read having the following properties + * + * - coming from n samples + * - of fixed read length and aligned to the genome with M operator + * - having N reads per alignment start + * - skipping N bases between each alignment start + * - starting at a given alignment start + * + * User: depristo + * Date: 1/15/13 + * Time: 9:22 AM + */ +public class ArtificialBAMBuilder { + public final static int BAM_SHARD_SIZE = 16384; + + final int nReadsPerLocus; + final int nLoci; + + int skipNLoci = 0; + int alignmentStart = 1; + int readLength = 10; + private final ArrayList samples = new ArrayList(); + + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); + { + factory.setCreateIndex(true); + } + + SAMFileHeader header; + + public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + this.nReadsPerLocus = nReadsPerLocus; + this.nLoci = nLoci; + createAndSetHeader(1); + } + + public ArtificialBAMBuilder createAndSetHeader(final int nSamples) { + this.header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + samples.clear(); + + for ( int i = 0; i < nSamples; i++ ) { + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); + final String sample = "sample" + i; + samples.add(sample); + rg.setSample(sample); + rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); + header.addReadGroup(rg); + } + + return this; + } + + public List getSamples() { + return samples; + } + + /** + * Create a read stream based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. 
+ * + * Useful for testing things like LocusIteratorBystate + * + * @return a ordered list of reads + */ + public List makeReads() { + final String baseName = "read"; + List reads = new ArrayList(nReadsPerLocus*nLoci); + for ( int locusI = 0; locusI < nLoci; locusI++) { + final int locus = locusI * (skipNLoci + 1); + for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { + for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { + final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, alignmentStart + locus, readLength); + read.setReadGroup(new GATKSAMReadGroupRecord(rg)); + reads.add(read); + } + } + } + + return reads; + } + + /** + * Make an indexed BAM file contains the reads in the builder, marking it for deleteOnExit() + * @return the BAM file + */ + public File makeTemporarilyBAMFile() { + try { + final File file = File.createTempFile("tempBAM", ".bam"); + file.deleteOnExit(); + return makeBAMFile(file); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + /** + * Write the reads from this builder to output, creating an index as well + * @param output the output BAM file we want to use + * @return + */ + public File makeBAMFile(final File output) { + final SAMFileWriter writer = factory.makeBAMWriter(header, true, output, 0); + for ( final GATKSAMRecord read : makeReads() ) + writer.addAlignment(read); + writer.close(); + return output; + } + + public int getnReadsPerLocus() { return nReadsPerLocus; } + public int getnLoci() { return nLoci; } + public int getSkipNLoci() { return skipNLoci; } + public ArtificialBAMBuilder setSkipNLoci(int skipNLoci) { this.skipNLoci = skipNLoci; return this; } + public int getAlignmentStart() { return alignmentStart; } + public ArtificialBAMBuilder setAlignmentStart(int alignmentStart) { this.alignmentStart = alignmentStart; return this; } + public int getReadLength() { 
return readLength; } + public ArtificialBAMBuilder setReadLength(int readLength) { this.readLength = readLength; return this; } + public SAMFileHeader getHeader() { return header; } + public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; } + + public int getNSamples() { return samples.size(); } + + public int expectedNumberOfReads() { + return nLoci * nReadsPerLocus * header.getReadGroups().size(); + } + + @Override + public String toString() { + return "ArtificialBAMBuilder{" + + "samples=" + samples + + ", readLength=" + readLength + + ", alignmentStart=" + alignmentStart + + ", skipNLoci=" + skipNLoci + + ", nLoci=" + nLoci + + ", nReadsPerLocus=" + nReadsPerLocus + + '}'; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 4af6555d9..0f5d6a2f7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -327,35 +327,6 @@ public class ArtificialSAMUtils { return stack; } - /** - * Create a read stream based on the parameters. The cigar string for each - * read will be *M, where * is the length of the read. 
- * - * Useful for testing things like LocusIteratorBystate - * - * @return a collection of stackSize reads all sharing the above properties - */ - public static List createReadStream( final int nReadsPerLocus, - final int nLoci, - final SAMFileHeader header, - final int alignmentStart, - final int length ) { - final String baseName = "read"; - List reads = new ArrayList(nReadsPerLocus*nLoci); - for ( int locus = 0; locus < nLoci; locus++ ) { - for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { - for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { - final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); - final GATKSAMRecord read = createArtificialRead(header, readName, 0, alignmentStart + locus, length); - read.setReadGroup(new GATKSAMReadGroupRecord(rg)); - reads.add(read); - } - } - } - - return reads; - } - /** * create an iterator containing the specified read piles * diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 37494903c..2f984165e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -28,17 +28,16 @@ package org.broadinstitute.sting.utils.locusiterator; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.NGSPlatform; import 
org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -447,26 +446,19 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { //logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); final int readLength = 10; - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); - final List samples = new ArrayList(nSamples); - for ( int i = 0; i < nSamples; i++ ) { - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); - final String sample = "sample" + i; - samples.add(sample); - rg.setSample(sample); - rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); - header.addReadGroup(rg); - } - final boolean downsample = downsampleTo != -1; final DownsamplingMethod downsampler = downsample ? 
new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) : new DownsamplingMethod(DownsampleType.NONE, null, null, false); - final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1); + + final List reads = bamBuilder.makeReads(); li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), createTestReadProperties(downsampler, keepReads), genomeLocParser, - samples); + bamBuilder.getSamples()); final Set seenSoFar = new HashSet(); final Set keptReads = new HashSet(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java new file mode 100644 index 000000000..cf3c97b34 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.apache.commons.collections.IteratorUtils; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 1/15/13 + * Time: 3:49 PM + * To change this template use File | Settings | File Templates. + */ +public class ArtificialBAMBuilderUnitTest extends BaseTest { + @DataProvider(name = "CombinatorialARTTilingProvider") + public Object[][] makeCombinatorialARTTilingProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + for ( final int readLength : Arrays.asList(10, 20) ) { + for ( final int skips : Arrays.asList(0, 1, 10) ) { + for ( final int start : starts ) { + for ( final int nSamples : Arrays.asList(1, 2) ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) { + for ( final int nLoci : Arrays.asList(10, 100, 1000) ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + bamBuilder.createAndSetHeader(nSamples); + tests.add(new Object[]{bamBuilder, readLength, skips, start, 
nSamples, nReadsPerLocus, nLoci}); + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CombinatorialARTTilingProvider") + public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) { + Assert.assertEquals(bamBuilder.getReadLength(), readLength); + Assert.assertEquals(bamBuilder.getSkipNLoci(), skips); + Assert.assertEquals(bamBuilder.getAlignmentStart(), start); + Assert.assertEquals(bamBuilder.getNSamples(), nSamples); + Assert.assertEquals(bamBuilder.getnReadsPerLocus(), nReadsPerLocus); + Assert.assertEquals(bamBuilder.getnLoci(), nLoci); + + final List reads = bamBuilder.makeReads(); + Assert.assertEquals(reads.size(), bamBuilder.expectedNumberOfReads()); + for ( final GATKSAMRecord read : reads ) { + assertGoodRead(read, bamBuilder); + } + + final File bam = bamBuilder.makeTemporarilyBAMFile(); + final SAMFileReader reader = new SAMFileReader(bam); + Assert.assertTrue(reader.hasIndex()); + final Iterator bamIt = reader.iterator(); + int nReadsFromBam = 0; + int lastStart = -1; + while ( bamIt.hasNext() ) { + final SAMRecord read = bamIt.next(); + assertGoodRead(read, bamBuilder); + nReadsFromBam++; + Assert.assertTrue(read.getAlignmentStart() >= lastStart); + lastStart = read.getAlignmentStart(); + } + Assert.assertEquals(nReadsFromBam, bamBuilder.expectedNumberOfReads()); + } + + private void assertGoodRead(final SAMRecord read, final ArtificialBAMBuilder bamBuilder) { + Assert.assertEquals(read.getReadLength(), bamBuilder.getReadLength()); + Assert.assertEquals(read.getReadBases().length, bamBuilder.getReadLength()); + Assert.assertEquals(read.getBaseQualities().length, bamBuilder.getReadLength()); + Assert.assertTrue(read.getAlignmentStart() >= bamBuilder.getAlignmentStart()); + Assert.assertNotNull(read.getReadGroup()); + } +} + + From ddcb33fcf81cd208bc8ad6e23aeb1eb49624ea07 Mon Sep 17 00:00:00 2001 From: Mark DePristo 
Date: Wed, 16 Jan 2013 12:09:36 -0500 Subject: [PATCH 22/34] Cache result of getLocation() in Shard so we don't perform this expensive calculation over and over --- .../sting/gatk/datasources/reads/Shard.java | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java index 2c03363ba..5b4c2afda 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java @@ -95,7 +95,10 @@ public abstract class Shard implements HasGenomeLocation { */ private final Map fileSpans; - + /** + * Lazy-calculated span of all of the genome locs in this shard + */ + private GenomeLoc spanningLocation = null; /** * Statistics about which reads in this shards were used and which were filtered away. @@ -148,27 +151,34 @@ public abstract class Shard implements HasGenomeLocation { /** * Returns the span of the genomeLocs comprising this shard - * @param - * @return + * @return a GenomeLoc that starts as the first position in getGenomeLocs() and stops at the stop of the last + * position in getGenomeLocs() */ public GenomeLoc getLocation() { - if ( getGenomeLocs() == null ) - return GenomeLoc.WHOLE_GENOME; + if ( spanningLocation == null ) { + if ( getGenomeLocs() == null ) + spanningLocation = GenomeLoc.WHOLE_GENOME; + else if ( getGenomeLocs().size() == 1 ) { + spanningLocation = getGenomeLocs().get(0); + } else { + int start = Integer.MAX_VALUE; + int stop = Integer.MIN_VALUE; + String contig = null; - int start = Integer.MAX_VALUE; - int stop = Integer.MIN_VALUE; - String contig = null; + for ( GenomeLoc loc : getGenomeLocs() ) { + if ( GenomeLoc.isUnmapped(loc) ) + // special case the unmapped region marker, just abort out + return loc; + contig = loc.getContig(); + if ( loc.getStart() < start ) start = 
loc.getStart(); + if ( loc.getStop() > stop ) stop = loc.getStop(); + } - for ( GenomeLoc loc : getGenomeLocs() ) { - if ( GenomeLoc.isUnmapped(loc) ) - // special case the unmapped region marker, just abort out - return loc; - contig = loc.getContig(); - if ( loc.getStart() < start ) start = loc.getStart(); - if ( loc.getStop() > stop ) stop = loc.getStop(); + spanningLocation = parser.createGenomeLoc(contig, start, stop); + } } - return parser.createGenomeLoc(contig, start, stop); + return spanningLocation; } From 2a42b47e4a19c17ae3dad64a980e856229875295 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 16 Jan 2013 15:29:26 -0500 Subject: [PATCH 23/34] Massive expansion of ActiveRegionTraversal unit tests, resulting in several bugfixes to ART -- UnitTests now include combinational tiling of reads within and spanning shard boundaries -- ART now properly handles shard transitions, and does so efficiently without requiring hash sets or other collections of reads -- Updating HC and CountReadsInActiveRegions integration tests --- .../HaplotypeCallerIntegrationTest.java | 12 +- .../traversals/TraverseActiveRegions.java | 226 +++++++++++++----- .../sting/utils/sam/ArtificialBAMBuilder.java | 39 ++- .../traversals/DummyActiveRegionWalker.java | 104 ++++++++ .../TraverseActiveRegionsUnitTest.java | 207 +++++++++++----- .../LocusIteratorByStateUnitTest.java | 28 ++- .../sam/ArtificialBAMBuilderUnitTest.java | 6 +- 7 files changed, 482 insertions(+), 140 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6183fc411..e86834a4a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "1e2671557b01ad0497557097282965fc"); + HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "2bd237a7e1e63eebe755dbe7963e430a"); + HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "a938cdd7262968597fc8eb6c1c0a69f1"); + "c679ae7f04bdfda896b5c046d35e043c"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "d590c8d6d5e58d685401b65a23846893"); + "1a034b7eb572e1b6f659d6e5d57b3e76"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "50a26224b9e863ee47a0619eb54a0323"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void 
HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4439496472eb1e2f5c91b30ba525be37")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 03aaf95f2..a7e4d7649 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.traversals; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; @@ -47,24 +48,36 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; /** - * Created with IntelliJ IDEA. + * Implement active region traversal + * * User: depristo * Date: 1/9/13 * Time: 4:45 PM - * To change this template use File | Settings | File Templates. + * + * Live region: + * + * The ART tracks a thing called the live region. The live region is a position on a specific contig + * of the alignment start of the last read we processed during this traversal. Because the + * read stream is sorted, future reads must occur in the live region. Therefore the dead region + * (everything to the left of the live boundary) cannot have any more read data. 
The live / dead + * regions are used to decide when we can safely call map on active regions, as only active regions + * contained completely within the dead region (including extensions) have a complete set of read data + * in the collected read list. All of the data related to the live region is captured by the local + * variable spanOfLastReadSeen + * */ public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); protected final static boolean DEBUG = false; // set by the tranversal private int activeRegionExtension = -1; private int maxRegionSize = -1; - /** - * our log, which we want to capture anything from this class - */ - protected final static Logger logger = Logger.getLogger(TraversalEngine.class); - protected final LinkedList workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList(); + + private LinkedList myReads = new LinkedList(); + private GenomeLoc spanOfLastReadSeen = null; protected int getActiveRegionExtension() { return activeRegionExtension; @@ -79,6 +92,11 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine myReads = new LinkedList(); - private Shard lastShard = null; + /** + * Did read appear in the last shard? + * + * When we transition across shard boundaries we see duplicate reads because + * each shard contains the reads that *overlap* the shard. So if we just finished + * shard 1-1000 and are now in 1001-2000 we'll see duplicate reads from 1001 + * that overlapped 1-1000. This function tests read to determine if we would have + * seen it before by asking if read.getAlignmentStart() is less than the + * stop position of the last seen read at the start of the traversal. 
The reason + * we need to use the location of the last read at the start of the traversal + * is that we update the lastRead during the traversal, and we only want to filter + * out reads whose start is before the last read of the previous shard, not the + * current shard. + * + * @param locOfLastReadAtTraversalStart the location of the last read seen at the start of the traversal + * @param read the read we want to test if it's already been seen in the last shard + * @return true if read would have appeared in the last shard, false otherwise + */ + protected boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) { + if ( locOfLastReadAtTraversalStart == null ) + // we're in the first shard, so obviously the answer is no + return false; + else { + // otherwise check to see if the alignment occurred in the previous shard + return read.getAlignmentStart() <= locOfLastReadAtTraversalStart.getStart() + // we're on the same contig + && read.getReferenceIndex() == locOfLastReadAtTraversalStart.getContigIndex(); + } + + } + + // ------------------------------------------------------------------------------------- + // + // Actual traverse function + // + // ------------------------------------------------------------------------------------- + + /** + * Is the current shard on a new contig w.r.t. the previous shard? 
+ * @param currentShard the current shard we are processing + * @return true if the last shard was on a different contig than the current shard + */ + private boolean onNewContig(final Shard currentShard) { + return spanOfLastSeenRead() != null + && spanOfLastSeenRead().getContigIndex() != currentShard.getLocation().getContigIndex(); + } @Override public T traverse( final ActiveRegionWalker walker, final LocusShardDataProvider dataProvider, T sum) { - if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final HashSet maybeDuplicatedReads = new HashSet(); - // TODO -- there's got to be a better way to know this - if ( lastShard != dataProvider.getShard() ) { - maybeDuplicatedReads.addAll(myReads); - logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); - if ( DEBUG ) logger.warn("Clearing myReads"); - } - lastShard = dataProvider.getShard(); + logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); final LocusView locusView = new AllLocusView(dataProvider); @@ -181,6 +234,12 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); for( final GATKSAMRecord read : reads ) { - notifyOfCurrentPosition(read); - // most of the time maybeDuplicatedReads is empty - // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the - // TODO -- potential list of duplicates we can clear the hashset - if ( ! 
maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { + if ( appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); } else { if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); - myReads.add((GATKSAMRecord)read); + rememberLastReadLocation(read); + myReads.add(read); } } @@ -257,28 +313,87 @@ public class TraverseActiveRegions extends TraversalEngine walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, true); } - protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { - if ( startOfLiveRegion == null ) - startOfLiveRegion = currentLocation; - else - startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); + // ------------------------------------------------------------------------------------- + // + // Functions to manage and interact with the live / dead zone + // + // ------------------------------------------------------------------------------------- + + /** + * Update the live region to reflect that the last read we've seen in the traversal is read + * + * Requires that sequential calls always be provided reads in coordinate sorted order + * + * @param read the last read we've seen during the traversal + */ + protected void rememberLastReadLocation(final GATKSAMRecord read) { + final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read); + if ( spanOfLastReadSeen == null ) + spanOfLastReadSeen = currentLocation; + else { + if ( currentLocation.isBefore(spanOfLastReadSeen) ) + throw new IllegalStateException("Updating last read seen in the traversal with read " + read + " with span " + currentLocation + " but this occurs before the previously seen read " + spanOfLastReadSeen); + spanOfLastReadSeen = currentLocation; + } } - protected GenomeLoc getStartOfLiveRegion() { - 
return startOfLiveRegion; + /** + * Get a GenomeLoc indicating the start (heading to the right) of the live ART region. + * @return the left-most position of the live region on the genome + */ + protected GenomeLoc spanOfLastSeenRead() { + return spanOfLastReadSeen; } - protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { - return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) - || ! region.onSameContig(getStartOfLiveRegion()); + /** + * Is the active region completely within the traversal's dead zone? + * + * @param region the region we want to test + * @return true if the extended location of region is completely within the current dead zone, false otherwise + */ + protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) { + return region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart() + || ! region.getExtendedLoc().onSameContig(spanOfLastSeenRead()); } + /** + * Is the read dead? That is, can it no longer be in any future active region, and therefore can be discarded? + * + * read: start |--------> stop ------ stop + extension + * region: start |-----------------| end + * + * Since the regions are coming in order, read could potentially be contained in a future interval if + * stop + activeRegionExtension >= end. If, on the other hand, stop + extension is < the end + * of this region, then we can discard it, since any future region could only include reads + * up to end + 1 - extension. + * + * Note that this function doesn't care about the dead zone. We're assuming that by + * actually calling this function with an active region that region is already in the dead zone, + * so checking that the read is in the dead zone doesn't make sense. 
+ * + * @param read the read we're testing + * @param activeRegion the current active region + * @return true if the read is dead, false other + */ + @Requires({"read != null", "activeRegion != null"}) + private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) { + return read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop(); + } + + // ------------------------------------------------------------------------------------- + // + // Functions to process active regions that are ready for map / reduce calls + // + // ------------------------------------------------------------------------------------- + private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { if( walker.activeRegionOutStream != null ) { writeActiveRegionsToStream(walker); @@ -292,11 +407,10 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker) { final Iterator liveReads = myReads.iterator(); while ( liveReads.hasNext() ) { @@ -325,7 +430,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, true); - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java index 651d759e0..f5018db8c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -25,7 +25,9 @@ package org.broadinstitute.sting.utils.sam; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.NGSPlatform; import java.io.File; @@ -51,6 +53,9 @@ import java.util.List; public 
class ArtificialBAMBuilder { public final static int BAM_SHARD_SIZE = 16384; + private final IndexedFastaSequenceFile reference; + private final GenomeLocParser parser; + final int nReadsPerLocus; final int nLoci; @@ -66,14 +71,39 @@ public class ArtificialBAMBuilder { SAMFileHeader header; - public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + public ArtificialBAMBuilder(final IndexedFastaSequenceFile reference, int nReadsPerLocus, int nLoci) { this.nReadsPerLocus = nReadsPerLocus; this.nLoci = nLoci; + + this.reference = reference; + this.parser = new GenomeLocParser(reference); createAndSetHeader(1); } + public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + this(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000).getSequenceDictionary(), nReadsPerLocus, nLoci); + } + + public ArtificialBAMBuilder(final SAMSequenceDictionary dict, int nReadsPerLocus, int nLoci) { + this.nReadsPerLocus = nReadsPerLocus; + this.nLoci = nLoci; + this.reference = null; + this.parser = new GenomeLocParser(dict); + createAndSetHeader(1); + } + + public IndexedFastaSequenceFile getReference() { + return reference; + } + + public GenomeLocParser getGenomeLocParser() { + return parser; + } + public ArtificialBAMBuilder createAndSetHeader(final int nSamples) { - this.header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + this.header = new SAMFileHeader(); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + header.setSequenceDictionary(parser.getContigs()); samples.clear(); for ( int i = 0; i < nSamples; i++ ) { @@ -156,6 +186,11 @@ public class ArtificialBAMBuilder { public SAMFileHeader getHeader() { return header; } public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; } + public int getAlignmentEnd() { + return alignmentStart + nLoci * (skipNLoci + 1) + readLength; + } + + public int getNSamples() { return samples.size(); } public int expectedNumberOfReads() { diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java new file mode 100644 index 000000000..bc1e1d7b0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; + +import java.util.*; + +/** + * ActiveRegionWalker for unit testing + * + * User: depristo + * Date: 1/15/13 + * Time: 1:28 PM + */ +class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + private EnumSet states = super.desiredReadStates(); + private GenomeLocSortedSet activeRegions = null; + + protected List isActiveCalls = new ArrayList(); + protected Map mappedActiveRegions = new LinkedHashMap(); + + public DummyActiveRegionWalker() { + this(1.0); + } + + public DummyActiveRegionWalker(double constProb) { + this.prob = constProb; + } + + public DummyActiveRegionWalker(EnumSet wantStates) { + this(1.0); + this.states = wantStates; + } + + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions) { + this(1.0); + this.activeRegions = activeRegions; + } + + public void setStates(EnumSet states) { + this.states = states; + } + + @Override + public EnumSet desiredReadStates() { + return states; + } + + @Override + public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); + final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? 
prob : 0.0; + return new ActivityProfileResult(ref.getLocus(), p); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index c4dadbcce..15d4eec2d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -30,33 +30,26 @@ import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.*; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -80,54 +73,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private final static boolean ENFORCE_CONTRACTS = false; private final static boolean DEBUG = false; - private class DummyActiveRegionWalker extends ActiveRegionWalker { - private final double prob; - private EnumSet states = super.desiredReadStates(); - - protected List isActiveCalls = new ArrayList(); - protected Map mappedActiveRegions = new HashMap(); - - public DummyActiveRegionWalker() { - this.prob = 1.0; - } - - public DummyActiveRegionWalker(double constProb) { - this.prob = constProb; - } - - public DummyActiveRegionWalker(EnumSet wantStates) { - this.prob = 1.0; - this.states = wantStates; - } - - @Override - public EnumSet desiredReadStates() { - return states; - } - - @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - isActiveCalls.add(ref.getLocus()); - return new 
ActivityProfileResult(ref.getLocus(), prob); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - @DataProvider(name = "TraversalEngineProvider") public Object[][] makeTraversals() { final List traversals = new LinkedList(); @@ -297,7 +242,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } } - @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + @Test(enabled = true, dataProvider = "TraversalEngineProvider") public void testPrimaryReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); @@ -340,7 +285,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { verifyReadMapping(region, "simple20"); } - @Test(enabled = true, dataProvider = "TraversalEngineProvider") + @Test(enabled = true && ! 
DEBUG, dataProvider = "TraversalEngineProvider") public void testNonPrimaryReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker( EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); @@ -456,7 +401,11 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) + return getActiveRegions(t, walker, intervals, testBAM); + } + + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final String bam) { + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) t.traverse(walker, dataProvider, 0); t.endTraversal(walker, 0); @@ -516,14 +465,15 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { record.setCigar(cigar); record.setReadString(new String(new char[len]).replace("\0", "A")); record.setBaseQualities(new byte[len]); + record.setReadGroup(new GATKSAMReadGroupRecord(header.getReadGroup("test"))); return record; } - private List createDataProviders(TraverseActiveRegions t, final Walker walker, List intervals, String bamFile) { + private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); - t.initialize(engine, walker); + traverseActiveRegions.initialize(engine, walker); Collection samFiles = new ArrayList(); SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); @@ -539,13 +489,144 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { new ArrayList(), false, (byte)30, false, true); + final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); + List providers = new ArrayList(); for 
(Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) { providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); } } return providers; } + + @DataProvider(name = "CombinatorialARTTilingProvider") + public Object[][] makeCombinatorialARTTilingProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + final List> allReadStates = Arrays.asList( + EnumSet.of(ActiveRegionReadState.PRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED) + ); + + final int maxTests = Integer.MAX_VALUE; + int nTests = 0; + for ( final int readLength : Arrays.asList(10, 100) ) { + for ( final int skips : Arrays.asList(0, 1, 10) ) { + for ( final int start : starts ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) { + for ( final int nLoci : Arrays.asList(1, 1000) ) { + for ( EnumSet readStates : allReadStates ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + + for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), 
bamBuilder.getAlignmentEnd())) { + nTests++; + if ( nTests < maxTests ) // && nTests == 1238 ) + tests.add(new Object[]{nTests, activeRegions, readStates, bamBuilder}); + } + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private Collection enumerateActiveRegions(final int start, final int stop) { + // should basically cut up entire region into equal sized chunks, of + // size 10, 20, 50, 100, etc, alternating skipping pieces so they are inactive + // Need to make sure we include some edge cases: + final List activeRegions = new LinkedList(); + + for ( final int stepSize : Arrays.asList(11, 29, 53, 97) ) { + for ( final boolean startWithActive : Arrays.asList(true, false) ) { + activeRegions.add(makeActiveRegionMask(start, stop, stepSize, startWithActive)); + } + } + + // active region is the whole interval + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start, stop))); + + // active region extends up to the end of the data, but doesn't include start + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start+10, stop))); + + return activeRegions; + } + + private GenomeLocSortedSet makeActiveRegionMask(final int start, final int stop, final int stepSize, final boolean startWithActive) { + final GenomeLocSortedSet active = new GenomeLocSortedSet(genomeLocParser); + + boolean includeRegion = startWithActive; + for ( int left = start; left < stop; left += stepSize) { + final int right = left + stepSize; + final GenomeLoc region = genomeLocParser.createGenomeLoc("1", left, right); + if ( includeRegion ) + active.add(region); + includeRegion = ! 
includeRegion; + } + + return active; + } + + + @Test(enabled = true, dataProvider = "CombinatorialARTTilingProvider") + public void testARTReadsInActiveRegions(final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { + logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + walker.setStates(readStates); + + final TraverseActiveRegions traversal = new TraverseActiveRegions(); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + + final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary + for ( final ActiveRegion region : activeRegionsMap.values() ) { + int nReadsExpectedInRegion = 0; + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); + final Set readNamesInRegion = readNamesInRegion(region); + + boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) + ? region.getExtendedLoc().overlapsP(readLoc) + : region.getLocation().overlapsP(readLoc); + + if ( ! readStates.contains(ActiveRegionReadState.NONPRIMARY) ) { + if ( alreadySeenReads.contains(read.getReadName()) ) + shouldBeInRegion = false; + else if ( shouldBeInRegion ) + alreadySeenReads.add(read.getReadName()); + } + + Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, "Region " + region + + " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite"); + + nReadsExpectedInRegion += shouldBeInRegion ? 
1 : 0; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } + } + + private Set readNamesInRegion(final ActiveRegion region) { + final Set readNames = new LinkedHashSet(region.getReads().size()); + for ( final SAMRecord read : region.getReads() ) + readNames.add(read.getReadName()); + return readNames; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 2f984165e..e5e28e1f6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -54,6 +54,32 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { private static final boolean DEBUG = false; protected LocusIteratorByState li; + @Test(enabled = true) + public void testUnmappedAndAllIReadsPassThrough() { + final int readLength = 10; + GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength); + GATKSAMRecord mapped2 = ArtificialSAMUtils.createArtificialRead(header,"mapped2",0,1,readLength); + GATKSAMRecord unmapped = ArtificialSAMUtils.createArtificialRead(header,"unmapped",0,1,readLength); + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(header,"allI",0,1,readLength); + + unmapped.setReadUnmappedFlag(true); + unmapped.setCigarString("*"); + allI.setCigarString(readLength + "I"); + + List reads = Arrays.asList(mapped1, unmapped, allI, mapped2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,createTestReadProperties(DownsamplingMethod.NONE, true)); + + Assert.assertTrue(li.hasNext()); + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + 
Assert.assertEquals(pileup.depthOfCoverage(), 2, "Should see only 2 reads in pileup, even with unmapped and all I reads"); + + final List rawReads = li.transferReadsFromAllPreviousPileups(); + Assert.assertEquals(rawReads, reads, "Input and transferred read lists should be the same, and include the unmapped and all I reads"); + } + @Test(enabled = true && ! DEBUG) public void testXandEQOperators() { final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; @@ -451,7 +477,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) : new DownsamplingMethod(DownsampleType.NONE, null, null, false); - final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(header.getSequenceDictionary(), nReadsPerLocus, nLoci); bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1); final List reads = bamBuilder.makeReads(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java index cf3c97b34..2a638eb69 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java @@ -47,8 +47,8 @@ import java.util.List; * To change this template use File | Settings | File Templates. 
*/ public class ArtificialBAMBuilderUnitTest extends BaseTest { - @DataProvider(name = "CombinatorialARTTilingProvider") - public Object[][] makeCombinatorialARTTilingProvider() { + @DataProvider(name = "ArtificialBAMBuilderUnitTestProvider") + public Object[][] makeArtificialBAMBuilderUnitTestProvider() { final List tests = new LinkedList(); final List starts = Arrays.asList( @@ -79,7 +79,7 @@ public class ArtificialBAMBuilderUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "CombinatorialARTTilingProvider") + @Test(dataProvider = "ArtificialBAMBuilderUnitTestProvider") public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) { Assert.assertEquals(bamBuilder.getReadLength(), readLength); Assert.assertEquals(bamBuilder.getSkipNLoci(), skips); From 4cf34ee9da6dfa9539b485daeed9f276fb192975 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 15:35:04 -0500 Subject: [PATCH 24/34] Bug fix to FisherStrand: do not let it output INFINITY. This all needs to be unit tested, but that's coming on the horizon. 
--- .../sting/gatk/walkers/annotator/FisherStrand.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 167e5df63..fd81103cd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -116,8 +116,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat else if (table1 == null) return annotationForOneTable(pValueForContingencyTable(table2)); else { // take the one with the best (i.e., least significant pvalue) - double pvalue1 = Math.max(pValueForContingencyTable(table1), MIN_PVALUE); - double pvalue2 = Math.max(pValueForContingencyTable(table2), MIN_PVALUE); + double pvalue1 = pValueForContingencyTable(table1); + double pvalue2 = pValueForContingencyTable(table2); return annotationForOneTable(Math.max(pvalue1, pvalue2)); } } @@ -129,7 +129,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * @return a hash map from FS -> phred-scaled pValue */ private Map annotationForOneTable(final double pValue) { - final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue)); + final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs return Collections.singletonMap(FS, value); // Map map = new HashMap(); // map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue))); From 79bc8180228480f037a122b609b24ff666a7040f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 16:15:58 -0500 Subject: [PATCH 25/34] Bug fix for VariantsToVCF: old dbSNP files can have '-' as reference base and those records always need to be padded. 
--- .../sting/gatk/refdata/VariantContextAdaptors.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index c7edebd81..a77341a5d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -194,17 +194,18 @@ public class VariantContextAdaptors { return null; // we weren't given enough reference context to create the VariantContext final byte refBaseForIndel = ref.getBases()[index]; + final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-"); boolean addPaddingBase; if ( isSNP(dbsnp) || isMNP(dbsnp) ) addPaddingBase = false; else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) - addPaddingBase = VariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); + addPaddingBase = refBaseIsDash || VariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); else return null; // can't handle anything else Allele refAllele; - if ( dbsnp.getNCBIRefBase().equals("-") ) + if ( refBaseIsDash ) refAllele = Allele.create(refBaseForIndel, true); else if ( ! 
Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) return null; From 3c476a92a27db20f80ea94598cdac18e3d31c09c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 16 Jan 2013 15:56:03 -0500 Subject: [PATCH 26/34] Add dummy functionality (currently throws an error) to allow HC to include unmapped reads during assembly and calling --- .../haplotypecaller/HaplotypeCaller.java | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 4da2e1179..ce6aa32f4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -184,6 +184,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false) protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000; + /** + * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling + * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the + * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking + * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, + * and may make use of them in assembly and calling, where possible. 
+ */ + @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) + protected boolean includeUnmappedReads = false; + @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) protected boolean USE_ALLELES_TRIGGER = false; @@ -354,11 +364,20 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // enable non primary and extended reads in the active region @Override public EnumSet desiredReadStates() { - return EnumSet.of( - ActiveRegionReadState.PRIMARY, - ActiveRegionReadState.NONPRIMARY, - ActiveRegionReadState.EXTENDED - ); + if ( includeUnmappedReads ) { + throw new UserException.BadArgumentValue("includeUmappedReads", "is not yet functional"); +// return EnumSet.of( +// ActiveRegionReadState.PRIMARY, +// ActiveRegionReadState.NONPRIMARY, +// ActiveRegionReadState.EXTENDED, +// ActiveRegionReadState.UNMAPPED +// ); + } else + return EnumSet.of( + ActiveRegionReadState.PRIMARY, + ActiveRegionReadState.NONPRIMARY, + ActiveRegionReadState.EXTENDED + ); } @Override From 738c24a3b1efea489a5638eb146107d8533b8878 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 16 Jan 2013 16:25:11 -0500 Subject: [PATCH 27/34] Add tests to ensure that all insertion reads appear in the active region traversal --- .../sting/utils/sam/ArtificialBAMBuilder.java | 18 +++++- .../TraverseActiveRegionsUnitTest.java | 59 ++++++++++++++++++- 2 files changed, 73 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java index f5018db8c..ab539c9dc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -32,8 +32,7 @@ import org.broadinstitute.sting.utils.NGSPlatform; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * Easy to use creator of artificial BAM files for testing @@ -64,6 +63,8 @@ public class ArtificialBAMBuilder { int readLength = 10; private final ArrayList samples = new ArrayList(); + private LinkedList additionalReads = new LinkedList(); + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); { factory.setCreateIndex(true); @@ -118,6 +119,14 @@ public class ArtificialBAMBuilder { return this; } + public void addReads(final GATKSAMRecord readToAdd) { + additionalReads.add(readToAdd); + } + + public void addReads(final Collection readsToAdd) { + additionalReads.addAll(readsToAdd); + } + public List getSamples() { return samples; } @@ -145,6 +154,11 @@ public class ArtificialBAMBuilder { } } + if ( ! additionalReads.isEmpty() ) { + reads.addAll(additionalReads); + Collections.sort(reads, new SAMRecordCoordinateComparator()); + } + return reads; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 15d4eec2d..319af5ec5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -501,6 +501,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return providers; } + // --------------------------------------------------------------------------------------------------------- + // + // Combinatorial tests to ensure reads are going into the right regions + // + // --------------------------------------------------------------------------------------------------------- + 
@DataProvider(name = "CombinatorialARTTilingProvider") public Object[][] makeCombinatorialARTTilingProvider() { final List tests = new LinkedList(); @@ -582,7 +588,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } - @Test(enabled = true, dataProvider = "CombinatorialARTTilingProvider") + @Test(enabled = true && ! DEBUG, dataProvider = "CombinatorialARTTilingProvider") public void testARTReadsInActiveRegions(final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); final List intervals = Arrays.asList( @@ -597,10 +603,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary for ( final ActiveRegion region : activeRegionsMap.values() ) { + final Set readNamesInRegion = readNamesInRegion(region); int nReadsExpectedInRegion = 0; for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); - final Set readNamesInRegion = readNamesInRegion(region); boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) ? 
region.getExtendedLoc().overlapsP(readLoc) @@ -629,4 +635,53 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { readNames.add(read.getReadName()); return readNames; } + + // --------------------------------------------------------------------------------------------------------- + // + // Make sure all insertion reads are properly included in the active regions + // + // --------------------------------------------------------------------------------------------------------- + + @Test + public void ensureAllInsertionReadsAreInActiveRegions() { + + final int readLength = 10; + final int start = 20; + final int nReadsPerLocus = 10; + final int nLoci = 3; + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setAlignmentStart(start); + + // note that the position must be +1 as the read's all I cigar puts the end 1 bp before start, leaving it out of the region + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(bamBuilder.getHeader(),"allI",0,start+1,readLength); + allI.setCigarString(readLength + "I"); + allI.setReadGroup(new GATKSAMReadGroupRecord(bamBuilder.getHeader().getReadGroups().get(0))); + + bamBuilder.addReads(allI); + + final GenomeLocSortedSet activeRegions = new GenomeLocSortedSet(bamBuilder.getGenomeLocParser()); + activeRegions.add(bamBuilder.getGenomeLocParser().createGenomeLoc("1", 10, 30)); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + + final TraverseActiveRegions traversal = new TraverseActiveRegions(); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + + final ActiveRegion region = activeRegionsMap.values().iterator().next(); + int nReadsExpectedInRegion = 0; 
+ + final Set readNamesInRegion = readNamesInRegion(region); + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + Assert.assertTrue(readNamesInRegion.contains(read.getReadName()), + "Region " + region + " should contain read " + read + " with cigar " + read.getCigarString() + " but it wasn't"); + nReadsExpectedInRegion++; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } } From dbb69a1e1088db4c7fcfd9e512b1b9e095a62ba3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 Jan 2013 22:33:16 -0500 Subject: [PATCH 28/34] Need to use ints for quals in HaplotypeScore instead of bytes because of overflow (they are summed when haplotypes are combined) --- .../walkers/annotator/HaplotypeScore.java | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index af6304297..3acba48ae 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -216,14 +216,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); List hlist = new ArrayList(); - hlist.add(new Haplotype(haplotype1.getBases(), (byte)60)); + hlist.add(new Haplotype(haplotype1.getBases(), 60)); for (int k = 1; k < haplotypesToCompute; k++) { Haplotype haplotype2 = consensusHaplotypeQueue.poll(); if (haplotype2 == null) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found - hlist.add(new Haplotype(haplotype2.getBases(), (byte)20)); + hlist.add(new Haplotype(haplotype2.getBases(), 20)); } return hlist; } else @@ -285,10 +285,10 @@ public class HaplotypeScore 
extends InfoFieldAnnotation implements StandardAnnot final int length = a.length; final byte[] consensusChars = new byte[length]; - final byte[] consensusQuals = new byte[length]; + final int[] consensusQuals = new int[length]; - final byte[] qualsA = haplotypeA.getQuals(); - final byte[] qualsB = haplotypeB.getQuals(); + final int[] qualsA = haplotypeA.getQuals(); + final int[] qualsB = haplotypeB.getQuals(); for (int i = 0; i < length; i++) { chA = a[i]; @@ -308,7 +308,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot consensusQuals[i] = qualsA[i]; } else { consensusChars[i] = chA; - consensusQuals[i] = (byte)((int)qualsA[i] + (int)qualsB[i]); + consensusQuals[i] = qualsA[i] + qualsB[i]; } } @@ -442,31 +442,38 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot private static class Haplotype { private final byte[] bases; - private final byte[] quals; + private final int[] quals; private int qualitySum = -1; - public Haplotype( final byte[] bases, final byte[] quals ) { + public Haplotype( final byte[] bases, final int[] quals ) { this.bases = bases; this.quals = quals; } - public Haplotype( final byte[] bases, final byte qual ) { + public Haplotype( final byte[] bases, final int qual ) { this.bases = bases; - quals = new byte[bases.length]; + quals = new int[bases.length]; Arrays.fill(quals, qual); } + public Haplotype( final byte[] bases, final byte[] quals ) { + this.bases = bases; + this.quals = new int[quals.length]; + for ( int i = 0 ; i < quals.length; i++ ) + this.quals[i] = (int)quals[i]; + } + public double getQualitySum() { if ( qualitySum == -1 ) { qualitySum = 0; - for ( final byte qual : quals ) { - qualitySum += (int)qual; + for ( final int qual : quals ) { + qualitySum += qual; } } return qualitySum; } - public byte[] getQuals() { + public int[] getQuals() { return quals.clone(); } From a623cca89a7310fe52118529dfd9c9f28698d8e5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 16 
Jan 2013 22:47:58 -0500 Subject: [PATCH 29/34] Bug fix for HaplotypeCaller, as reported on the forum: when reduced reads didn't completely overlap a deletion call, we were incorrectly trying to find the reference position of a base on the read that didn't exist. Added integration test to cover this case. --- .../sting/gatk/walkers/annotator/DepthOfCoverage.java | 2 +- .../gatk/walkers/annotator/DepthPerAlleleBySample.java | 2 +- .../sting/gatk/walkers/annotator/FisherStrand.java | 2 +- .../haplotypecaller/HaplotypeCallerIntegrationTest.java | 8 ++++++++ .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 5 +++++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index aeec36c18..4adb2ca71 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -99,7 +99,7 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = el.getKey(); - depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + depth += (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index a194fe323..5acea12f6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -144,7 +144,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa continue; // read is non-informative if (!vc.getAlleles().contains(a)) continue; // sanity check - shouldn't be needed - alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); + alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); } final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index fd81103cd..fbd27dfe3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -277,7 +277,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat int column = isFW ? 0 : 1; final GATKSAMRecord read = el.getKey(); - table[row][column] += (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index e86834a4a..03d4216dd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -178,4 +178,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); executeTest("HC calling on a ReducedRead BAM", spec); } + + @Test + public void testReducedBamWithReadsNotFullySpanningDeletion() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, + Arrays.asList("0446c11fe2ba68a14f938ebc6e71ded7")); + executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index b43b590df..1488f7269 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -394,6 +394,11 @@ public class ReadUtils { return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false); } + public static int 
getReadCoordinateForReferenceCoordinateUpToEndOfRead(GATKSAMRecord read, int refCoord, ClippingTail tail) { + final int leftmostSafeVariantPosition = Math.max(read.getSoftStart(), refCoord); + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), leftmostSafeVariantPosition, tail, false); + } + public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { Pair result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached); int readCoord = result.getFirst(); From 953592421b267254b0dc4811522c982bbdd5360d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 17 Jan 2013 09:19:21 -0500 Subject: [PATCH 30/34] I think we got out of sync with the HC tests as we were clobbering each other's changes. Only differences here are to some RankSumTest values. --- .../HaplotypeCallerIntegrationTest.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 03d4216dd..3ceb0df94 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); + HCTest(CEUTRIO_BAM, "", "0e59153c6359d7cb7be44e25ab552790"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); + HCTest(NA12878_BAM, "", 
"d4b377aed2c8be2ebd81ee5e43b73a93"); } @Test(enabled = false) @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6c0c441b71848c2eea38ab5e2afe1120"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "14ed8e5be2d2a0bf478d742b4baa5a46"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -124,7 +124,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "0761ff5cbf279be467833fa6708bf360"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "76fe5e57ed96541bdfee74782331b061"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "25981f7706f61d930556fb128cd1e5c5"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4701887e1927814259560d85098b6440")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @@ -175,7 +175,7 @@ public 
class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); + Arrays.asList("18d047bf8116b56e0c6212e0875eceea")); executeTest("HC calling on a ReducedRead BAM", spec); } From 6db3e473af175328166cf5aea9e0bd94ff9a9e31 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 17 Jan 2013 10:30:04 -0500 Subject: [PATCH 31/34] Better error message for bad qual --- .../gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index fc7573f21..8c8de2bad 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -425,7 +425,7 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { byte qual = p.getQual(); if ( qual > SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. 
Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); + throw new UserException.MisencodedBAM(p.getRead(), "we encountered an extremely high quality score (" + (int)qual + ")"); if ( capBaseQualsAtMappingQual ) qual = (byte)Math.min((int)qual, p.getMappingQual()); if ( (int)qual < minBaseQual ) From 6a903f2c235fd48dd02aa9f1982da9e980046c2e Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 18 Jan 2013 01:21:08 -0500 Subject: [PATCH 32/34] I finally gave up on trying to get the Haplotype/Allele merging to work in the HaplotypeCaller. I've resigned myself instead to create a mapping from Allele to Haplotype. It's cheap so not a big deal, but really shouldn't be necessary. Ryan and I are talking about refactoring for GATK2.5. --- .../haplotypecaller/HaplotypeCaller.java | 7 ++++++- .../LikelihoodCalculationEngine.java | 9 +++++++-- .../HaplotypeCallerIntegrationTest.java | 18 +++++++++--------- .../broadinstitute/sting/utils/Haplotype.java | 9 +++++++-- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index ce6aa32f4..26f2560b7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -508,12 +508,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for ( Haplotype haplotype : haplotypes ) writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); + for ( 
final Haplotype haplotype : haplotypes ) + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + // next, output the interesting reads for each sample aligned against the appropriate haplotype for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), (Haplotype) bestAllele, paddedRefLoc.getStart()); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedRefLoc.getStart()); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index aafdbf126..57e071189 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -124,9 +124,14 @@ public class LikelihoodCalculationEngine { } private PerReadAlleleLikelihoodMap computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads) { + // first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time) + final int numHaplotypes = haplotypes.size(); + final Map alleleVersions = new HashMap(numHaplotypes); + for ( final Haplotype haplotype : haplotypes ) { + alleleVersions.put(haplotype, Allele.create(haplotype.getBases())); + } final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - final int numHaplotypes = haplotypes.size(); for( final GATKSAMRecord read : reads ) { final byte[] overallGCP = new byte[read.getReadLength()]; 
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? @@ -148,7 +153,7 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - perReadAlleleLikelihoodMap.add(read, haplotype, + perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 3ceb0df94..6c7afd8bb 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "0e59153c6359d7cb7be44e25ab552790"); + HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "d4b377aed2c8be2ebd81ee5e43b73a93"); + HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } @Test(enabled = false) @@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "1a034b7eb572e1b6f659d6e5d57b3e76"); + "d590c8d6d5e58d685401b65a23846893"); } private void HCTestComplexVariants(String bam, String 
args, String md5) { @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "14ed8e5be2d2a0bf478d742b4baa5a46"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6c0c441b71848c2eea38ab5e2afe1120"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -124,7 +124,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "76fe5e57ed96541bdfee74782331b061"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "0761ff5cbf279be467833fa6708bf360"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "25981f7706f61d930556fb128cd1e5c5"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4701887e1927814259560d85098b6440")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @@ -175,7 +175,7 @@ public class HaplotypeCallerIntegrationTest 
extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("18d047bf8116b56e0c6212e0875eceea")); + Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("0446c11fe2ba68a14f938ebc6e71ded7")); + Arrays.asList("6c22e5d57c4f5b631e3345e721aaca1b")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 66aed1173..baab1f5fa 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -73,9 +73,14 @@ public class Haplotype extends Allele { @Override public boolean equals( Object h ) { - return h instanceof Haplotype && super.equals(h); + return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases()); } - + + @Override + public int hashCode() { + return Arrays.hashCode(getBases()); + } + public HashMap getEventMap() { return eventMap; } From 39c73a6cf5adf5d0643c93af0c60b354a31e73ad Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 18 Jan 2013 03:35:48 -0500 Subject: [PATCH 33/34] 1. Ryan and I noticed that the FisherStrand annotation was completely busted for indels with reduced reads; fixed. 
2. While making the previous fix and unifying FS for SNPs and indels, I noticed that FS was slightly broken in the general case for indels too; fixed. 3. I also fixed a minor bug in the Allele Biased Downsampling code for reduced reads. --- .../AlleleBiasedDownsamplingUtils.java | 11 ++-- .../gatk/walkers/annotator/FisherStrand.java | 53 +++++++++---------- .../UnifiedGenotyperIntegrationTest.java | 12 ++--- .../HaplotypeCallerIntegrationTest.java | 4 +- 4 files changed, 39 insertions(+), 41 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index a7bb58d0c..ba1da7c87 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -84,12 +84,13 @@ public class AlleleBiasedDownsamplingUtils { // start by stratifying the reads by the alleles they represent at this position for( final PileupElement pe : pileup ) { // we do not want to remove a reduced read - if ( pe.getRead().isReducedRead() ) + if ( pe.getRead().isReducedRead() ) { reducedReadPileups.add(pe); - - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); + } else { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } } // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index fbd27dfe3..ff3d7940f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -265,24 +265,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true); - final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true); - - if ( !matchesRef && !matchesAlt ) - continue; - - boolean isFW = el.getKey().getReadNegativeStrandFlag(); - - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; - + final Allele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); - table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; + updateTable(table, mostLikelyAllele, read, ref, alt, representativeCount); } } return table; } + /** Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: * fw rc @@ -299,31 +291,36 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for ( Map.Entry sample : stratifiedContexts.entrySet() ) { for (PileupElement p : sample.getValue().getBasePileup()) { - // ignore reduced reads because they are always on the forward strand! - // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test - if ( p.getRead().isReducedRead() ) - continue; - if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions continue; if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) continue; - final Allele base = Allele.create(p.getBase(), false); - final boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - - final boolean matchesRef = ref.equals(base, true); - final boolean matchesAlt = alt.equals(base, true); - if ( matchesRef || matchesAlt ) { - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; - - table[row][column] += p.getRepresentativeCount(); - } + updateTable(table, Allele.create(p.getBase(), false), p.getRead(), ref, alt, p.getRepresentativeCount()); } } return table; } + + private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { + // ignore reduced reads because they are always on the forward strand! + // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test + if ( read.isReducedRead() ) + return; + + final boolean matchesRef = allele.equals(ref, true); + final boolean matchesAlt = allele.equals(alt, true); + + if ( matchesRef || matchesAlt ) { + + final boolean isFW = !read.getReadNegativeStrandFlag(); + + int row = matchesRef ? 0 : 1; + int column = isFW ? 
0 : 1; + + table[row][column] += representativeCount; + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a84019988..5b5a75d4e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -363,7 +363,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("39c7a813fd6ee82d3604f2a868b35b2a")); + Arrays.asList("8231ae37b52b927db9fc1e5c221b0ba0")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -391,13 +391,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("3d3c5691973a223209a1341272d881be")); + Arrays.asList("a47810de2f6ef8087f4644064a0814bc")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("23b7a37a64065cee53a80495c8717eea")); + Arrays.asList("53b8d2b0fa63c5d1019855e8e0db28f0")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -497,18 +497,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = 
new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("092e42a712afb660ec79ff11c55933e2")); + Arrays.asList("02175dc9731aed92837ce0db78489fc0")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); + testReducedCalling("SNP", "fe1af8b30b7f1a267f772b9aaf388f24"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "1c9aaf65ffaa12bb766855265a1c3f8e"); + testReducedCalling("INDEL", "a85c110fcac9574a54c7daccb1e2d5ae"); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6c7afd8bb..27fe31fa7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "d590c8d6d5e58d685401b65a23846893"); + "1a034b7eb572e1b6f659d6e5d57b3e76"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - 
Arrays.asList("6c22e5d57c4f5b631e3345e721aaca1b")); + Arrays.asList("4e8121dd9dc90478f237bd6ae4d19920")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } }