From 12ae3a22b65fae0f55cbc2b5c51a46f1c1ec186d Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 13 May 2009 18:51:16 +0000 Subject: [PATCH] Break locus context data access providers into modular components in preparation for traverse by loci. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@689 348d0f76-0448-11de-a6fe-93d51630548a --- .../providers/LocusContextProvider.java | 74 ---- .../providers/LocusContextQueue.java | 128 +++++++ .../providers/ReferenceLocusIterator.java | 71 ++++ .../providers/ShardDataProvider.java | 62 ++-- .../sting/gatk/iterators/LocusIterator.java | 24 ++ .../gatk/traversals/TraverseDuplicates.java | 17 +- .../traversals/TraverseLociByReference.java | 17 +- .../broadinstitute/sting/utils/GenomeLoc.java | 43 ++- .../providers/LocusContextQueueTest.java | 320 ++++++++++++++++++ 9 files changed, 610 insertions(+), 146 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextProvider.java create mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextQueue.java create mode 100755 java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceLocusIterator.java create mode 100755 java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java create mode 100755 java/test/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextQueueTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextProvider.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextProvider.java deleted file mode 100755 index 2de7c65a7..000000000 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextProvider.java +++ /dev/null @@ -1,74 +0,0 @@ -package org.broadinstitute.sting.gatk.dataSources.providers; - -import edu.mit.broad.picard.filter.FilteringIterator; -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.gatk.iterators.LocusContextIterator; -import org.broadinstitute.sting.gatk.iterators.LocusContextIteratorByHanger; -import org.broadinstitute.sting.gatk.traversals.TraversalEngine; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.util.ArrayList; -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: hanna - * Date: Apr 8, 2009 - * Time: 3:00:28 PM - * To change this template use File | Settings | File Templates. - */ -public class LocusContextProvider { - private Iterator reads; - - // What's the last locus accessed? Used for sanity checking. - private GenomeLoc lastLoc = null; - private LocusContextIterator loci; - private LocusContext locus; - protected static Logger logger = Logger.getLogger(LocusContextProvider.class); - - public LocusContextProvider( Iterator reads ) { - this.reads = new FilteringIterator(reads, new TraversalEngine.locusStreamFilterFunc()); - // prepare the iterator by loci from reads - loci = new LocusContextIteratorByHanger(this.reads); - } - - public LocusContext getLocusContext( GenomeLoc loc ) { - // Precondition checks - if( lastLoc != null && !loc.isPast( lastLoc ) ) - throw new RuntimeException( "Internal error: LocusContextProvider assumes that queries it receives are ordered." ); - - if( (loc.getStop() - loc.getStart()) > 0 ) - throw new RuntimeException( "Internal error :LocusContextProviders currently require 1-base genomeLocs."); - - // jump to the first reference site - LocusContext locusContext = advanceReadsToLoc( loci, loc ); - - // if no locus context was found, create an empty locus - if ( locusContext == null || locusContext.getLocation().compareTo( loc ) != 0 ) - locusContext = new LocusContext(loc, new ArrayList(), new ArrayList()); - - lastLoc = loc; - - return locusContext; - } - - private LocusContext advanceReadsToLoc(LocusContextIterator locusIter, GenomeLoc target) { - if ( ! locusIter.hasNext() ) - return null; - - if (locus == null) { - locus = locusIter.next(); - } - - while (target.isPast(locus.getLocation()) && locusIter.hasNext() ) { - logger.debug(String.format(" current locus is %s vs %s => %d", locus.getLocation(), target, locus.getLocation().compareTo(target))); - locus = locusIter.next(); - } - - logger.debug(String.format(" returning %s", locus.getLocation())); - return locus; - } - -} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextQueue.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextQueue.java new file mode 100755 index 000000000..23cbeea7e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextQueue.java @@ -0,0 +1,128 @@ +package org.broadinstitute.sting.gatk.dataSources.providers; + +import org.broadinstitute.sting.gatk.iterators.LocusContextIterator; +import org.broadinstitute.sting.gatk.iterators.LocusContextIteratorByHanger; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.traversals.TraversalEngine; +import org.broadinstitute.sting.gatk.dataSources.shards.Shard; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.apache.log4j.Logger; +import net.sf.samtools.SAMRecord; + +import java.util.ArrayList; +import java.util.Iterator; + +import edu.mit.broad.picard.filter.FilteringIterator; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 11:24:42 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A queue of locus contexts. Provides unidirectional seek. Stripped down + * implementation of java.util.Queue interface. + */ + +public class LocusContextQueue { + private Shard shard; + private LocusContextIterator loci; + + /** + * Gets the position to which the last seek was requested. + */ + private GenomeLoc seekPoint; + + /** + * What's the context for the last locus accessed? + * @param provider + */ + private LocusContext nextLocusContext = null; + + private static Logger logger = Logger.getLogger(LocusContextQueue.class); + + /** + * Create a new queue of locus contexts, sorted in + * @param provider + */ + public LocusContextQueue(ShardDataProvider provider) { + Iterator reads = new FilteringIterator(provider.getReadIterator(), new TraversalEngine.locusStreamFilterFunc()); + this.loci = new LocusContextIteratorByHanger(reads); + this.shard = provider.getShard(); + + // Seed the state tracking members with the first possible seek position and the first possible locus context. + seekPoint = new GenomeLoc(shard.getGenomeLoc().getContigIndex(),shard.getGenomeLoc().getStart()); + + if( loci.hasNext() ) + nextLocusContext = loci.next(); + else + nextLocusContext = this.createEmptyLocusContext(seekPoint); + } + + /** + * Get the locus context at the given position. + * @return Locus context, or null if no locus context exists at this position. + */ + public LocusContext peek() { + // Haven't reached the next locus context in the list yet. Return null. + if( seekPoint.isBefore(nextLocusContext.getLocation()) ) + return createEmptyLocusContext(seekPoint); + + return nextLocusContext; + } + + /** + * Seek to the given point the queue of locus contexts. + * @param target Target base pair to which to seek. Must be a single base pair. + * @return an instance of itself for parameter chaining. + */ + public LocusContextQueue seek(GenomeLoc target) { + if( !target.isSingleBP() ) + throw new IllegalArgumentException("Seek point must be a single base pair."); + + // If outside the range of the target, throw an illegal argument exception. + if( target.isBefore(shard.getGenomeLoc()) || target.isPast(shard.getGenomeLoc())) + throw new IllegalArgumentException(String.format("Target is out of range; target = %s, valid range = %s",target,shard.getGenomeLoc())); + + seekPoint = (GenomeLoc)target.clone(); + + // Search for the next locus context following the target positions. + while (nextLocusContext.getLocation().isBefore(target) && loci.hasNext() ) { + logger.debug(String.format(" current locus is %s vs %s => %d", nextLocusContext.getLocation(), + target, + nextLocusContext.getLocation().compareTo(target))); + nextLocusContext = loci.next(); + } + + // Couldn't find a next? Force the nextLocusContext to null. + if( nextLocusContext.getLocation().isBefore(target) && !loci.hasNext() ) + nextLocusContext = createEmptyLocusContext( seekPoint ); + + return this; + } + + /** + * Gets the point to which the queue has currently seeked. + * @return Single bp position where the queue has been positioned. A locus context may or may not + * exist at this point. + */ + public GenomeLoc getSeekPoint() { + return seekPoint; + } + + /** + * Creates a blank locus context at the specified location. + * @param site Site at which to create the blank locus context. + * @return empty context. + */ + private LocusContext createEmptyLocusContext( GenomeLoc site ) { + return new LocusContext(site, new ArrayList(), new ArrayList()); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceLocusIterator.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceLocusIterator.java new file mode 100755 index 000000000..df2c8581d --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ReferenceLocusIterator.java @@ -0,0 +1,71 @@ +package org.broadinstitute.sting.gatk.dataSources.providers; + +import org.broadinstitute.sting.gatk.iterators.LocusIterator; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.NoSuchElementException; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 10:52:47 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Iterates through all of the loci provided in the reference. + */ +public class ReferenceLocusIterator implements LocusIterator { + /** + * The entire region over which we're iterating. + */ + private GenomeLoc completeLocus; + + /** + * The current position in the traversal. + */ + private GenomeLoc currentLocus; + + /** + * Creates an iterator that can traverse over the entire + * reference specified in the given ShardDataProvider. + * @param provider Data provider to use as a backing source. + * Provider must have a reference (hasReference() == true). + */ + public ReferenceLocusIterator( ShardDataProvider provider ) { + if( !provider.hasReference() ) + throw new StingException("Trying to iterate through reference, but no reference has been provided."); + completeLocus = provider.getShard().getGenomeLoc(); + currentLocus = new GenomeLoc(completeLocus.getContig(),completeLocus.getStart()); + } + + /** + * Is the iterator still within the locus? + * @return True if the iterator has more elements. False otherwise. + */ + public boolean hasNext() { + return !currentLocus.isPast(completeLocus); + } + + /** + * Get the next single-base locus context bounded by the iterator. + * @return GenomeLoc representing the next single-base locus context. + */ + public GenomeLoc next() { + if( !hasNext() ) + throw new NoSuchElementException("No elements remaining in bounded reference region."); + GenomeLoc toReturn = (GenomeLoc)currentLocus.clone(); + currentLocus.incPos(); + return toReturn; + } + + public void remove() { + throw new UnsupportedOperationException( "ReferenceLocusIterator is read-only" ); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ShardDataProvider.java b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ShardDataProvider.java index 4acc4c0b3..2ea49f5ec 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ShardDataProvider.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/providers/ShardDataProvider.java @@ -2,9 +2,7 @@ package org.broadinstitute.sting.gatk.dataSources.providers; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; -import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard; import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource; -import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.GenomeLoc; import net.sf.samtools.SAMRecord; @@ -26,28 +24,28 @@ import net.sf.samtools.SAMRecord; * tries to assemble as much as possible with it. */ public class ShardDataProvider { + /** + * The shard over which we're providing data. + */ + private final Shard shard; + /** * The raw collection of reads. */ private final StingSAMIterator reads; - /** - * Information about the locus. Can be accessed mutually exclusively - * with the reads iterator. - */ - private final LocusContextProvider locusContextProvider; - /** * Provider of reference data for this particular shard. */ private final ReferenceProvider referenceProvider; /** - * Users should only drive using the reads directly or using the locus. - * Perhaps in the future we can support direct simultaneous access to both. + * Retrieves the shard associated with this data provider. + * @return The shard associated with this data provider. */ - private enum LastAccessType { READS, LOCUS }; - private LastAccessType lastAccessed = null; + public Shard getShard() { + return shard; + } /** * Can this data source provide reads? @@ -57,14 +55,6 @@ public class ShardDataProvider { return reads != null; } - /** - * Can this data source provide a locus context? - * @return True if possible, false otherwise. - */ - public boolean hasLocusContext() { - return locusContextProvider != null; - } - /** * Can this data source provide reference information? * @return True if possible, false otherwise. @@ -79,26 +69,9 @@ public class ShardDataProvider { * @return An iterator over all reads in this shard. */ public StingSAMIterator getReadIterator() { - if( LastAccessType.LOCUS.equals(lastAccessed) ) - throw new UnsupportedOperationException("Cannot mix direct access to reads with access to locus context"); - lastAccessed = LastAccessType.READS; return reads; } - /** - * Gets a locus context for a particular region on the genome. - * WARNING: Right now, this cannot be concurrently accessed with the read iterator. - * WARNING: Right now, accesses must be sequential along the genome. - * @param genomeLoc The location for which to determine the context. - * @return The context associated with this location. - */ - public LocusContext getLocusContext( GenomeLoc genomeLoc ) { - if( LastAccessType.READS.equals(lastAccessed) ) - throw new UnsupportedOperationException("Cannot mix direct access to reads with access to locus context"); - lastAccessed = LastAccessType.LOCUS; - return locusContextProvider.getLocusContext( genomeLoc ); - } - /** * Gets the reference base associated with this particular point on the genome. * @param genomeLoc Region for which to retrieve the base. GenomeLoc must represent a 1-base region. @@ -124,14 +97,23 @@ public class ShardDataProvider { * @param reference A getter for a section of the reference. */ public ShardDataProvider( Shard shard, SAMDataSource reads, IndexedFastaSequenceFile reference ) { + this.shard = shard; // Provide basic reads information. this.reads = reads.seek( shard ); - // Watch out! the locus context provider will start prefetching data off the queue. Only create this - // if absolutely necessary. - this.locusContextProvider = !(shard instanceof ReadShard) ? new LocusContextProvider(this.reads) : null; this.referenceProvider = (reference != null) ? new ReferenceProvider(reference,shard) : null; } + /** + * Skeletal, package protected constructor for unit tests which require a ShardDataProvider. + * @param shard the shard + * @param reads reads iterator. + */ + ShardDataProvider( Shard shard, StingSAMIterator reads ) { + this.shard = shard; + this.reads = reads; + this.referenceProvider = null; + } + /** * Retire this shard. */ diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java new file mode 100755 index 000000000..9ca0523dd --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java @@ -0,0 +1,24 @@ +package org.broadinstitute.sting.gatk.iterators; + +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.Iterator; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 10:48:56 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * An iterator for iterating through loci on a genome. + */ + +public interface LocusIterator extends Iterator { +} diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index f45021c37..e4e74e367 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -3,20 +3,16 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider; +import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; -import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator; import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; import java.io.File; import java.util.*; @@ -137,9 +133,6 @@ public class TraverseDuplicates extends TraversalEngine { /** * Traverse by reads, given the data and the walker - * @param walker the walker to execute over - * @param shard the shard of data to feed the walker - * @param locusProvider the factory for loci * @param sum of type T, the return from the walker * @param the generic type * @param the return type of the reduce function @@ -294,7 +287,6 @@ public class TraverseDuplicates extends TraversalEngine { * Traverse by reads, given the data and the walker * @param walker the walker to execute over * @param shard the shard of data to feed the walker - * @param locusProvider the factory for loci * @param sum of type T, the return from the walker * @param the generic type * @param the return type of the reduce function @@ -302,8 +294,7 @@ public class TraverseDuplicates extends TraversalEngine { */ public T traverse(Walker walker, Shard shard, - LocusContextProvider locusProvider, - BoundedReadIterator readIter, + ShardDataProvider dataProvider, T sum) { logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard)shard).getSize())); @@ -319,8 +310,8 @@ public class TraverseDuplicates extends TraversalEngine { // -> those with the same mate pair position, for paired reads // -> those flagged as unpaired and duplicated but having the same start and end and - FilteringIterator filterIter = new FilteringIterator(readIter, new duplicateStreamFilterFunc()); + FilteringIterator filterIter = new FilteringIterator(dataProvider.getReadIterator(), new duplicateStreamFilterFunc()); PushbackIterator iter = new PushbackIterator(filterIter); - return actuallyTraverse(dupWalker, readIter, sum); + return actuallyTraverse(dupWalker, iter, sum); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java index a6bf84203..12e9f2e5e 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociByReference.java @@ -4,14 +4,13 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.dataSources.shards.Shard; -import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider; -import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider; -import org.broadinstitute.sting.gatk.dataSources.providers.InvalidPositionException; +import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceLocusIterator; import org.broadinstitute.sting.gatk.dataSources.providers.ShardDataProvider; +import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextQueue; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.apache.log4j.Logger; @@ -57,18 +56,20 @@ public class TraverseLociByReference extends TraversalEngine { throw new IllegalArgumentException("Walker isn't a loci walker!"); LocusWalker locusWalker = (LocusWalker)walker; - GenomeLoc loc = shard.getGenomeLoc(); + + LocusIterator locusIterator = new ReferenceLocusIterator( dataProvider ); + LocusContextQueue locusContextQueue = new LocusContextQueue( dataProvider ); // We keep processing while the next reference location is within the interval - for( long pos = loc.getStart(); pos <= loc.getStop(); pos++ ) { - GenomeLoc site = new GenomeLoc( loc.getContig(), pos ); + while( locusIterator.hasNext() ) { + GenomeLoc site = locusIterator.next(); TraversalStatistics.nRecords++; // Iterate forward to get all reference ordered data covering this locus final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus( site ); - LocusContext locus = dataProvider.getLocusContext( site ); + LocusContext locus = locusContextQueue.seek( site ).peek(); char refBase = dataProvider.getReferenceBase( site ); if ( DOWNSAMPLE_BY_COVERAGE ) diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 672583179..21171abf4 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -29,7 +29,7 @@ import java.util.regex.Pattern; * * */ -public class GenomeLoc implements Comparable { +public class GenomeLoc implements Comparable, Cloneable { private static Logger logger = Logger.getLogger(GenomeLoc.class); private Integer contigIndex; @@ -432,6 +432,11 @@ public class GenomeLoc implements Comparable { return this.compareTo(left) > -1 && this.compareTo(right) < 1; } + public final boolean isBefore( GenomeLoc that ) { + int comparison = this.compareContigs(that); + return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() )); + } + public final boolean isPast( GenomeLoc that ) { int comparison = this.compareContigs(that); return ( comparison == 1 || ( comparison == 0 && this.getStart() > that.getStop() )); @@ -451,16 +456,32 @@ public class GenomeLoc implements Comparable { return n; } - // Dangerous -// public boolean equals(Object o) { -// // Not strictly necessary, but often a good optimization -// if (this == o) -// return true; -// if (!(o instanceof GenomeLoc)) -// return false; -// else -// return compareContigs((GenomeLoc)o) == 0; -// } + /** + * Check to see whether two genomeLocs are equal. + * Note that this implementation ignores the contigInfo object. + * @param other Other contig to compare. + */ + @Override + public boolean equals(Object other) { + if(other == null) + return false; + if(other instanceof GenomeLoc) { + GenomeLoc otherGenomeLoc = (GenomeLoc)other; + return this.contigIndex.equals(otherGenomeLoc.contigIndex) && + this.start == otherGenomeLoc.start && + this.stop == otherGenomeLoc.stop; + } + return false; + } + + /** + * Return a new GenomeLoc at this same position. + * @return A GenomeLoc with the same contents as the current loc. + */ + @Override + public Object clone() { + return new GenomeLoc(this); + } // // Comparison operations diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextQueueTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextQueueTest.java new file mode 100755 index 000000000..9ec317c27 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/providers/LocusContextQueueTest.java @@ -0,0 +1,320 @@ +package org.broadinstitute.sting.gatk.dataSources.providers; + +import org.junit.Test; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.gatk.dataSources.shards.Shard; +import org.broadinstitute.sting.gatk.dataSources.shards.LocusShard; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.BaseTest; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.SAMFileHeader; + +import java.util.List; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.Collections; +import java.io.FileNotFoundException; + +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequence; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 2:34:46 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Test the locus context queue. + */ +public class LocusContextQueueTest extends BaseTest { + private static ReferenceSequenceFile sequenceSourceFile = null; + + @BeforeClass + public static void setupGenomeLoc() throws FileNotFoundException { + sequenceSourceFile = fakeReferenceSequenceFile(); + GenomeLoc.setupRefContigOrdering(sequenceSourceFile); + } + + private static ReferenceSequenceFile fakeReferenceSequenceFile() { + return new ReferenceSequenceFile() { + public SAMSequenceDictionary getSequenceDictionary() { + SAMSequenceRecord sequenceRecord = new SAMSequenceRecord("chr1"); + SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Collections.singletonList(sequenceRecord)); + return dictionary; + } + public ReferenceSequence nextSequence() { throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); } + }; + } + + @Test + public void emptyLocusContextTest() { + SAMRecordIterator iterator = new SAMRecordIterator(); + + GenomeLoc shardBounds = new GenomeLoc("chr1",1,5); + Shard shard = new LocusShard(shardBounds); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + testReadsInContext( queue, shard.getGenomeLoc(), Collections.emptyList() ); + } + + @Test + public void singleReadTest() { + SAMRecord read = buildSAMRecord("chr1",1,5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + GenomeLoc shardBounds = new GenomeLoc("chr1",1,5); + Shard shard = new LocusShard(shardBounds); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + testReadsInContext( queue, shard.getGenomeLoc(), Collections.singletonList(read) ); + } + + @Test + public void readCoveringFirstPartTest() { + SAMRecord read = buildSAMRecord("chr1",1,5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new LocusShard(new GenomeLoc("chr1",1,10)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + testReadsInContext( queue, shard.getGenomeLoc(), Collections.singletonList(read) ); + } + + @Test + public void readCoveringLastPartTest() { + SAMRecord read = buildSAMRecord("chr1",6,10); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new LocusShard(new GenomeLoc("chr1",1,10)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + testReadsInContext( queue, shard.getGenomeLoc(), Collections.singletonList(read) ); + } + + @Test + public void readCoveringMiddleTest() { + SAMRecord read = buildSAMRecord("chr1",3,7); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new LocusShard(new GenomeLoc("chr1",1,10)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + testReadsInContext( queue, shard.getGenomeLoc(), Collections.singletonList(read) ); + } + + @Test + public void readOverlappingStartTest() { + SAMRecord read = buildSAMRecord("chr1",1,10); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new LocusShard(new GenomeLoc("chr1",6,15)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + testReadsInContext( queue, shard.getGenomeLoc(), Collections.singletonList(read) ); + } + + @Test + public void readOverlappingEndTest() { + SAMRecord read = buildSAMRecord("chr1",6,15); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new LocusShard(new GenomeLoc("chr1",1,10)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + testReadsInContext( queue, shard.getGenomeLoc(), Collections.singletonList(read) ); + } + + @Test + public void readsSpanningTest() { + SAMRecord read1 = buildSAMRecord("chr1",1,5); + SAMRecord read2 = buildSAMRecord("chr1",6,10); + SAMRecordIterator iterator = new SAMRecordIterator(read1,read2); + + Shard shard = new LocusShard(new GenomeLoc("chr1",1,10)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads,read1,read2); + testReadsInContext( queue, shard.getGenomeLoc(), expectedReads ); + } + + @Test + public void duplicateReadsTest() { + SAMRecord read1 = buildSAMRecord("chr1",1,5); + SAMRecord read2 = buildSAMRecord("chr1",1,5); + SAMRecord read3 = buildSAMRecord("chr1",6,10); + SAMRecord read4 = buildSAMRecord("chr1",6,10); + SAMRecordIterator iterator = new SAMRecordIterator(read1,read2,read3,read4); + + Shard shard = new LocusShard(new GenomeLoc("chr1",1,10)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads,read1,read2,read3,read4); + testReadsInContext( queue, shard.getGenomeLoc(), expectedReads ); + } + + @Test + public void cascadingReadsWithinBoundsTest() { + SAMRecord read1 = buildSAMRecord("chr1",2,6); + SAMRecord read2 = buildSAMRecord("chr1",3,7); + SAMRecord read3 = buildSAMRecord("chr1",4,8); + SAMRecord read4 = buildSAMRecord("chr1",5,9); + SAMRecordIterator iterator = new SAMRecordIterator(read1,read2,read3,read4); + + Shard shard = new LocusShard(new GenomeLoc("chr1",1,10)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads,read1,read2,read3,read4); + testReadsInContext( queue, shard.getGenomeLoc(), expectedReads ); + } + + @Test + public void cascadingReadsAtBoundsTest() { + SAMRecord read1 = buildSAMRecord("chr1",1,5); + SAMRecord read2 = buildSAMRecord("chr1",2,6); + SAMRecord read3 = buildSAMRecord("chr1",3,7); + SAMRecord read4 = buildSAMRecord("chr1",4,8); + SAMRecord read5 = buildSAMRecord("chr1",5,9); + SAMRecord read6 = buildSAMRecord("chr1",6,10); + SAMRecordIterator iterator = new SAMRecordIterator(read1,read2,read3,read4,read5,read6); + + Shard shard = new LocusShard(new GenomeLoc("chr1",1,10)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads,read1,read2,read3,read4,read5,read6); + testReadsInContext( queue, shard.getGenomeLoc(), expectedReads ); + } + + @Test + public void cascadingReadsOverlappingBoundsTest() { + SAMRecord read01 = buildSAMRecord("chr1",1,5); + SAMRecord read02 = buildSAMRecord("chr1",2,6); + SAMRecord read03 = buildSAMRecord("chr1",3,7); + SAMRecord read04 = buildSAMRecord("chr1",4,8); + SAMRecord read05 = buildSAMRecord("chr1",5,9); + SAMRecord read06 = buildSAMRecord("chr1",6,10); + SAMRecord read07 = buildSAMRecord("chr1",7,11); + SAMRecord read08 = buildSAMRecord("chr1",8,12); + SAMRecord read09 = buildSAMRecord("chr1",9,13); + SAMRecord read10 = buildSAMRecord("chr1",10,14); + SAMRecord read11 = buildSAMRecord("chr1",11,15); + SAMRecord read12 = buildSAMRecord("chr1",12,16); + SAMRecordIterator iterator = new SAMRecordIterator(read01,read02,read03,read04,read05,read06, + read07,read08,read09,read10,read11,read12); + + Shard shard = new LocusShard(new GenomeLoc("chr1",6,15)); + ShardDataProvider dataProvider = new ShardDataProvider( shard, iterator ); + LocusContextQueue queue = new LocusContextQueue( dataProvider ); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads,read01,read02,read03,read04,read05,read06, + read07,read08,read09,read10,read11,read12); + testReadsInContext( queue, shard.getGenomeLoc(), expectedReads ); + } + + /** + * Test the reads according to an independently derived context. + * @param queue + * @param bounds + * @param reads + */ + private void testReadsInContext( LocusContextQueue queue, GenomeLoc bounds, List reads ) { + Assert.assertEquals("Initial position of queue is incorrect", new GenomeLoc(bounds.getContig(),bounds.getStart()), queue.getSeekPoint() ); + + for( long i = bounds.getStart(); i <= bounds.getStop(); i++ ) { + GenomeLoc site = new GenomeLoc("chr1",i); + queue.seek(site); + Assert.assertEquals("Seeked queue is incorrect", site, queue.getSeekPoint() ); + + LocusContext locusContext = queue.peek(); + Assert.assertEquals("Target locus context location is incorrect", site, locusContext.getLocation() ); + int expectedReadsAtSite = 0; + + for( SAMRecord read: reads ) { + if(new GenomeLoc(read).containsP(locusContext.getLocation())) { + Assert.assertTrue("Target locus context does not contain reads", locusContext.getReads().contains(read) ); + expectedReadsAtSite++; + } + } + + Assert.assertEquals("Found wrong number of reads at site", expectedReadsAtSite, locusContext.getReads().size()); + } + + } + + private SAMRecord buildSAMRecord( String contig, int alignmentStart, int alignmentEnd ) { + SAMFileHeader header = new SAMFileHeader(); + header.setSequenceDictionary(sequenceSourceFile.getSequenceDictionary()); + + SAMRecord record = new SAMRecord(header); + + record.setReferenceIndex(sequenceSourceFile.getSequenceDictionary().getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + Cigar cigar = new Cigar(); + cigar.add(new CigarElement(alignmentEnd-alignmentStart+1,CigarOperator.M)); + record.setCigar(cigar); + return record; + } + + private class SAMRecordIterator implements StingSAMIterator { + private Iterator backingIterator = null; + + public SAMRecordIterator( SAMRecord... reads ) { + List backingList = new ArrayList(); + backingList.addAll(Arrays.asList(reads)); + backingIterator = backingList.iterator(); + } + + public boolean hasNext() { + return backingIterator.hasNext(); + } + + public SAMRecord next() { + return backingIterator.next(); + } + + public Iterator iterator() { + return this; + } + + public void close() { + // NO-OP. + } + + public void remove() { + throw new UnsupportedOperationException("Can't remove from a read-only iterator"); + } + } +}