From c7e006a996d4aa969f6e23e32b5cdef93cc7e953 Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 5 Feb 2010 21:47:54 +0000 Subject: [PATCH] Bug fixes for interval batching in sharding system. Sharding system now batches intervals and passes basic tests for small and large intervals and intervals that cross bin boundaries. Currently works only with a single BAM file. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2800 348d0f76-0448-11de-a6fe-93d51630548a --- .../simpleDataSources/ReadStreamPointer.java | 15 +++++--- .../ReferenceOrderedDataSource.java | 32 ++++++++++------ .../simpleDataSources/ResourcePool.java | 37 +++++++++++++++++-- .../sting/gatk/examples/HelloWalker.java | 4 +- .../ArtificialResourcePool.java | 6 ++- 5 files changed, 68 insertions(+), 26 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java index 60e7552b2..269d8ae84 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLoc; import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.samtools.SAMFileReader; @@ -145,9 +146,10 @@ class MappedReadStreamPointer extends ReadStreamPointer { // The getStop() + 1 is a hack to work around an old bug in the way Picard created SAM files where queries // over a given interval would occasionally not pick up the last read in that interval. - mergingIterator.queryOverlapping( mappedSegment.locus.getContig(), - (int)mappedSegment.locus.getStart(), - (int)mappedSegment.locus.getStop()+ PlusOneFixIterator.PLUS_ONE_FIX_CONSTANT); + GenomeLoc bounds = mappedSegment.getBounds(); + mergingIterator.queryOverlapping( bounds.getContig(), + (int)bounds.getStart(), + (int)bounds.getStop()+ PlusOneFixIterator.PLUS_ONE_FIX_CONSTANT); return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator); } @@ -163,9 +165,10 @@ class MappedReadStreamPointer extends ReadStreamPointer { MergingSamRecordIterator2 mergingIterator = new MergingSamRecordIterator2( headerMerger, sourceInfo ); // NOTE: explicitly not using the queryOverlapping hack above since, according to the above criteria, // we'd only miss reads that are one base long when performing a contained query. - mergingIterator.queryContained( mappedSegment.locus.getContig(), - (int)mappedSegment.locus.getStart(), - (int)mappedSegment.locus.getStop()+1); + GenomeLoc bounds = mappedSegment.getBounds(); + mergingIterator.queryContained( bounds.getContig(), + (int)bounds.getStart(), + (int)bounds.getStop()+1); return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java index fa75f165c..9204d92cc 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java @@ -67,7 +67,8 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { * @return Iterator through the data. */ public Iterator seek( Shard shard ) { - SeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(shard.getGenomeLocs()) ); + DataStreamSegment dataStreamSegment = shard.getGenomeLocs().size() != 0 ? new MappedStreamSegment(shard.getGenomeLocs()) : new EntireStream(); + SeekableRODIterator iterator = iteratorPool.iterator(dataStreamSegment); return iterator; } @@ -108,20 +109,27 @@ class ReferenceOrderedDataPool extends ResourcePool resources ) { - if( !(segment instanceof MappedStreamSegment) ) - throw new StingException("Reference-ordered data cannot utilitize unmapped segments."); - - GenomeLoc position = ((MappedStreamSegment)segment).locus; - //######################################### + if(segment instanceof MappedStreamSegment) { + GenomeLoc position = ((MappedStreamSegment)segment).getFirstLocation(); + //######################################### //## System.out.printf("Searching for iterator at locus %s; %d resources available%n", position, resources.size()); - for( SeekableRODIterator iterator: resources ) { + for( SeekableRODIterator iterator: resources ) { //##System.out.printf("Examining iterator at position %s [last query location: %s]%n", iterator.position(),iterator.lastQueryLocation()); - if( (iterator.position() == null && iterator.hasNext()) || - (iterator.position() != null && iterator.position().isBefore(position)) ) - return iterator; - } + if( (iterator.position() == null && iterator.hasNext()) || + (iterator.position() != null && iterator.position().isBefore(position)) ) + return iterator; + } //##System.out.printf("Failed to find iterator at locus %s%n", position); - return null; + return null; + } + else if(segment instanceof EntireStream) { + // Asking for a segment over the entire stream, so by definition, there is no best existing resource. + // Force the system to create a new one. + return null; + } + else { + throw new StingException("Unable to find a ROD iterator for segments of type " + segment.getClass()); + } } /** diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ResourcePool.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ResourcePool.java index 776e09446..7ab281f9e 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ResourcePool.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ResourcePool.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocParser; import java.util.List; import java.util.ArrayList; @@ -169,11 +170,39 @@ class EntireStream implements DataStreamSegment { * Models a mapped position within a stream of GATK input data. */ class MappedStreamSegment implements DataStreamSegment { - public final GenomeLoc locus; + public final List loci; + + /** + * Retrieves the first location covered by a mapped stream segment. + * @return Location of the first base in this segment. + */ + public GenomeLoc getFirstLocation() { + GenomeLoc firstLocus = loci.get(0); + return GenomeLocParser.createGenomeLoc(firstLocus.getContigIndex(),firstLocus.getStart()); + } + + /** + * Get the total range of the given mapped stream segment. + * @return A GenomeLoc consisting of the first base of the first locus to the last base of the last locus, inclusive. + */ + public GenomeLoc getBounds() { + GenomeLoc firstLocus = loci.get(0); + GenomeLoc lastLocus = loci.get(loci.size()-1); + return GenomeLocParser.createGenomeLoc(getFirstLocation().getContigIndex(),firstLocus.getStart(),lastLocus.getStop()); + } + public MappedStreamSegment( List loci ) { - if(loci.size() > 1) - throw new StingException("MappedStreamSegments cannot apply to a range of loci"); - this.locus = !loci.isEmpty() ? loci.get(0) : null; + // Validate that the list of loci is non-empty. + if(loci.size() == 0) + throw new StingException("Cannot map to a locus of length 0."); + + // Validate that all loci in the given list are from the same contig. + int contigIndex = loci.get(0).getContigIndex(); + for(GenomeLoc locus: loci) { + if(contigIndex != locus.getContigIndex()) + throw new StingException("All loci in a MappedStreamSegment must be on the same contig."); + } + this.loci = loci; } } diff --git a/java/src/org/broadinstitute/sting/gatk/examples/HelloWalker.java b/java/src/org/broadinstitute/sting/gatk/examples/HelloWalker.java index 94e5187df..9d4d62892 100644 --- a/java/src/org/broadinstitute/sting/gatk/examples/HelloWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/examples/HelloWalker.java @@ -36,12 +36,12 @@ public class HelloWalker extends LocusWalker { * which they fall, and the base from the reference that overlaps. * @param tracker The accessor for reference metadata. * @param ref The reference base that lines up with this locus. - * @param locus Information about reads overlapping this locus. + * @param context Information about reads aligning to this locus. * @return In this case, returns a count of how many loci were seen at this site (1). */ @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - out.printf("Hello locus %s; your ref base is %c and you have %d reads%n", context.getLocation(), ref.getBase(), context.getReads().size() ); + out.printf("Hello locus %s; your ref base is %c and you have %d reads%n", context.getLocation(), ref.getBase(), context.getBasePileup().size() ); return 1; } diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ArtificialResourcePool.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ArtificialResourcePool.java index 0aa799cec..6cbc5b070 100644 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ArtificialResourcePool.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ArtificialResourcePool.java @@ -5,6 +5,7 @@ import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.utils.sam.ArtificialSAMIterator; import org.broadinstitute.sting.utils.sam.ArtificialSAMQueryIterator; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator; @@ -68,10 +69,11 @@ public class ArtificialResourcePool extends SAMResourcePool { if (segment instanceof MappedStreamSegment && iterator instanceof ArtificialSAMQueryIterator) { ArtificialSAMQueryIterator queryIterator = (ArtificialSAMQueryIterator)iterator; MappedStreamSegment mappedSegment = (MappedStreamSegment)segment; + GenomeLoc bounds = mappedSegment.getBounds(); if (!this.queryOverlapping) { - queryIterator.queryContained(mappedSegment.locus.getContig(), (int)mappedSegment.locus.getStart(), (int)mappedSegment.locus.getStop()); + queryIterator.queryContained(bounds.getContig(), (int)bounds.getStart(), (int)bounds.getStop()); } else { - queryIterator.queryOverlapping(mappedSegment.locus.getContig(), (int)mappedSegment.locus.getStart(), (int)mappedSegment.locus.getStop()); + queryIterator.queryOverlapping(bounds.getContig(), (int)bounds.getStart(), (int)bounds.getStop()); } return queryIterator; }