Bug fixes for interval batching in sharding system. Sharding system now batches intervals and passes
basic tests for small and large intervals and intervals that cross bin boundaries. Currently works only with a single BAM file. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2800 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fdc2789322
commit
c7e006a996
|
|
@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
|||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
|
||||
|
|
@ -145,9 +146,10 @@ class MappedReadStreamPointer extends ReadStreamPointer {
|
|||
|
||||
// The getStop() + 1 is a hack to work around an old bug in the way Picard created SAM files where queries
|
||||
// over a given interval would occasionally not pick up the last read in that interval.
|
||||
mergingIterator.queryOverlapping( mappedSegment.locus.getContig(),
|
||||
(int)mappedSegment.locus.getStart(),
|
||||
(int)mappedSegment.locus.getStop()+ PlusOneFixIterator.PLUS_ONE_FIX_CONSTANT);
|
||||
GenomeLoc bounds = mappedSegment.getBounds();
|
||||
mergingIterator.queryOverlapping( bounds.getContig(),
|
||||
(int)bounds.getStart(),
|
||||
(int)bounds.getStop()+ PlusOneFixIterator.PLUS_ONE_FIX_CONSTANT);
|
||||
|
||||
return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator);
|
||||
}
|
||||
|
|
@ -163,9 +165,10 @@ class MappedReadStreamPointer extends ReadStreamPointer {
|
|||
MergingSamRecordIterator2 mergingIterator = new MergingSamRecordIterator2( headerMerger, sourceInfo );
|
||||
// NOTE: explicitly not using the queryOverlapping hack above since, according to the above criteria,
|
||||
// we'd only miss reads that are one base long when performing a contained query.
|
||||
mergingIterator.queryContained( mappedSegment.locus.getContig(),
|
||||
(int)mappedSegment.locus.getStart(),
|
||||
(int)mappedSegment.locus.getStop()+1);
|
||||
GenomeLoc bounds = mappedSegment.getBounds();
|
||||
mergingIterator.queryContained( bounds.getContig(),
|
||||
(int)bounds.getStart(),
|
||||
(int)bounds.getStop()+1);
|
||||
return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -67,7 +67,8 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
|
|||
* @return Iterator through the data.
|
||||
*/
|
||||
public Iterator seek( Shard shard ) {
|
||||
SeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(shard.getGenomeLocs()) );
|
||||
DataStreamSegment dataStreamSegment = shard.getGenomeLocs().size() != 0 ? new MappedStreamSegment(shard.getGenomeLocs()) : new EntireStream();
|
||||
SeekableRODIterator iterator = iteratorPool.iterator(dataStreamSegment);
|
||||
return iterator;
|
||||
}
|
||||
|
||||
|
|
@ -108,20 +109,27 @@ class ReferenceOrderedDataPool extends ResourcePool<SeekableRODIterator,Seekable
|
|||
* @return @{inheritedDoc}
|
||||
*/
|
||||
public SeekableRODIterator selectBestExistingResource( DataStreamSegment segment, List<SeekableRODIterator> resources ) {
|
||||
if( !(segment instanceof MappedStreamSegment) )
|
||||
throw new StingException("Reference-ordered data cannot utilitize unmapped segments.");
|
||||
|
||||
GenomeLoc position = ((MappedStreamSegment)segment).locus;
|
||||
//#########################################
|
||||
if(segment instanceof MappedStreamSegment) {
|
||||
GenomeLoc position = ((MappedStreamSegment)segment).getFirstLocation();
|
||||
//#########################################
|
||||
//## System.out.printf("Searching for iterator at locus %s; %d resources available%n", position, resources.size());
|
||||
for( SeekableRODIterator iterator: resources ) {
|
||||
for( SeekableRODIterator iterator: resources ) {
|
||||
//##System.out.printf("Examining iterator at position %s [last query location: %s]%n", iterator.position(),iterator.lastQueryLocation());
|
||||
if( (iterator.position() == null && iterator.hasNext()) ||
|
||||
(iterator.position() != null && iterator.position().isBefore(position)) )
|
||||
return iterator;
|
||||
}
|
||||
if( (iterator.position() == null && iterator.hasNext()) ||
|
||||
(iterator.position() != null && iterator.position().isBefore(position)) )
|
||||
return iterator;
|
||||
}
|
||||
//##System.out.printf("Failed to find iterator at locus %s%n", position);
|
||||
return null;
|
||||
return null;
|
||||
}
|
||||
else if(segment instanceof EntireStream) {
|
||||
// Asking for a segment over the entire stream, so by definition, there is no best existing resource.
|
||||
// Force the system to create a new one.
|
||||
return null;
|
||||
}
|
||||
else {
|
||||
throw new StingException("Unable to find a ROD iterator for segments of type " + segment.getClass());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
|||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -169,11 +170,39 @@ class EntireStream implements DataStreamSegment {
|
|||
* Models a mapped position within a stream of GATK input data.
|
||||
*/
|
||||
class MappedStreamSegment implements DataStreamSegment {
|
||||
public final GenomeLoc locus;
|
||||
public final List<GenomeLoc> loci;
|
||||
|
||||
/**
|
||||
* Retrieves the first location covered by a mapped stream segment.
|
||||
* @return Location of the first base in this segment.
|
||||
*/
|
||||
public GenomeLoc getFirstLocation() {
|
||||
GenomeLoc firstLocus = loci.get(0);
|
||||
return GenomeLocParser.createGenomeLoc(firstLocus.getContigIndex(),firstLocus.getStart());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the total range of the given mapped stream segment.
|
||||
* @return A GenomeLoc consisting of the first base of the first locus to the last base of the last locus, inclusive.
|
||||
*/
|
||||
public GenomeLoc getBounds() {
|
||||
GenomeLoc firstLocus = loci.get(0);
|
||||
GenomeLoc lastLocus = loci.get(loci.size()-1);
|
||||
return GenomeLocParser.createGenomeLoc(getFirstLocation().getContigIndex(),firstLocus.getStart(),lastLocus.getStop());
|
||||
}
|
||||
|
||||
public MappedStreamSegment( List<GenomeLoc> loci ) {
|
||||
if(loci.size() > 1)
|
||||
throw new StingException("MappedStreamSegments cannot apply to a range of loci");
|
||||
this.locus = !loci.isEmpty() ? loci.get(0) : null;
|
||||
// Validate that the list of loci is non-empty.
|
||||
if(loci.size() == 0)
|
||||
throw new StingException("Cannot map to a locus of length 0.");
|
||||
|
||||
// Validate that all loci in the given list are from the same contig.
|
||||
int contigIndex = loci.get(0).getContigIndex();
|
||||
for(GenomeLoc locus: loci) {
|
||||
if(contigIndex != locus.getContigIndex())
|
||||
throw new StingException("All loci in a MappedStreamSegment must be on the same contig.");
|
||||
}
|
||||
this.loci = loci;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -36,12 +36,12 @@ public class HelloWalker extends LocusWalker<Integer,Long> {
|
|||
* which they fall, and the base from the reference that overlaps.
|
||||
* @param tracker The accessor for reference metadata.
|
||||
* @param ref The reference base that lines up with this locus.
|
||||
* @param locus Information about reads overlapping this locus.
|
||||
* @param context Information about reads aligning to this locus.
|
||||
* @return In this case, returns a count of how many loci were seen at this site (1).
|
||||
*/
|
||||
@Override
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
out.printf("Hello locus %s; your ref base is %c and you have %d reads%n", context.getLocation(), ref.getBase(), context.getReads().size() );
|
||||
out.printf("Hello locus %s; your ref base is %c and you have %d reads%n", context.getLocation(), ref.getBase(), context.getBasePileup().size() );
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import org.broadinstitute.sting.utils.sam.ArtificialSAMIterator;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMQueryIterator;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator;
|
||||
|
|
@ -68,10 +69,11 @@ public class ArtificialResourcePool extends SAMResourcePool {
|
|||
if (segment instanceof MappedStreamSegment && iterator instanceof ArtificialSAMQueryIterator) {
|
||||
ArtificialSAMQueryIterator queryIterator = (ArtificialSAMQueryIterator)iterator;
|
||||
MappedStreamSegment mappedSegment = (MappedStreamSegment)segment;
|
||||
GenomeLoc bounds = mappedSegment.getBounds();
|
||||
if (!this.queryOverlapping) {
|
||||
queryIterator.queryContained(mappedSegment.locus.getContig(), (int)mappedSegment.locus.getStart(), (int)mappedSegment.locus.getStop());
|
||||
queryIterator.queryContained(bounds.getContig(), (int)bounds.getStart(), (int)bounds.getStop());
|
||||
} else {
|
||||
queryIterator.queryOverlapping(mappedSegment.locus.getContig(), (int)mappedSegment.locus.getStart(), (int)mappedSegment.locus.getStop());
|
||||
queryIterator.queryOverlapping(bounds.getContig(), (int)bounds.getStart(), (int)bounds.getStop());
|
||||
}
|
||||
return queryIterator;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue