2009-04-24 12:33:35 +08:00
package org.broadinstitute.sting.gatk.traversals ;
2009-04-26 10:26:08 +08:00
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker ;
2009-04-24 12:33:35 +08:00
import org.broadinstitute.sting.gatk.walkers.Walker ;
import org.broadinstitute.sting.gatk.LocusContext ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData ;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum ;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker ;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator ;
import org.broadinstitute.sting.utils.GenomeLoc ;
import org.broadinstitute.sting.utils.Utils ;
import java.util.List ;
import java.util.Iterator ;
import java.util.ArrayList ;
import java.io.File ;
import net.sf.samtools.SAMRecord ;
import net.sf.samtools.util.CloseableIterator ;
import edu.mit.broad.picard.filter.FilteringIterator ;
/ * *
* Created by IntelliJ IDEA .
* User : ebanks
* Date : Apr 23 , 2009
* Time : 10 : 26 : 03 AM
* To change this template use File | Settings | File Templates .
* /
2009-04-26 10:26:08 +08:00
public class TraverseByLocusWindows extends TraversalEngine {
2009-04-24 12:33:35 +08:00
2009-04-26 10:26:08 +08:00
public TraverseByLocusWindows ( List < File > reads , File ref , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
2009-04-24 12:33:35 +08:00
super ( reads , ref , rods ) ;
}
public < M , T > T traverse ( Walker < M , T > walker , ArrayList < GenomeLoc > locations ) {
2009-04-26 10:26:08 +08:00
if ( walker instanceof LocusWindowWalker ) {
LocusWindowWalker < M , T > locusWindowWalker = ( LocusWindowWalker < M , T > ) walker ;
T sum = traverseByIntervals ( locusWindowWalker , locations ) ;
2009-04-24 12:33:35 +08:00
return sum ;
} else {
throw new IllegalArgumentException ( "Walker isn't an interval walker!" ) ;
}
}
/ * *
* Traverse by intervals - - the key driver of linearly ordered traversal of intervals . Provides reads , RODs , and
* the reference base for each interval in the reference to the intervalWalker walker . Supports all of the
* interaction contract implied by the interval walker
*
* @param walker An interval walker object
* @param < M > MapType - - the result of calling map ( ) on walker
* @param < T > ReduceType - - the result of calling reduce ( ) on the walker
* @return 0 on success
* /
2009-04-26 10:26:08 +08:00
protected < M , T > T traverseByIntervals ( LocusWindowWalker < M , T > walker , ArrayList < GenomeLoc > locations ) {
2009-04-24 12:33:35 +08:00
logger . debug ( "Entering traverseByIntervals" ) ;
2009-04-25 03:40:21 +08:00
if ( readsFiles . size ( ) > 1 )
throw new UnsupportedOperationException ( "Cannot do ByInterval traversal on file with multiple inputs." ) ;
samReader = initializeSAMFile ( readsFiles . get ( 0 ) ) ;
2009-04-24 12:33:35 +08:00
verifySortOrder ( true ) ;
walker . initialize ( ) ;
T sum = walker . reduceInit ( ) ;
if ( locations . isEmpty ( ) ) {
logger . debug ( "There are no intervals provided for the traversal" ) ;
} else {
if ( ! samReader . hasIndex ( ) )
Utils . scareUser ( "Processing locations were requested, but no index was found for the input SAM/BAM file. This operation is potentially dangerously slow, aborting." ) ;
for ( GenomeLoc interval : locations ) {
logger . debug ( String . format ( "Processing interval %s" , interval . toString ( ) ) ) ;
CloseableIterator < SAMRecord > readIter = samReader . queryOverlapping ( interval . getContig ( ) ,
( int ) interval . getStart ( ) ,
( int ) interval . getStop ( ) ) ;
Iterator < SAMRecord > wrappedIter = WrapReadsIterator ( readIter , false ) ;
sum = carryWalkerOverInterval ( walker , wrappedIter , sum , interval ) ;
readIter . close ( ) ;
}
}
2009-04-25 05:39:44 +08:00
//printOnTraversalDone("intervals", sum);
2009-04-24 12:33:35 +08:00
walker . onTraversalDone ( sum ) ;
return sum ;
}
2009-04-26 10:26:08 +08:00
protected < M , T > T carryWalkerOverInterval ( LocusWindowWalker < M , T > walker , Iterator < SAMRecord > readIter , T sum , GenomeLoc interval ) {
2009-04-24 12:33:35 +08:00
logger . debug ( String . format ( "TraverseByIntervals.carryWalkerOverInterval Genomic interval is %s" , interval ) ) ;
// prepare the read filtering read iterator and provide it to a new interval iterator
FilteringIterator filterIter = new FilteringIterator ( readIter , new locusStreamFilterFunc ( ) ) ;
ArrayList < SAMRecord > reads = new ArrayList < SAMRecord > ( ) ;
ArrayList < Integer > offsets = new ArrayList < Integer > ( ) ;
boolean done = false ;
2009-04-29 22:59:53 +08:00
long leftmostIndex = interval . getStart ( ) ,
rightmostIndex = interval . getStop ( ) ;
2009-04-24 12:33:35 +08:00
while ( filterIter . hasNext ( ) & & ! done ) {
TraversalStatistics . nRecords + + ;
SAMRecord read = filterIter . next ( ) ;
reads . add ( read ) ;
offsets . add ( ( int ) ( read . getAlignmentStart ( ) - interval . getStart ( ) ) ) ;
2009-04-29 22:59:53 +08:00
if ( read . getAlignmentStart ( ) < leftmostIndex )
leftmostIndex = read . getAlignmentStart ( ) ;
if ( read . getAlignmentEnd ( ) > rightmostIndex )
rightmostIndex = read . getAlignmentEnd ( ) ;
if ( this . maxReads > 0 & & TraversalStatistics . nRecords > this . maxReads ) {
2009-04-24 12:33:35 +08:00
logger . warn ( String . format ( "Maximum number of reads encountered, terminating traversal " + TraversalStatistics . nRecords ) ) ;
done = true ;
}
}
2009-04-29 22:59:53 +08:00
GenomeLoc window = new GenomeLoc ( interval . getContig ( ) , leftmostIndex , rightmostIndex ) ;
LocusContext locus = new LocusContext ( window , reads , offsets ) ;
2009-04-24 12:33:35 +08:00
if ( DOWNSAMPLE_BY_COVERAGE )
locus . downsampleToCoverage ( downsamplingCoverage ) ;
2009-04-29 22:59:53 +08:00
ReferenceIterator refSite = refIter . seekForward ( window ) ;
StringBuffer refBases = new StringBuffer ( refSite . getBaseAsString ( ) ) ;
int locusLength = ( int ) ( rightmostIndex - leftmostIndex ) ;
for ( int i = 0 ; i < locusLength ; i + + ) {
refSite = refSite . next ( ) ;
refBases . append ( refSite . getBaseAsChar ( ) ) ;
}
2009-04-24 12:33:35 +08:00
locus . setReferenceContig ( refSite . getCurrentContig ( ) ) ;
// Iterate forward to get all reference ordered data covering this interval
final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus ( locus . getLocation ( ) ) ;
2009-04-29 22:59:53 +08:00
sum = walkAtinterval ( walker , sum , locus , refBases . toString ( ) , tracker ) ;
2009-04-24 12:33:35 +08:00
//System.out.format("Working at %s\n", locus.getLocation().toString());
printProgress ( "intervals" , locus . getLocation ( ) ) ;
return sum ;
}
2009-04-26 10:26:08 +08:00
protected < M , T > T walkAtinterval ( final LocusWindowWalker < M , T > walker ,
2009-04-24 12:33:35 +08:00
T sum ,
final LocusContext locus ,
2009-04-29 22:59:53 +08:00
final String refSeq ,
2009-04-24 12:33:35 +08:00
final RefMetaDataTracker tracker ) {
//logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase));
//
// Execute our contract with the walker. Call filter, map, and reduce
//
2009-04-29 22:59:53 +08:00
final boolean keepMeP = walker . filter ( tracker , refSeq , locus ) ;
2009-04-24 12:33:35 +08:00
if ( keepMeP ) {
2009-04-29 22:59:53 +08:00
M x = walker . map ( tracker , refSeq , locus ) ;
2009-04-24 12:33:35 +08:00
sum = walker . reduce ( x , sum ) ;
}
//printProgress("intervals", interval.getLocation());
return sum ;
}
}