Refactored interval clipping utility

Reads are now clipped in map(), and almost all cases are covered. The one case left unhandled is a read that stretches across two intervals; this will need special treatment later.
This commit is contained in:
Mauricio Carneiro 2011-08-13 19:33:53 -04:00
parent e921230e72
commit 0be1dacddb
1 changed files with 65 additions and 1 deletions

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -112,7 +113,42 @@ public class ReadUtils {
* @version 0.1
*/
public enum OverlapType { NOT_OVERLAPPING, IN_ADAPTOR }
public enum OverlapType { NOT_OVERLAPPING, IN_ADAPTOR}
/**
 * Enumerates the ways in which a read can be positioned relative to an interval.
 *
 * NO_OVERLAP:
 *   the read does not overlap the interval.
 *
 *                          |----------------|   (interval)
 *   <---------------->                          (read)
 *
 * LEFT_OVERLAP:
 *   the read starts before the beginning of the interval but ends inside of it.
 *
 *             |----------------|                (interval)
 *   <---------------->                          (read)
 *
 * RIGHT_OVERLAP:
 *   the read starts inside the interval but ends outside of it.
 *
 *   |----------------|                          (interval)
 *             <---------------->                (read)
 *
 * FULL_OVERLAP:
 *   the read starts before the interval and ends after the interval.
 *
 *       |-----------|                           (interval)
 *   <------------------->                       (read)
 *
 * CONTAINED:
 *   the read starts and ends inside the interval.
 *
 *   |----------------|                          (interval)
 *       <-------->                              (read)
 */
public enum ReadAndIntervalOverlap {
    NO_OVERLAP,
    LEFT_OVERLAP,
    RIGHT_OVERLAP,
    FULL_OVERLAP,
    CONTAINED
}
/**
* God, there's a huge information asymmetry in SAM format:
@ -569,6 +605,34 @@ public class ReadUtils {
return 0;
}
/**
 * Determines the position of the read in relation to the interval.
 *
 * Note: this function uses the UNCLIPPED ENDS of the read for every comparison, and both
 * interval boundaries are treated as inclusive: a read whose unclipped start falls exactly on
 * the interval start, or whose unclipped end falls exactly on the interval stop, is considered
 * to be inside the interval at that end.
 *
 * @param read     the read
 * @param interval the interval
 * @return the overlap type as described by the ReadAndIntervalOverlap enum
 */
public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) {
    final int readStart = read.getUnclippedStart();
    final int readEnd = read.getUnclippedEnd();
    final int intervalStart = interval.getStart();
    final int intervalStop = interval.getStop();

    // Different contig, or the read lies entirely to one side of the interval.
    if (!read.getReferenceName().equals(interval.getContig())
            || readEnd < intervalStart
            || readStart > intervalStop) {
        return ReadAndIntervalOverlap.NO_OVERLAP;
    }

    final boolean startsBeforeInterval = readStart < intervalStart;
    final boolean endsAfterInterval = readEnd > intervalStop;

    // Inclusive bounds: a read touching (or exactly matching) the interval edges is still
    // contained, consistent with the inclusive NO_OVERLAP test above.
    if (!startsBeforeInterval && !endsAfterInterval) {
        return ReadAndIntervalOverlap.CONTAINED;
    }
    if (startsBeforeInterval && endsAfterInterval) {
        return ReadAndIntervalOverlap.FULL_OVERLAP;
    }
    // Exactly one end sticks out. Decide on the unclipped start, like every other test here
    // (the aligned start would misclassify reads with clipped bases at their start).
    return startsBeforeInterval
            ? ReadAndIntervalOverlap.LEFT_OVERLAP
            : ReadAndIntervalOverlap.RIGHT_OVERLAP;
}