diff --git a/java/src/org/broadinstitute/sting/gatk/filters/AdaptorSequenceFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/AdaptorSequenceFilter.java new file mode 100755 index 000000000..bf04633ec --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/AdaptorSequenceFilter.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +/** + * This class doesn't actually filter out reads (it's really a ReadTransformer): it hard-clips the + * reads to remove the adaptor sequences from them. + * + * @author ebanks, depristo + * @version 0.1 + */ + +public class AdaptorSequenceFilter implements SamRecordFilter { + + public boolean filterOut(final SAMRecord rec) { + return ReadUtils.hardClipAdaptorSequence(rec); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 5f2f35154..8fa6fcd0f 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; import java.io.File; @@ -136,36 +138,218 @@ public class ReadUtils { public static OverlapType readPairBaseOverlapType(final SAMRecord rec, long basePos, final int adaptorLength) { OverlapType state = OverlapType.NOT_OVERLAPPING; - long isize = rec.getInferredInsertSize(); - if ( isize != 0 ) { // we're not an unmapped pair -- cannot filter out - long adaptorStart, adaptorEnd; - long mateStart = rec.getMateAlignmentStart(); - long mateEnd = -1; + Pair adaptorBoundaries = getAdaptorBoundaries(rec, adaptorLength); - if ( rec.getReadNegativeStrandFlag() ) { - // we are on the negative strand, so our mate is on the positive strand - adaptorStart = mateStart - adaptorLength - 1; - adaptorEnd = mateStart - 1; - } else { - // we are on the positive strand, so our mate is on the negative strand - mateEnd = rec.getAlignmentStart() + isize - 1; - adaptorStart = mateEnd + 1; - adaptorEnd = mateEnd + adaptorLength; - } + if ( adaptorBoundaries != null ) { // we're not an unmapped pair -- cannot filter out - boolean inAdapator = basePos >= adaptorStart && basePos <= adaptorEnd; + boolean inAdapator = basePos >= adaptorBoundaries.first && basePos <= adaptorBoundaries.second; if ( inAdapator ) { state = OverlapType.IN_ADAPTOR; -// System.out.printf("baseOverlapState: %50s negStrand=%b base=%d start=%d stop=%d, mateStart=%d mateStop=%d adaptorStart=%d adaptorEnd=%d isize=%d => %s%n", -// rec.getReadName(), rec.getReadNegativeStrandFlag(), basePos, rec.getAlignmentStart(), rec.getAlignmentEnd(), mateStart, mateEnd, adaptorStart, adaptorEnd, isize, state); + //System.out.printf("baseOverlapState: %50s negStrand=%b base=%d start=%d stop=%d, adaptorStart=%d adaptorEnd=%d isize=%d => %s%n", + // rec.getReadName(), rec.getReadNegativeStrandFlag(), basePos, rec.getAlignmentStart(), rec.getAlignmentEnd(), adaptorBoundaries.first, adaptorBoundaries.second, rec.getInferredInsertSize(), state); } } return state; } + private static Pair getAdaptorBoundaries(SAMRecord rec, int adaptorLength) { + int isize = rec.getInferredInsertSize(); + if ( isize == 0 ) + return null; // don't worry about unmapped pairs + + int adaptorStart, adaptorEnd; + + if ( rec.getReadNegativeStrandFlag() ) { + // we are on the negative strand, so our mate is on the positive strand + int mateStart = rec.getMateAlignmentStart(); + adaptorStart = mateStart - adaptorLength - 1; + adaptorEnd = mateStart - 1; + } else { + // we are on the positive strand, so our mate is on the negative strand + int mateEnd = rec.getAlignmentStart() + isize - 1; + adaptorStart = mateEnd + 1; + adaptorEnd = mateEnd + adaptorLength; + } + + return new Pair(adaptorStart, adaptorEnd); + } + + // return true if the read needs to be completely clipped + public static boolean hardClipAdaptorSequence(final SAMRecord rec, int adaptorLength) { + + Pair adaptorBoundaries = getAdaptorBoundaries(rec, adaptorLength); + if ( adaptorBoundaries != null ) { + if ( rec.getReadNegativeStrandFlag() && adaptorBoundaries.second >= rec.getAlignmentStart() && adaptorBoundaries.first < rec.getAlignmentEnd() ) + return hardClipStartOfRead(rec, adaptorBoundaries.second); + else if ( !rec.getReadNegativeStrandFlag() && adaptorBoundaries.first <= rec.getAlignmentEnd() ) + return hardClipEndOfRead(rec, adaptorBoundaries.first); + } + + return false; + } + + // return true if the read needs to be completely clipped + private static boolean hardClipStartOfRead(SAMRecord rec, int stopPosition) { + + if ( stopPosition >= rec.getAlignmentEnd() ) { + // BAM representation issue -- we can't clip away all bases in a read, just leave it alone and let the filter deal with it + //System.out.printf("Entire read needs to be clipped: %50s %n", rec.getReadName()); + return true; + } + + //System.out.printf("Clipping start of read: %50s start=%d adaptorEnd=%d isize=%d %n", + // rec.getReadName(), rec.getAlignmentStart(), stopPosition, rec.getInferredInsertSize()); + + Cigar oldCigar = rec.getCigar(); + LinkedList newCigarElements = new LinkedList(); + int currentPos = rec.getAlignmentStart(); + int basesToClip = 0; + int basesAlreadyClipped = 0; + + for ( CigarElement ce : oldCigar.getCigarElements() ) { + + if ( currentPos > stopPosition) { + newCigarElements.add(ce); + continue; + } + + int elementLength = ce.getLength(); + switch ( ce.getOperator() ) { + case M: + for (int i = 0; i < elementLength; i++, currentPos++, basesToClip++) { + if ( currentPos > stopPosition ) { + newCigarElements.add(new CigarElement(elementLength - i, CigarOperator.M)); + break; + } + } + break; + case I: + case S: + basesToClip += elementLength; + break; + case D: + case N: + currentPos += elementLength; + break; + case H: + basesAlreadyClipped += elementLength; + case P: + break; + default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); + } + + } + + // copy over the unclipped bases + final byte[] bases = rec.getReadBases(); + final byte[] quals = rec.getBaseQualities(); + int newLength = bases.length - basesToClip; + byte[] newBases = new byte[newLength]; + byte[] newQuals = new byte[newLength]; + System.arraycopy(bases, basesToClip, newBases, 0, newLength); + System.arraycopy(quals, basesToClip, newQuals, 0, newLength); + rec.setReadBases(newBases); + rec.setBaseQualities(newQuals); + + // now add a CIGAR element for the clipped bases + newCigarElements.addFirst(new CigarElement(basesToClip + basesAlreadyClipped, CigarOperator.H)); + Cigar newCigar = new Cigar(newCigarElements); + rec.setCigar(newCigar); + + // adjust the start accordingly + rec.setAlignmentStart(stopPosition + 1); + + return false; + } + + // return true if the read needs to be completely clipped + private static boolean hardClipEndOfRead(SAMRecord rec, int startPosition) { + + if ( startPosition <= rec.getAlignmentStart() ) { + // BAM representation issue -- we can't clip away all bases in a read, just leave it alone and let the filter deal with it + //System.out.printf("Entire read needs to be clipped: %50s %n", rec.getReadName()); + return true; + } + + //System.out.printf("Clipping end of read: %50s adaptorStart=%d end=%d isize=%d %n", + // rec.getReadName(), startPosition, rec.getAlignmentEnd(), rec.getInferredInsertSize()); + + Cigar oldCigar = rec.getCigar(); + LinkedList newCigarElements = new LinkedList(); + int currentPos = rec.getAlignmentStart(); + int basesToKeep = 0; + int basesAlreadyClipped = 0; + + for ( CigarElement ce : oldCigar.getCigarElements() ) { + + int elementLength = ce.getLength(); + + if ( currentPos >= startPosition ) { + if ( ce.getOperator() == CigarOperator.H ) + basesAlreadyClipped += elementLength; + continue; + } + + switch ( ce.getOperator() ) { + case M: + for (int i = 0; i < elementLength; i++, currentPos++, basesToKeep++) { + if ( currentPos == startPosition ) { + newCigarElements.add(new CigarElement(i, CigarOperator.M)); + break; + } + } + + if ( currentPos != startPosition ) + newCigarElements.add(ce); + break; + case I: + case S: + newCigarElements.add(ce); + basesToKeep += elementLength; + break; + case D: + case N: + newCigarElements.add(ce); + currentPos += elementLength; + break; + case H: + case P: + newCigarElements.add(ce); + break; + default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); + } + + } + + // copy over the unclipped bases + final byte[] bases = rec.getReadBases(); + final byte[] quals = rec.getBaseQualities(); + byte[] newBases = new byte[basesToKeep]; + byte[] newQuals = new byte[basesToKeep]; + System.arraycopy(bases, 0, newBases, 0, basesToKeep); + System.arraycopy(quals, 0, newQuals, 0, basesToKeep); + rec.setReadBases(newBases); + rec.setBaseQualities(newQuals); + + // now add a CIGAR element for the clipped bases + newCigarElements.add(new CigarElement((bases.length - basesToKeep) + basesAlreadyClipped, CigarOperator.H)); + Cigar newCigar = new Cigar(newCigarElements); + rec.setCigar(newCigar); + + // adjust the stop accordingly + // rec.setAlignmentEnd(startPosition - 1); + + return false; + } + private static int DEFAULT_ADAPTOR_SIZE = 100; + + public static boolean hardClipAdaptorSequence(final SAMRecord rec) { + return hardClipAdaptorSequence(rec, DEFAULT_ADAPTOR_SIZE); + } + public static OverlapType readPairBaseOverlapType(final SAMRecord rec, long basePos) { return readPairBaseOverlapType(rec, basePos, DEFAULT_ADAPTOR_SIZE); }