diff --git a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index dffb8060e..3293cfd45 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.Utils; import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; public class AlignmentUtils { @@ -235,15 +236,112 @@ public class AlignmentUtils { if ( currentPos > windowStart ) refIndex += Math.min(cigarElementLength, currentPos - windowStart); break; - default: - // fail silently - return 0; + case H: + case P: + break; } } return sum; } + /** Returns the number of mismatches in the pileup element within the given reference context. + * + * @param read the SAMRecord + * @param ref the reference context + * @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good + * @param windowSize window size (on each side) to test + * @return a bitset representing which bases are good + */ + public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) { + // first determine the positions with mismatches + int readLength = read.getReadLength(); + BitSet mismatches = new BitSet(readLength); + + // TODO -- we only care about starting from curpos + + byte[] refBases = ref.getBases(); + int refIndex = read.getAlignmentStart() - (int)ref.getWindow().getStart(); + // it's possible we aren't starting at the beginning of a read + int startOffset = 0; + if ( refIndex < 0 ) { + startOffset = -1 * refIndex; + refIndex = 0; + + // TODO -- fix me + return null; + } + + byte[] readBases = read.getReadBases(); + int readIndex = 0; + + Cigar c = read.getCigar(); + + for (int i = 0 ; i < c.numCigarElements() ; i++) { + CigarElement ce = c.getCigarElement(i); + int cigarElementLength = ce.getLength(); + switch ( ce.getOperator() ) { + case M: + for (int j = 0; j < cigarElementLength; j++, readIndex++, refIndex++) { + if ( refIndex >= refBases.length ) { + // TODO -- fix me + return null; + } + byte refChr = refBases[refIndex]; + byte readChr = readBases[readIndex]; + if ( readChr != refChr ) + mismatches.set(readIndex); + } + break; + case I: + case S: + readIndex += cigarElementLength; + break; + case D: + case N: + refIndex += cigarElementLength; + break; + case H: + case P: + break; + } + } + + // all bits are set to false by default + BitSet result = new BitSet(readLength); + + int currentPos = 0, leftPos = 0, rightPos; + int mismatchCount = 0; + + // calculate how many mismatches exist in the windows to the left/right + for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) { + if ( mismatches.get(rightPos) ) + mismatchCount++; + } + if ( mismatchCount <= maxMismatches ) + result.set(currentPos); + + // now, traverse over the read positions + while ( currentPos < readLength ) { + // add a new rightmost position + if ( rightPos < readLength && mismatches.get(rightPos++) ) + mismatchCount++; + // re-penalize the previous position + if ( mismatches.get(currentPos++) ) + mismatchCount++; + // don't penalize the current position + if ( mismatches.get(currentPos) ) + mismatchCount--; + // subtract the leftmost position + if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) ) + mismatchCount--; + + if ( mismatchCount <= maxMismatches ) + result.set(currentPos); + } + + return result; + } /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. * This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but * it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is diff --git a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 9e3b2609f..036477758 100755 --- a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils.sam; import java.util.*; import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; /** @@ -39,6 +38,11 @@ public class GATKSAMRecord extends SAMRecord { // These attributes exist in memory only, and are never written to disk. private Map temporaryAttributes; + // A bitset which represents the bases of the read. If a bit is set, then + // the base is good; otherwise it is a bad base (as defined by the setter). + // TODO: this is a temporary hack. If it works, clean it up. + private BitSet mBitSet = null; + public GATKSAMRecord(SAMRecord record, boolean useOriginalBaseQualities) { super(null); // it doesn't matter - this isn't used if ( record == null ) @@ -54,7 +58,7 @@ public class GATKSAMRecord extends SAMRecord { for ( SAMTagAndValue attribute : attributes ) setAttribute(attribute.tag, attribute.value); - // if we are using original quals, set them now if t hey are present in the record + // if we are using original quals, set them now if they are present in the record if ( useOriginalBaseQualities ) { byte[] originalQuals = mRecord.getOriginalBaseQualities(); if ( originalQuals != null ) @@ -66,6 +70,15 @@ public class GATKSAMRecord extends SAMRecord { throw new UserException.MalformedBam(this, String.format("Error: the number of base qualities does not match the number of bases in %s (and the GATK does not currently support '*' for the quals)", mRecord.getReadName())); } + public void setGoodBases(GATKSAMRecordFilter filter, boolean abortIfAlreadySet) { + if ( mBitSet == null || !abortIfAlreadySet ) + mBitSet = filter.getGoodBases(this); + } + + public boolean isGoodBase(int index) { + return ( mBitSet == null || mBitSet.length() <= index ? true : mBitSet.get(index)); + } + /////////////////////////////////////////////////////////////////////////////// // *** The following methods are overloaded to cache the appropriate data ***// /////////////////////////////////////////////////////////////////////////////// diff --git a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecordFilter.java b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecordFilter.java new file mode 100755 index 000000000..7fbfa94e3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecordFilter.java @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import java.util.BitSet; + +/** + * A filtering interface for GATKSAMRecords. + * + * @author ebanks + * @version 0.1 + */ +public interface GATKSAMRecordFilter { + public BitSet getGoodBases(final GATKSAMRecord record); +} \ No newline at end of file