Experimental code for better filtering of bases in sam records. Not hooked up yet.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4475 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-10-11 02:19:51 +00:00
parent a0de269c4b
commit 530875817f
3 changed files with 154 additions and 5 deletions

View File

@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.Utils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
public class AlignmentUtils {
@ -235,15 +236,112 @@ public class AlignmentUtils {
if ( currentPos > windowStart )
refIndex += Math.min(cigarElementLength, currentPos - windowStart);
break;
default:
// fail silently
return 0;
case H:
case P:
break;
}
}
return sum;
}
/** Returns the number of mismatches in the pileup element within the given reference context.
*
* @param read the SAMRecord
* @param ref the reference context
* @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good
* @param windowSize window size (on each side) to test
* @return a bitset representing which bases are good
*/
public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) {
// first determine the positions with mismatches
int readLength = read.getReadLength();
BitSet mismatches = new BitSet(readLength);
// TODO -- we only care about starting from curpos
byte[] refBases = ref.getBases();
int refIndex = read.getAlignmentStart() - (int)ref.getWindow().getStart();
// it's possible we aren't starting at the beginning of a read
int startOffset = 0;
if ( refIndex < 0 ) {
startOffset = -1 * refIndex;
refIndex = 0;
// TODO -- fix me
return null;
}
byte[] readBases = read.getReadBases();
int readIndex = 0;
Cigar c = read.getCigar();
for (int i = 0 ; i < c.numCigarElements() ; i++) {
CigarElement ce = c.getCigarElement(i);
int cigarElementLength = ce.getLength();
switch ( ce.getOperator() ) {
case M:
for (int j = 0; j < cigarElementLength; j++, readIndex++, refIndex++) {
if ( refIndex >= refBases.length ) {
// TODO -- fix me
return null;
}
byte refChr = refBases[refIndex];
byte readChr = readBases[readIndex];
if ( readChr != refChr )
mismatches.set(readIndex);
}
break;
case I:
case S:
readIndex += cigarElementLength;
break;
case D:
case N:
refIndex += cigarElementLength;
break;
case H:
case P:
break;
}
}
// all bits are set to false by default
BitSet result = new BitSet(readLength);
int currentPos = 0, leftPos = 0, rightPos;
int mismatchCount = 0;
// calculate how many mismatches exist in the windows to the left/right
for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) {
if ( mismatches.get(rightPos) )
mismatchCount++;
}
if ( mismatchCount <= maxMismatches )
result.set(currentPos);
// now, traverse over the read positions
while ( currentPos < readLength ) {
// add a new rightmost position
if ( rightPos < readLength && mismatches.get(rightPos++) )
mismatchCount++;
// re-penalize the previous position
if ( mismatches.get(currentPos++) )
mismatchCount++;
// don't penalize the current position
if ( mismatches.get(currentPos) )
mismatchCount--;
// subtract the leftmost position
if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) )
mismatchCount--;
if ( mismatchCount <= maxMismatches )
result.set(currentPos);
}
return result;
}
/** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
* it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is

View File

@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils.sam;
import java.util.*;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
@ -39,6 +38,11 @@ public class GATKSAMRecord extends SAMRecord {
// These attributes exist in memory only, and are never written to disk.
private Map<Object, Object> temporaryAttributes;
// A bitset which represents the bases of the read. If a bit is set, then
// the base is good; otherwise it is a bad base (as defined by the setter).
// TODO: this is a temporary hack. If it works, clean it up.
private BitSet mBitSet = null;
public GATKSAMRecord(SAMRecord record, boolean useOriginalBaseQualities) {
super(null); // it doesn't matter - this isn't used
if ( record == null )
@ -54,7 +58,7 @@ public class GATKSAMRecord extends SAMRecord {
for ( SAMTagAndValue attribute : attributes )
setAttribute(attribute.tag, attribute.value);
// if we are using original quals, set them now if t hey are present in the record
// if we are using original quals, set them now if they are present in the record
if ( useOriginalBaseQualities ) {
byte[] originalQuals = mRecord.getOriginalBaseQualities();
if ( originalQuals != null )
@ -66,6 +70,15 @@ public class GATKSAMRecord extends SAMRecord {
throw new UserException.MalformedBam(this, String.format("Error: the number of base qualities does not match the number of bases in %s (and the GATK does not currently support '*' for the quals)", mRecord.getReadName()));
}
public void setGoodBases(GATKSAMRecordFilter filter, boolean abortIfAlreadySet) {
if ( mBitSet == null || !abortIfAlreadySet )
mBitSet = filter.getGoodBases(this);
}
public boolean isGoodBase(int index) {
return ( mBitSet == null || mBitSet.length() <= index ? true : mBitSet.get(index));
}
///////////////////////////////////////////////////////////////////////////////
// *** The following methods are overloaded to cache the appropriate data ***//
///////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,38 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.sam;
import java.util.BitSet;
/**
* A filtering interface for GATKSAMRecords.
*
* @author ebanks
* @version 0.1
*/
public interface GATKSAMRecordFilter {
public BitSet getGoodBases(final GATKSAMRecord record);
}