Experimental code for better filtering of bases in sam records. Not hooked up yet.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4475 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a0de269c4b
commit
530875817f
|
|
@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
|
||||
|
||||
public class AlignmentUtils {
|
||||
|
|
@ -235,15 +236,112 @@ public class AlignmentUtils {
|
|||
if ( currentPos > windowStart )
|
||||
refIndex += Math.min(cigarElementLength, currentPos - windowStart);
|
||||
break;
|
||||
default:
|
||||
// fail silently
|
||||
return 0;
|
||||
case H:
|
||||
case P:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/** Returns the number of mismatches in the pileup element within the given reference context.
|
||||
*
|
||||
* @param read the SAMRecord
|
||||
* @param ref the reference context
|
||||
* @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good
|
||||
* @param windowSize window size (on each side) to test
|
||||
* @return a bitset representing which bases are good
|
||||
*/
|
||||
public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) {
|
||||
// first determine the positions with mismatches
|
||||
int readLength = read.getReadLength();
|
||||
BitSet mismatches = new BitSet(readLength);
|
||||
|
||||
// TODO -- we only care about starting from curpos
|
||||
|
||||
byte[] refBases = ref.getBases();
|
||||
int refIndex = read.getAlignmentStart() - (int)ref.getWindow().getStart();
|
||||
// it's possible we aren't starting at the beginning of a read
|
||||
int startOffset = 0;
|
||||
if ( refIndex < 0 ) {
|
||||
startOffset = -1 * refIndex;
|
||||
refIndex = 0;
|
||||
|
||||
// TODO -- fix me
|
||||
return null;
|
||||
}
|
||||
|
||||
byte[] readBases = read.getReadBases();
|
||||
int readIndex = 0;
|
||||
|
||||
Cigar c = read.getCigar();
|
||||
|
||||
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
int cigarElementLength = ce.getLength();
|
||||
switch ( ce.getOperator() ) {
|
||||
case M:
|
||||
for (int j = 0; j < cigarElementLength; j++, readIndex++, refIndex++) {
|
||||
if ( refIndex >= refBases.length ) {
|
||||
// TODO -- fix me
|
||||
return null;
|
||||
}
|
||||
byte refChr = refBases[refIndex];
|
||||
byte readChr = readBases[readIndex];
|
||||
if ( readChr != refChr )
|
||||
mismatches.set(readIndex);
|
||||
}
|
||||
break;
|
||||
case I:
|
||||
case S:
|
||||
readIndex += cigarElementLength;
|
||||
break;
|
||||
case D:
|
||||
case N:
|
||||
refIndex += cigarElementLength;
|
||||
break;
|
||||
case H:
|
||||
case P:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// all bits are set to false by default
|
||||
BitSet result = new BitSet(readLength);
|
||||
|
||||
int currentPos = 0, leftPos = 0, rightPos;
|
||||
int mismatchCount = 0;
|
||||
|
||||
// calculate how many mismatches exist in the windows to the left/right
|
||||
for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) {
|
||||
if ( mismatches.get(rightPos) )
|
||||
mismatchCount++;
|
||||
}
|
||||
if ( mismatchCount <= maxMismatches )
|
||||
result.set(currentPos);
|
||||
|
||||
// now, traverse over the read positions
|
||||
while ( currentPos < readLength ) {
|
||||
// add a new rightmost position
|
||||
if ( rightPos < readLength && mismatches.get(rightPos++) )
|
||||
mismatchCount++;
|
||||
// re-penalize the previous position
|
||||
if ( mismatches.get(currentPos++) )
|
||||
mismatchCount++;
|
||||
// don't penalize the current position
|
||||
if ( mismatches.get(currentPos) )
|
||||
mismatchCount--;
|
||||
// subtract the leftmost position
|
||||
if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) )
|
||||
mismatchCount--;
|
||||
|
||||
if ( mismatchCount <= maxMismatches )
|
||||
result.set(currentPos);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
/** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
|
||||
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
|
||||
* it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils.sam;
|
|||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
|
|
@ -39,6 +38,11 @@ public class GATKSAMRecord extends SAMRecord {
|
|||
// These attributes exist in memory only, and are never written to disk.
|
||||
private Map<Object, Object> temporaryAttributes;
|
||||
|
||||
// A bitset which represents the bases of the read. If a bit is set, then
|
||||
// the base is good; otherwise it is a bad base (as defined by the setter).
|
||||
// TODO: this is a temporary hack. If it works, clean it up.
|
||||
private BitSet mBitSet = null;
|
||||
|
||||
public GATKSAMRecord(SAMRecord record, boolean useOriginalBaseQualities) {
|
||||
super(null); // it doesn't matter - this isn't used
|
||||
if ( record == null )
|
||||
|
|
@ -54,7 +58,7 @@ public class GATKSAMRecord extends SAMRecord {
|
|||
for ( SAMTagAndValue attribute : attributes )
|
||||
setAttribute(attribute.tag, attribute.value);
|
||||
|
||||
// if we are using original quals, set them now if t hey are present in the record
|
||||
// if we are using original quals, set them now if they are present in the record
|
||||
if ( useOriginalBaseQualities ) {
|
||||
byte[] originalQuals = mRecord.getOriginalBaseQualities();
|
||||
if ( originalQuals != null )
|
||||
|
|
@ -66,6 +70,15 @@ public class GATKSAMRecord extends SAMRecord {
|
|||
throw new UserException.MalformedBam(this, String.format("Error: the number of base qualities does not match the number of bases in %s (and the GATK does not currently support '*' for the quals)", mRecord.getReadName()));
|
||||
}
|
||||
|
||||
public void setGoodBases(GATKSAMRecordFilter filter, boolean abortIfAlreadySet) {
|
||||
if ( mBitSet == null || !abortIfAlreadySet )
|
||||
mBitSet = filter.getGoodBases(this);
|
||||
}
|
||||
|
||||
public boolean isGoodBase(int index) {
|
||||
return ( mBitSet == null || mBitSet.length() <= index ? true : mBitSet.get(index));
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// *** The following methods are overloaded to cache the appropriate data ***//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
|||
|
|
@ -0,0 +1,38 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
/**
|
||||
* A filtering interface for GATKSAMRecords.
|
||||
*
|
||||
* @author ebanks
|
||||
* @version 0.1
|
||||
*/
|
||||
public interface GATKSAMRecordFilter {
|
||||
public BitSet getGoodBases(final GATKSAMRecord record);
|
||||
}
|
||||
Loading…
Reference in New Issue