Experimental code for better filtering of bases in SAM records. Not hooked up yet.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4475 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a0de269c4b
commit
530875817f
|
|
@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
|
|
||||||
public class AlignmentUtils {
|
public class AlignmentUtils {
|
||||||
|
|
@ -235,15 +236,112 @@ public class AlignmentUtils {
|
||||||
if ( currentPos > windowStart )
|
if ( currentPos > windowStart )
|
||||||
refIndex += Math.min(cigarElementLength, currentPos - windowStart);
|
refIndex += Math.min(cigarElementLength, currentPos - windowStart);
|
||||||
break;
|
break;
|
||||||
default:
|
case H:
|
||||||
// fail silently
|
case P:
|
||||||
return 0;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the number of mismatches in the pileup element within the given reference context.
|
||||||
|
*
|
||||||
|
* @param read the SAMRecord
|
||||||
|
* @param ref the reference context
|
||||||
|
* @param maxMismatches the maximum number of surrounding mismatches we tolerate to consider a base good
|
||||||
|
* @param windowSize window size (on each side) to test
|
||||||
|
* @return a bitset representing which bases are good
|
||||||
|
*/
|
||||||
|
public static BitSet mismatchesInRefWindow(SAMRecord read, ReferenceContext ref, int maxMismatches, int windowSize) {
|
||||||
|
// first determine the positions with mismatches
|
||||||
|
int readLength = read.getReadLength();
|
||||||
|
BitSet mismatches = new BitSet(readLength);
|
||||||
|
|
||||||
|
// TODO -- we only care about starting from curpos
|
||||||
|
|
||||||
|
byte[] refBases = ref.getBases();
|
||||||
|
int refIndex = read.getAlignmentStart() - (int)ref.getWindow().getStart();
|
||||||
|
// it's possible we aren't starting at the beginning of a read
|
||||||
|
int startOffset = 0;
|
||||||
|
if ( refIndex < 0 ) {
|
||||||
|
startOffset = -1 * refIndex;
|
||||||
|
refIndex = 0;
|
||||||
|
|
||||||
|
// TODO -- fix me
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
byte[] readBases = read.getReadBases();
|
||||||
|
int readIndex = 0;
|
||||||
|
|
||||||
|
Cigar c = read.getCigar();
|
||||||
|
|
||||||
|
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||||
|
CigarElement ce = c.getCigarElement(i);
|
||||||
|
int cigarElementLength = ce.getLength();
|
||||||
|
switch ( ce.getOperator() ) {
|
||||||
|
case M:
|
||||||
|
for (int j = 0; j < cigarElementLength; j++, readIndex++, refIndex++) {
|
||||||
|
if ( refIndex >= refBases.length ) {
|
||||||
|
// TODO -- fix me
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
byte refChr = refBases[refIndex];
|
||||||
|
byte readChr = readBases[readIndex];
|
||||||
|
if ( readChr != refChr )
|
||||||
|
mismatches.set(readIndex);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case I:
|
||||||
|
case S:
|
||||||
|
readIndex += cigarElementLength;
|
||||||
|
break;
|
||||||
|
case D:
|
||||||
|
case N:
|
||||||
|
refIndex += cigarElementLength;
|
||||||
|
break;
|
||||||
|
case H:
|
||||||
|
case P:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// all bits are set to false by default
|
||||||
|
BitSet result = new BitSet(readLength);
|
||||||
|
|
||||||
|
int currentPos = 0, leftPos = 0, rightPos;
|
||||||
|
int mismatchCount = 0;
|
||||||
|
|
||||||
|
// calculate how many mismatches exist in the windows to the left/right
|
||||||
|
for ( rightPos = 1; rightPos <= windowSize && rightPos < readLength; rightPos++) {
|
||||||
|
if ( mismatches.get(rightPos) )
|
||||||
|
mismatchCount++;
|
||||||
|
}
|
||||||
|
if ( mismatchCount <= maxMismatches )
|
||||||
|
result.set(currentPos);
|
||||||
|
|
||||||
|
// now, traverse over the read positions
|
||||||
|
while ( currentPos < readLength ) {
|
||||||
|
// add a new rightmost position
|
||||||
|
if ( rightPos < readLength && mismatches.get(rightPos++) )
|
||||||
|
mismatchCount++;
|
||||||
|
// re-penalize the previous position
|
||||||
|
if ( mismatches.get(currentPos++) )
|
||||||
|
mismatchCount++;
|
||||||
|
// don't penalize the current position
|
||||||
|
if ( mismatches.get(currentPos) )
|
||||||
|
mismatchCount--;
|
||||||
|
// subtract the leftmost position
|
||||||
|
if ( leftPos < currentPos - windowSize && mismatches.get(leftPos++) )
|
||||||
|
mismatchCount--;
|
||||||
|
|
||||||
|
if ( mismatchCount <= maxMismatches )
|
||||||
|
result.set(currentPos);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
/** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
|
/** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
|
||||||
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
|
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
|
||||||
* it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
|
* it only counts blocks without actually allocating and filling the list of blocks themselves. Hence, this method is
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils.sam;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -39,6 +38,11 @@ public class GATKSAMRecord extends SAMRecord {
|
||||||
// These attributes exist in memory only, and are never written to disk.
|
// These attributes exist in memory only, and are never written to disk.
|
||||||
private Map<Object, Object> temporaryAttributes;
|
private Map<Object, Object> temporaryAttributes;
|
||||||
|
|
||||||
|
// A bitset which represents the bases of the read. If a bit is set, then
|
||||||
|
// the base is good; otherwise it is a bad base (as defined by the setter).
|
||||||
|
// TODO: this is a temporary hack. If it works, clean it up.
|
||||||
|
private BitSet mBitSet = null;
|
||||||
|
|
||||||
public GATKSAMRecord(SAMRecord record, boolean useOriginalBaseQualities) {
|
public GATKSAMRecord(SAMRecord record, boolean useOriginalBaseQualities) {
|
||||||
super(null); // it doesn't matter - this isn't used
|
super(null); // it doesn't matter - this isn't used
|
||||||
if ( record == null )
|
if ( record == null )
|
||||||
|
|
@ -66,6 +70,15 @@ public class GATKSAMRecord extends SAMRecord {
|
||||||
throw new UserException.MalformedBam(this, String.format("Error: the number of base qualities does not match the number of bases in %s (and the GATK does not currently support '*' for the quals)", mRecord.getReadName()));
|
throw new UserException.MalformedBam(this, String.format("Error: the number of base qualities does not match the number of bases in %s (and the GATK does not currently support '*' for the quals)", mRecord.getReadName()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setGoodBases(GATKSAMRecordFilter filter, boolean abortIfAlreadySet) {
|
||||||
|
if ( mBitSet == null || !abortIfAlreadySet )
|
||||||
|
mBitSet = filter.getGoodBases(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isGoodBase(int index) {
|
||||||
|
return ( mBitSet == null || mBitSet.length() <= index ? true : mBitSet.get(index));
|
||||||
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
// *** The following methods are overloaded to cache the appropriate data ***//
|
// *** The following methods are overloaded to cache the appropriate data ***//
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A filtering interface for GATKSAMRecords.
|
||||||
|
*
|
||||||
|
* @author ebanks
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public interface GATKSAMRecordFilter {
|
||||||
|
public BitSet getGoodBases(final GATKSAMRecord record);
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue