Deal with reads whose ends are aligned off the end of a chromosome.

Includes update to ignore non-ATCG bases (not just 'N')
(Also, create a BWA dir for future work)



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1117 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-06-29 16:50:05 +00:00
parent 65a788f18a
commit 95e2ae0171
3 changed files with 46 additions and 17 deletions

View File

@ -4,7 +4,6 @@ import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.*;
@ -111,7 +110,6 @@ public class TraverseByLocusWindows extends TraversalEngine {
while (readIter.hasNext() && !done) {
TraversalStatistics.nRecords++;
SAMRecord read = readIter.next();
reads.add(read);
if ( read.getAlignmentStart() < leftmostIndex )
@ -134,12 +132,20 @@ public class TraverseByLocusWindows extends TraversalEngine {
protected <M, T> T carryWalkerOverInterval(LocusWindowWalker<M, T> walker, T sum, LocusContext window) {
String refBases = new String(sequenceFile.getSubsequenceAt(window.getContig(),window.getLocation().getStart(),window.getLocation().getStop()).getBases());
int contigLength = getSAMHeader().getSequence(window.getLocation().getContig()).getSequenceLength();
String refSuffix = "";
if ( window.getLocation().getStop() > contigLength ) {
refSuffix = Utils.dupString('x', (int)window.getLocation().getStop() - contigLength);
window.getLocation().setStop(contigLength);
}
StringBuffer refBases = new StringBuffer(new String(sequenceFile.getSubsequenceAt(window.getContig(),window.getLocation().getStart(),window.getLocation().getStop()).getBases()));
refBases.append(refSuffix);
// Iterate forward to get all reference ordered data covering this interval
final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus(window.getLocation());
sum = walkAtinterval( walker, sum, window, refBases, tracker );
sum = walkAtinterval( walker, sum, window, refBases.toString(), tracker );
printProgress("intervals", window.getLocation());

View File

@ -171,8 +171,15 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
for (int readIndex = 0 ; readIndex < readSeq.length() ; refIndex++, readIndex++ ) {
if ( refIndex >= refSeq.length() )
sum += MAX_QUAL;
else if ( Character.toUpperCase(readSeq.charAt(readIndex)) != Character.toUpperCase(refSeq.charAt(refIndex)) )
sum += (int)quals.charAt(readIndex) - 33;
else {
char refChr = refSeq.charAt(refIndex);
char readChr = readSeq.charAt(readIndex);
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
continue; // do not count Ns/Xs/etc ?
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
sum += (int)quals.charAt(readIndex) - 33;
}
}
return sum;
}
@ -692,9 +699,10 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
boolean match = true;
for ( int testRefPos = newIndex - period, indelPos = 0 ; testRefPos < newIndex; testRefPos++, indelPos++) {
if ( Character.toUpperCase(refSeq.charAt(testRefPos)) != indelString.charAt(indelPos) || indelString.charAt(indelPos) == 'N' ) {
match = false;
break;
char indelChr = indelString.charAt(indelPos);
if ( Character.toUpperCase(refSeq.charAt(testRefPos)) != indelChr || BaseUtils.simpleBaseToBaseIndex(indelChr) == -1 ) {
match = false;
break;
}
}
if ( match )

View File

@ -6,7 +6,7 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.playground.utils.CountedObject;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.*;
import java.util.*;
@ -41,9 +41,13 @@ public class AlignmentUtils {
switch( ce.getOperator() ) {
case M:
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ?
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
Character.toUpperCase((char)ref[i_ref]) ) mm_count++;
char refChr = (char)ref[i_ref];
char readChr = r.getReadString().charAt(i_read);
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
continue; // do not count Ns/Xs/etc ?
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
mm_count++;
}
break;
case I: i_read += ce.getLength(); break;
@ -74,9 +78,13 @@ public class AlignmentUtils {
switch( ce.getOperator() ) {
case M:
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ?
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
Character.toUpperCase(ref[i_ref]) ) mm_count++;
char refChr = (char)ref[i_ref];
char readChr = r.getReadString().charAt(i_read);
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
continue; // do not count Ns/Xs/etc ?
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
mm_count++;
}
break;
case I: i_read += ce.getLength(); break;
@ -123,7 +131,14 @@ public class AlignmentUtils {
switch ( ce.getOperator() ) {
case M:
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) {
if ( refIndex < refSeq.length() && Character.toUpperCase(readSeq.charAt(readIdx)) != Character.toUpperCase(refSeq.charAt(refIndex)) )
if ( refIndex >= refSeq.length() )
continue;
char refChr = refSeq.charAt(refIndex);
char readChr = readSeq.charAt(readIdx);
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
continue; // do not count Ns/Xs/etc ?
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
mismatches++;
}
break;