From 95e2ae0171fadb402c18d6be42d9007a94c9f43c Mon Sep 17 00:00:00 2001 From: ebanks Date: Mon, 29 Jun 2009 16:50:05 +0000 Subject: [PATCH] Deal with reads whose ends are aligned off the end of a chromosome. Includes update to ignore non-ATCG bases (not just 'N') (Also, create a BWA dir for future work) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1117 348d0f76-0448-11de-a6fe-93d51630548a --- .../traversals/TraverseByLocusWindows.java | 14 ++++++--- .../walkers/indels/IntervalCleanerWalker.java | 18 ++++++++--- .../playground/indels/AlignmentUtils.java | 31 ++++++++++++++----- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java index fddfadc60..ca9bf0697 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java @@ -4,7 +4,6 @@ import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.*; import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.fasta.*; @@ -111,7 +110,6 @@ public class TraverseByLocusWindows extends TraversalEngine { while (readIter.hasNext() && !done) { TraversalStatistics.nRecords++; - SAMRecord read = readIter.next(); reads.add(read); if ( read.getAlignmentStart() < leftmostIndex ) @@ -134,12 +132,20 @@ public class TraverseByLocusWindows extends TraversalEngine { protected T carryWalkerOverInterval(LocusWindowWalker walker, T sum, LocusContext window) { - String refBases = new String(sequenceFile.getSubsequenceAt(window.getContig(),window.getLocation().getStart(),window.getLocation().getStop()).getBases()); + int contigLength = getSAMHeader().getSequence(window.getLocation().getContig()).getSequenceLength(); + String refSuffix = ""; + if ( window.getLocation().getStop() > contigLength ) { + refSuffix = Utils.dupString('x', (int)window.getLocation().getStop() - contigLength); + window.getLocation().setStop(contigLength); + } + + StringBuffer refBases = new StringBuffer(new String(sequenceFile.getSubsequenceAt(window.getContig(),window.getLocation().getStart(),window.getLocation().getStop()).getBases())); + refBases.append(refSuffix); // Iterate forward to get all reference ordered data covering this interval final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus(window.getLocation()); - sum = walkAtinterval( walker, sum, window, refBases, tracker ); + sum = walkAtinterval( walker, sum, window, refBases.toString(), tracker ); printProgress("intervals", window.getLocation()); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java index 8cd306dc7..a602d00dc 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java @@ -171,8 +171,15 @@ public class IntervalCleanerWalker extends LocusWindowWalker for (int readIndex = 0 ; readIndex < readSeq.length() ; refIndex++, readIndex++ ) { if ( refIndex >= refSeq.length() ) sum += MAX_QUAL; - else if ( Character.toUpperCase(readSeq.charAt(readIndex)) != Character.toUpperCase(refSeq.charAt(refIndex)) ) - sum += (int)quals.charAt(readIndex) - 33; + else { + char refChr = refSeq.charAt(refIndex); + char readChr = readSeq.charAt(readIndex); + if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + continue; // do not count Ns/Xs/etc ? + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + sum += (int)quals.charAt(readIndex) - 33; + } } return sum; } @@ -692,9 +699,10 @@ public class IntervalCleanerWalker extends LocusWindowWalker boolean match = true; for ( int testRefPos = newIndex - period, indelPos = 0 ; testRefPos < newIndex; testRefPos++, indelPos++) { - if ( Character.toUpperCase(refSeq.charAt(testRefPos)) != indelString.charAt(indelPos) || indelString.charAt(indelPos) == 'N' ) { - match = false; - break; + char indelChr = indelString.charAt(indelPos); + if ( Character.toUpperCase(refSeq.charAt(testRefPos)) != indelChr || BaseUtils.simpleBaseToBaseIndex(indelChr) == -1 ) { + match = false; + break; } } if ( match ) diff --git a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java index af565b0c6..df1eb3749 100644 --- a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java @@ -6,7 +6,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.picard.reference.ReferenceSequence; import org.broadinstitute.sting.playground.utils.CountedObject; -import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.*; import java.util.*; @@ -41,9 +41,13 @@ public class AlignmentUtils { switch( ce.getOperator() ) { case M: for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { - if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ? - if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != - Character.toUpperCase((char)ref[i_ref]) ) mm_count++; + char refChr = (char)ref[i_ref]; + char readChr = r.getReadString().charAt(i_read); + if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + continue; // do not count Ns/Xs/etc ? + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + mm_count++; } break; case I: i_read += ce.getLength(); break; @@ -74,9 +78,13 @@ public class AlignmentUtils { switch( ce.getOperator() ) { case M: for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { - if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ? - if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != - Character.toUpperCase(ref[i_ref]) ) mm_count++; + char refChr = (char)ref[i_ref]; + char readChr = r.getReadString().charAt(i_read); + if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + continue; // do not count Ns/Xs/etc ? + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + mm_count++; } break; case I: i_read += ce.getLength(); break; @@ -123,7 +131,14 @@ public class AlignmentUtils { switch ( ce.getOperator() ) { case M: for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) { - if ( refIndex < refSeq.length() && Character.toUpperCase(readSeq.charAt(readIdx)) != Character.toUpperCase(refSeq.charAt(refIndex)) ) + if ( refIndex >= refSeq.length() ) + continue; + char refChr = refSeq.charAt(refIndex); + char readChr = readSeq.charAt(readIdx); + if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + continue; // do not count Ns/Xs/etc ? + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) mismatches++; } break;