diff --git a/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java index a0ed6cbf5..f307ee696 100755 --- a/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java +++ b/java/src/org/broadinstitute/sting/gatk/filters/TinyFragmentFilter.java @@ -72,7 +72,7 @@ public class TinyFragmentFilter implements SamRecordFilter { long mateStart = rec.getMateAlignmentStart(); long mateEnd = rec.getAlignmentStart() + isize; boolean bad = rec.getReadNegativeStrandFlag() ? start < mateStart : end > mateEnd; - //System.out.printf("%s %d %d %d %d %d => %b%n", rec.getReadName(), start, end, mateStart, mateEnd, isize, bad); + System.out.printf("%s %d %d %d %d %d => %b%n", rec.getReadName(), start, end, mateStart, mateEnd, isize, bad); if ( bad ) { //System.out.printf("TinyFragment: " + rec.format()); return true; @@ -81,4 +81,4 @@ public class TinyFragmentFilter implements SamRecordFilter { } } } -} \ No newline at end of file +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 387f9b02b..4a3b7e8ae 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -75,6 +75,9 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]") protected Integer compressionLevel = 5; + @Argument(fullName="cleanPerfectMatches", shortName="cpm", required=false, doc="If true, Realigner will ignore the NM == 0 flag and include reads with supposely no mismatches to reference for cleaning. Useful for malformed BAM files") + protected boolean CLEAN_PERFECT_MATCHES = false; + public enum RealignerSortingStrategy { NO_SORT, ON_DISK, @@ -284,6 +287,30 @@ public class IndelRealigner extends ReadWalker { } } + /** + * returns true if a read has only a single cigar element (indicating its xM) and it has a NM flag + * and NM == 0. + * @param read + * @return + */ + private boolean perfectlyMatchesReference(SAMRecord read) { + if ( CLEAN_PERFECT_MATCHES ) { + return false; + } else { + boolean cigarIsMatches = read.getCigar().numCigarElements() == 1; + Integer NM = read.getIntegerAttribute("NM"); + boolean noMM = NM != null && NM == 0; + boolean perfectMatch = cigarIsMatches && noMM; +// if ( perfectMatch ) { +// System.out.println("Perfect match " + read.format()); +// } + return perfectMatch; + } + } + + int nPerfectMatches = 0; + int nReadsToClean = 0; + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { if ( currentInterval == null ) { emit(read); @@ -310,11 +337,13 @@ public class IndelRealigner extends ReadWalker { } else if ( readLoc.overlapsP(currentInterval) ) { if ( read.getReadUnmappedFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getMappingQuality() == 0 || - read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) { + read.getNotPrimaryAlignmentFlag() || + read.getMappingQuality() == 0 || + read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) { readsNotToClean.add(read); - } else { + } + else { + nReadsToClean++; readsToClean.add(read, ref.getBases()); // add the rods to the list of known variants populateKnownIndels(metaDataTracker, null); @@ -485,6 +514,17 @@ public class IndelRealigner extends ReadWalker { continue; } + // optimization to avoid trying to clean perfect matches to the reference + if ( perfectlyMatchesReference(read) ) { + refReads.add(read); + nPerfectMatches++; + if ( nPerfectMatches % 10000 == 0 ) { + logger.debug(String.format("Perfect matching fraction: %d %d => %.2f%n", nPerfectMatches, nReadsToClean, 100.0 * nPerfectMatches / ( nReadsToClean + 1))); + } + + continue; + } + final AlignedRead aRead = new AlignedRead(read); // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence diff --git a/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java index 5c67d19e1..087fe2823 100755 --- a/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ b/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2010, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,15 +12,14 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.utils; @@ -33,8 +32,6 @@ import java.util.List; import java.util.ArrayList; import java.util.Collections; -import org.broadinstitute.sting.utils.collections.PrimitivePair; - /** * Created by IntelliJ IDEA. * User: asivache @@ -76,144 +73,147 @@ public class SWPairwiseAlignment { public int getAlignmentStart2wrt1() { return alignment_offset; } public void align(final byte[] a, final byte[] b) { - final int n = a.length; - final int m = b.length; - double [][] sw = new double[n+1][m+1]; + final int n = a.length; + final int m = b.length; + double [][] sw = new double[n+1][m+1]; + int [][] btrack = new int[n+1][m+1]; + calculateMatrix(a, b, sw, btrack); + calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) + } - int [][] btrack = new int[n+1][m+1]; + private void calculateMatrix(final byte[] a, final byte[] b, double [][] sw, int [][] btrack ) { + final int n = a.length; + final int m = b.length; - // build smith-waterman matrix and keep backtrack info: - for ( int i = 1 ; i < n+1 ; i++ ) { - byte a_base = a[i-1]; // letter in a at the current pos - for ( int j = 1 ; j < m+1 ; j++ ) { - final byte b_base = b[j-1]; // letter in b at the current pos - double step_diag = sw[i-1][j-1] + wd(a_base,b_base); - double step_down = 0.0 ; - int kd = 0; - for ( int k = 1 ; k < i ; k++ ) { - final double gap_penalty = wk(k); - if ( step_down < sw[i-k][j]+gap_penalty ) { - step_down=sw[i-k][j]+gap_penalty; - kd = k; - } - } - - double step_right = 0; - int ki = 0; - for ( int k = 1 ; k < j ; k++ ) { - final double gap_penalty = wk(k); - if ( step_right < sw[i][j-k]+gap_penalty ) { - step_right=sw[i][j-k]+gap_penalty; - ki = k; - } - } - - if ( step_down > step_right ) { - if ( step_down > step_diag ) { - sw[i][j] = Math.max(0,step_down); - btrack[i][j] = kd; // positive=vertical - } - else { - sw[i][j] = Math.max(0,step_diag); - btrack[i][j] = 0; // 0 = diagonal - } - } else { - // step_down < step_right - if ( step_right > step_diag ) { - sw[i][j] = Math.max(0,step_right); - btrack[i][j] = -ki; // negative = horizontal - } else { - sw[i][j] = Math.max(0,step_diag); - btrack[i][j] = 0; // 0 = diagonal - } - } - sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); - } - } - -// print(sw,a,b); - - PrimitivePair.Int p = new PrimitivePair.Int(); - double maxscore = 0.0; - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) - - // look for largest score. we use >= combined with the traversal direction - // to ensure that if two scores are equal, the one closer to diagonal gets picked - for ( int i = 1 ; i < n+1 ; i++ ) { - if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; } - } + // build smith-waterman matrix and keep backtrack info: + for ( int i = 1 ; i < n+1 ; i++ ) { + byte a_base = a[i-1]; // letter in a at the current pos for ( int j = 1 ; j < m+1 ; j++ ) { - if ( sw[n][j] > maxscore || - sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) { - p.first = n; - p.second = j ; - maxscore = sw[n][j]; - segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment - } - } - - // System.out.println("\ni="+p.first+"; j="+p.second); - - // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order - - - // we will be placing all insertions and deletions into sequence b, so the state are named w/regard - // to that sequence - - int state = MSTATE; - - List lce = new ArrayList(5); - - do { - - int btr = btrack[p.first][p.second]; - int step_left = ( btr < 0 ? -btr : 1); - int step_up = ( btr > 0 ? btr : 1 ); - - int new_state; - if ( btr > 0 ) new_state = DSTATE; - else if ( btr < 0 ) new_state = ISTATE; - else new_state = MSTATE; - - int step_length = 1; - - // move to next best location in the sw matrix: - switch( new_state ) { - case MSTATE: p.first--; p.second--; break; - case ISTATE: p.second-=step_left; step_length = step_left; break; - case DSTATE: p.first-=step_up; step_length = step_up; break; - } - - // now let's see if the state actually changed: - if ( new_state == state ) segment_length+=step_length; - else { - // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). - CigarOperator o=null; - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; + final byte b_base = b[j-1]; // letter in b at the current pos + double step_diag = sw[i-1][j-1] + wd(a_base,b_base); + double step_down = 0.0 ; + int kd = 0; + for ( int k = 1 ; k < i ; k++ ) { + final double gap_penalty = wk(k); + if ( step_down < sw[i-k][j]+gap_penalty ) { + step_down=sw[i-k][j]+gap_penalty; + kd = k; } - CigarElement e = new CigarElement(segment_length,o); - lce.add(e); - segment_length = step_length; - state = new_state; } - } while ( sw[p.first][p.second] != 0 ); - // post-process the last segment we are still keeping - CigarOperator o=null; - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; + double step_right = 0; + int ki = 0; + for ( int k = 1 ; k < j ; k++ ) { + final double gap_penalty = wk(k); + if ( step_right < sw[i][j-k]+gap_penalty ) { + step_right=sw[i][j-k]+gap_penalty; + ki = k; + } + } + + if ( step_down > step_right ) { + if ( step_down > step_diag ) { + sw[i][j] = Math.max(0,step_down); + btrack[i][j] = kd; // positive=vertical + } + else { + sw[i][j] = Math.max(0,step_diag); + btrack[i][j] = 0; // 0 = diagonal + } + } else { + // step_down < step_right + if ( step_right > step_diag ) { + sw[i][j] = Math.max(0,step_right); + btrack[i][j] = -ki; // negative = horizontal + } else { + sw[i][j] = Math.max(0,step_diag); + btrack[i][j] = 0; // 0 = diagonal + } + } + + sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); } - alignment_offset = p.first - p.second; - segment_length+=p.second; - CigarElement e = new CigarElement(segment_length,o); - lce.add(e); - Collections.reverse(lce); - alignmentCigar = new Cigar(lce); + } +// print(sw,a,b); + } + + private void calculateCigar(int n, int m, double [][] sw, int [][] btrack) { + // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order + //PrimitivePair.Int p = new PrimitivePair.Int(); + int p1 = 0, p2 = 0; + + double maxscore = 0.0; + int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) + + // look for largest score. we use >= combined with the traversal direction + // to ensure that if two scores are equal, the one closer to diagonal gets picked + for ( int i = 1 ; i < n+1 ; i++ ) { + if ( sw[i][m] >= maxscore ) { + p1 = i; p2 = m ; maxscore = sw[i][m]; + } + } + + for ( int j = 1 ; j < m+1 ; j++ ) { + if ( sw[n][j] > maxscore || sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) { + p1 = n; + p2 = j ; + maxscore = sw[n][j]; + segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment + } + } + + + // we will be placing all insertions and deletions into sequence b, so the state are named w/regard + // to that sequence + + int state = MSTATE; + List lce = new ArrayList(5); + + do { + int btr = btrack[p1][p2]; + int step_left = ( btr < 0 ? -btr : 1); + int step_up = ( btr > 0 ? btr : 1 ); + + int new_state; + if ( btr > 0 ) new_state = DSTATE; + else if ( btr < 0 ) new_state = ISTATE; + else new_state = MSTATE; + + int step_length = 1; + + // move to next best location in the sw matrix: + switch( new_state ) { + case MSTATE: p1--; p2--; break; + case ISTATE: p2-=step_left; step_length = step_left; break; + case DSTATE: p1-=step_up; step_length = step_up; break; + } + + // now let's see if the state actually changed: + if ( new_state == state ) segment_length+=step_length; + else { + // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). + lce.add(makeElement(state, segment_length)); + segment_length = step_length; + state = new_state; + } + } while ( sw[p1][p2] != 0 ); + + // post-process the last segment we are still keeping + lce.add(makeElement(state, segment_length + p2)); + alignment_offset = p1 - p2; + + Collections.reverse(lce); + alignmentCigar = new Cigar(lce); + } + + private CigarElement makeElement(int state, int segment_length) { + CigarOperator o = null; + switch(state) { + case MSTATE: o = CigarOperator.M; break; + case ISTATE: o = CigarOperator.I; break; + case DSTATE: o = CigarOperator.D; break; + } + return new CigarElement(segment_length,o); } private double wd(byte x, byte y) { diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index 5f3b795f7..3f4c602d6 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -27,7 +27,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest { String filename1 = "NA12878.chrom1.SLX.SRP000032.2009_06"; String filename2 = "low_coverage_CEU.chr1.10k-11k"; WalkerTestSpec spec3 = new WalkerTestSpec( - "-T IndelRealigner -nway -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + filename1 + ".bam -I " + validationDataLocation + filename2 + ".bam -L 1:10023900-10024000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O /tmp -snps %s", + "-T IndelRealigner --cleanPerfectMatches -nway -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + filename1 + ".bam -I " + validationDataLocation + filename2 + ".bam -L 1:10023900-10024000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O /tmp -snps %s", 1, Arrays.asList("bd42a4fa66d7ec7a480c2b94313a78d3")); File file1 = new File("/tmp/" + filename1 + ".cleaned.bam");