Reorganization of SW code for clarity. Totally failure at raw optimization. Discovered that ~50% of reads being cleaned were perfect reference matches. New code comes with flag to look at NM field and not clean perfect matches. Can we turned off with command line option (needed for 1KG bams with bad NM fields). Going to rerun cleaning jobs due to accidentally rebuilding of stable codebase and loss of 2 days of runtime.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3452 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-05-27 23:16:00 +00:00
parent e1b0aefb29
commit f2e7582cfc
4 changed files with 184 additions and 144 deletions

View File

@ -72,7 +72,7 @@ public class TinyFragmentFilter implements SamRecordFilter {
long mateStart = rec.getMateAlignmentStart();
long mateEnd = rec.getAlignmentStart() + isize;
boolean bad = rec.getReadNegativeStrandFlag() ? start < mateStart : end > mateEnd;
//System.out.printf("%s %d %d %d %d %d => %b%n", rec.getReadName(), start, end, mateStart, mateEnd, isize, bad);
System.out.printf("%s %d %d %d %d %d => %b%n", rec.getReadName(), start, end, mateStart, mateEnd, isize, bad);
if ( bad ) {
//System.out.printf("TinyFragment: " + rec.format());
return true;
@ -81,4 +81,4 @@ public class TinyFragmentFilter implements SamRecordFilter {
}
}
}
}
}

View File

@ -75,6 +75,9 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
@Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]")
protected Integer compressionLevel = 5;
@Argument(fullName="cleanPerfectMatches", shortName="cpm", required=false, doc="If true, Realigner will ignore the NM == 0 flag and include reads with supposely no mismatches to reference for cleaning. Useful for malformed BAM files")
protected boolean CLEAN_PERFECT_MATCHES = false;
public enum RealignerSortingStrategy {
NO_SORT,
ON_DISK,
@ -284,6 +287,30 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
}
}
/**
* returns true if a read has only a single cigar element (indicating its xM) and it has a NM flag
* and NM == 0.
* @param read
* @return
*/
private boolean perfectlyMatchesReference(SAMRecord read) {
if ( CLEAN_PERFECT_MATCHES ) {
return false;
} else {
boolean cigarIsMatches = read.getCigar().numCigarElements() == 1;
Integer NM = read.getIntegerAttribute("NM");
boolean noMM = NM != null && NM == 0;
boolean perfectMatch = cigarIsMatches && noMM;
// if ( perfectMatch ) {
// System.out.println("Perfect match " + read.format());
// }
return perfectMatch;
}
}
int nPerfectMatches = 0;
int nReadsToClean = 0;
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
if ( currentInterval == null ) {
emit(read);
@ -310,11 +337,13 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
}
else if ( readLoc.overlapsP(currentInterval) ) {
if ( read.getReadUnmappedFlag() ||
read.getNotPrimaryAlignmentFlag() ||
read.getMappingQuality() == 0 ||
read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) {
read.getNotPrimaryAlignmentFlag() ||
read.getMappingQuality() == 0 ||
read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) {
readsNotToClean.add(read);
} else {
}
else {
nReadsToClean++;
readsToClean.add(read, ref.getBases());
// add the rods to the list of known variants
populateKnownIndels(metaDataTracker, null);
@ -485,6 +514,17 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
continue;
}
// optimization to avoid trying to clean perfect matches to the reference
if ( perfectlyMatchesReference(read) ) {
refReads.add(read);
nPerfectMatches++;
if ( nPerfectMatches % 10000 == 0 ) {
logger.debug(String.format("Perfect matching fraction: %d %d => %.2f%n", nPerfectMatches, nReadsToClean, 100.0 * nPerfectMatches / ( nReadsToClean + 1)));
}
continue;
}
final AlignedRead aRead = new AlignedRead(read);
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The Broad Institute
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -12,15 +12,14 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
@ -33,8 +32,6 @@ import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
/**
* Created by IntelliJ IDEA.
* User: asivache
@ -76,144 +73,147 @@ public class SWPairwiseAlignment {
public int getAlignmentStart2wrt1() { return alignment_offset; }
public void align(final byte[] a, final byte[] b) {
final int n = a.length;
final int m = b.length;
double [][] sw = new double[n+1][m+1];
final int n = a.length;
final int m = b.length;
double [][] sw = new double[n+1][m+1];
int [][] btrack = new int[n+1][m+1];
calculateMatrix(a, b, sw, btrack);
calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions)
}
int [][] btrack = new int[n+1][m+1];
private void calculateMatrix(final byte[] a, final byte[] b, double [][] sw, int [][] btrack ) {
final int n = a.length;
final int m = b.length;
// build smith-waterman matrix and keep backtrack info:
for ( int i = 1 ; i < n+1 ; i++ ) {
byte a_base = a[i-1]; // letter in a at the current pos
for ( int j = 1 ; j < m+1 ; j++ ) {
final byte b_base = b[j-1]; // letter in b at the current pos
double step_diag = sw[i-1][j-1] + wd(a_base,b_base);
double step_down = 0.0 ;
int kd = 0;
for ( int k = 1 ; k < i ; k++ ) {
final double gap_penalty = wk(k);
if ( step_down < sw[i-k][j]+gap_penalty ) {
step_down=sw[i-k][j]+gap_penalty;
kd = k;
}
}
double step_right = 0;
int ki = 0;
for ( int k = 1 ; k < j ; k++ ) {
final double gap_penalty = wk(k);
if ( step_right < sw[i][j-k]+gap_penalty ) {
step_right=sw[i][j-k]+gap_penalty;
ki = k;
}
}
if ( step_down > step_right ) {
if ( step_down > step_diag ) {
sw[i][j] = Math.max(0,step_down);
btrack[i][j] = kd; // positive=vertical
}
else {
sw[i][j] = Math.max(0,step_diag);
btrack[i][j] = 0; // 0 = diagonal
}
} else {
// step_down < step_right
if ( step_right > step_diag ) {
sw[i][j] = Math.max(0,step_right);
btrack[i][j] = -ki; // negative = horizontal
} else {
sw[i][j] = Math.max(0,step_diag);
btrack[i][j] = 0; // 0 = diagonal
}
}
sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
}
}
// print(sw,a,b);
PrimitivePair.Int p = new PrimitivePair.Int();
double maxscore = 0.0;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1 ; i < n+1 ; i++ ) {
if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; }
}
// build smith-waterman matrix and keep backtrack info:
for ( int i = 1 ; i < n+1 ; i++ ) {
byte a_base = a[i-1]; // letter in a at the current pos
for ( int j = 1 ; j < m+1 ; j++ ) {
if ( sw[n][j] > maxscore ||
sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) {
p.first = n;
p.second = j ;
maxscore = sw[n][j];
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
// System.out.println("\ni="+p.first+"; j="+p.second);
// p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
// we will be placing all insertions and deletions into sequence b, so the state are named w/regard
// to that sequence
int state = MSTATE;
List<CigarElement> lce = new ArrayList<CigarElement>(5);
do {
int btr = btrack[p.first][p.second];
int step_left = ( btr < 0 ? -btr : 1);
int step_up = ( btr > 0 ? btr : 1 );
int new_state;
if ( btr > 0 ) new_state = DSTATE;
else if ( btr < 0 ) new_state = ISTATE;
else new_state = MSTATE;
int step_length = 1;
// move to next best location in the sw matrix:
switch( new_state ) {
case MSTATE: p.first--; p.second--; break;
case ISTATE: p.second-=step_left; step_length = step_left; break;
case DSTATE: p.first-=step_up; step_length = step_up; break;
}
// now let's see if the state actually changed:
if ( new_state == state ) segment_length+=step_length;
else {
// state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
CigarOperator o=null;
switch(state) {
case MSTATE: o = CigarOperator.M; break;
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
final byte b_base = b[j-1]; // letter in b at the current pos
double step_diag = sw[i-1][j-1] + wd(a_base,b_base);
double step_down = 0.0 ;
int kd = 0;
for ( int k = 1 ; k < i ; k++ ) {
final double gap_penalty = wk(k);
if ( step_down < sw[i-k][j]+gap_penalty ) {
step_down=sw[i-k][j]+gap_penalty;
kd = k;
}
CigarElement e = new CigarElement(segment_length,o);
lce.add(e);
segment_length = step_length;
state = new_state;
}
} while ( sw[p.first][p.second] != 0 );
// post-process the last segment we are still keeping
CigarOperator o=null;
switch(state) {
case MSTATE: o = CigarOperator.M; break;
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
double step_right = 0;
int ki = 0;
for ( int k = 1 ; k < j ; k++ ) {
final double gap_penalty = wk(k);
if ( step_right < sw[i][j-k]+gap_penalty ) {
step_right=sw[i][j-k]+gap_penalty;
ki = k;
}
}
if ( step_down > step_right ) {
if ( step_down > step_diag ) {
sw[i][j] = Math.max(0,step_down);
btrack[i][j] = kd; // positive=vertical
}
else {
sw[i][j] = Math.max(0,step_diag);
btrack[i][j] = 0; // 0 = diagonal
}
} else {
// step_down < step_right
if ( step_right > step_diag ) {
sw[i][j] = Math.max(0,step_right);
btrack[i][j] = -ki; // negative = horizontal
} else {
sw[i][j] = Math.max(0,step_diag);
btrack[i][j] = 0; // 0 = diagonal
}
}
sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
}
alignment_offset = p.first - p.second;
segment_length+=p.second;
CigarElement e = new CigarElement(segment_length,o);
lce.add(e);
Collections.reverse(lce);
alignmentCigar = new Cigar(lce);
}
// print(sw,a,b);
}
private void calculateCigar(int n, int m, double [][] sw, int [][] btrack) {
// p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
//PrimitivePair.Int p = new PrimitivePair.Int();
int p1 = 0, p2 = 0;
double maxscore = 0.0;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1 ; i < n+1 ; i++ ) {
if ( sw[i][m] >= maxscore ) {
p1 = i; p2 = m ; maxscore = sw[i][m];
}
}
for ( int j = 1 ; j < m+1 ; j++ ) {
if ( sw[n][j] > maxscore || sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) {
p1 = n;
p2 = j ;
maxscore = sw[n][j];
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
// we will be placing all insertions and deletions into sequence b, so the state are named w/regard
// to that sequence
int state = MSTATE;
List<CigarElement> lce = new ArrayList<CigarElement>(5);
do {
int btr = btrack[p1][p2];
int step_left = ( btr < 0 ? -btr : 1);
int step_up = ( btr > 0 ? btr : 1 );
int new_state;
if ( btr > 0 ) new_state = DSTATE;
else if ( btr < 0 ) new_state = ISTATE;
else new_state = MSTATE;
int step_length = 1;
// move to next best location in the sw matrix:
switch( new_state ) {
case MSTATE: p1--; p2--; break;
case ISTATE: p2-=step_left; step_length = step_left; break;
case DSTATE: p1-=step_up; step_length = step_up; break;
}
// now let's see if the state actually changed:
if ( new_state == state ) segment_length+=step_length;
else {
// state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
lce.add(makeElement(state, segment_length));
segment_length = step_length;
state = new_state;
}
} while ( sw[p1][p2] != 0 );
// post-process the last segment we are still keeping
lce.add(makeElement(state, segment_length + p2));
alignment_offset = p1 - p2;
Collections.reverse(lce);
alignmentCigar = new Cigar(lce);
}
private CigarElement makeElement(int state, int segment_length) {
CigarOperator o = null;
switch(state) {
case MSTATE: o = CigarOperator.M; break;
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
}
return new CigarElement(segment_length,o);
}
private double wd(byte x, byte y) {

View File

@ -27,7 +27,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
String filename1 = "NA12878.chrom1.SLX.SRP000032.2009_06";
String filename2 = "low_coverage_CEU.chr1.10k-11k";
WalkerTestSpec spec3 = new WalkerTestSpec(
"-T IndelRealigner -nway -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + filename1 + ".bam -I " + validationDataLocation + filename2 + ".bam -L 1:10023900-10024000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O /tmp -snps %s",
"-T IndelRealigner --cleanPerfectMatches -nway -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + filename1 + ".bam -I " + validationDataLocation + filename2 + ".bam -L 1:10023900-10024000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O /tmp -snps %s",
1,
Arrays.asList("bd42a4fa66d7ec7a480c2b94313a78d3"));
File file1 = new File("/tmp/" + filename1 + ".cleaned.bam");