Change SmithWaterman to use integers instead of doubles.

This commit is contained in:
Nigel Delaney 2014-05-25 17:12:00 -04:00
parent cfde6e72bf
commit cc45e62e8e
10 changed files with 174 additions and 200 deletions

View File

@ -339,18 +339,18 @@ public class HaplotypeResolver extends RodWalker<Integer, Integer> {
return overlap;
}
private static final double SW_MATCH = 4.0;
private static final double SW_MISMATCH = -10.0;
private static final double SW_GAP = -25.0;
private static final double SW_GAP_EXTEND = -1.3;
private static final double SW_EPSILON = 0.00001;
private static final int SW_MATCH = 40;
private static final int SW_MISMATCH = -100;
private static final int SW_GAP = -250;
private static final int SW_GAP_EXTEND = -13;
private void resolveByHaplotype(final ReferenceContext refContext) {
final byte[] source1Haplotype = generateHaplotype(sourceVCs1, refContext);
final byte[] source2Haplotype = generateHaplotype(sourceVCs2, refContext);
final SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND, SW_EPSILON );
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND, SW_EPSILON );
final SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
// protect against SW failures
if( swConsensus1.getCigar().toString().contains("S") || swConsensus1.getCigar().getReferenceLength() < 20 ||

View File

@ -329,7 +329,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
// fraction of mismatches that need to no longer mismatch for a column to be considered cleaned
private static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75;
private final static Parameters swParameters = new Parameters(30.0, -10.0, -10.0, -2.0,0.0001);
private final static Parameters swParameters = new Parameters(30, -10, -10, -2);
// reference base padding size
// TODO -- make this a command-line argument if the need arises

View File

@ -203,7 +203,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
}
public Map<Integer,VariantContext> calcAlignment() {
final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap, new Parameters(1.0,-1.0/3.0,-1 - 1./3., -1./3.,0.00001));
final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap, new Parameters(3,-1,-4, -1));
final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar());
return HaplotypeCallerGenotypingEngine.generateVCsFromAlignment(h, ref, genomeLocParser.createGenomeLoc("4", 1, 1 + ref.length), "name");
}

View File

@ -46,6 +46,10 @@
package org.broadinstitute.gatk.utils.smithwaterman;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.broadinstitute.gatk.utils.sam.CigarUtils;
import org.broadinstitute.gatk.utils.BaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
@ -78,16 +82,16 @@ public class SWPairwiseAlignmentUnitTest extends BaseTest {
final String ref1 = "AAAGACTACTG";
final String read1 = "AACGGACACTG";
tests.add(new Object[]{ref1, read1, 5.0, -10.0, -22.0, -1.2, 0.0001, 1, "2M2I3M1D4M"});
tests.add(new Object[]{ref1, read1, 20.0, -5.0, -30.0, -2.2, 0.0001, 0, "11M"});
tests.add(new Object[]{ref1, read1, 50, -100, -220, -12, 1, "2M2I3M1D4M"});
tests.add(new Object[]{ref1, read1, 200, -50, -300, -22, 0, "11M"});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "OddNoAlignment", enabled = true)
public void testOddNoAlignment(final String reference, final String read, final double match, final double mismatch, final double gap, final double gap_extend, final double epsilon,
public void testOddNoAlignment(final String reference, final String read, final int match, final int mismatch, final int gap, final int gap_extend,
final int expectedStart, final String expectedCigar) {
final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes(), match, mismatch, gap, gap_extend, epsilon);
final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes(), match, mismatch, gap, gap_extend);
Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart);
Assert.assertEquals(sw.getCigar().toString(), expectedCigar);
}
@ -117,4 +121,46 @@ public class SWPairwiseAlignmentUnitTest extends BaseTest {
Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart);
Assert.assertEquals(sw.getCigar().toString(), expectedCigar);
}
@Test(enabled = true)
public void testForIdenticalAlignmentsWithDifferingFlankLengths() {
//This test is designed to ensure that the indels are correctly placed
//if the region flanking these indels is extended by a varying amount.
//It checks for problems caused by floating point rounding leading to different
//paths being selected.
//Create two versions of the same sequence with different flanking regions.
byte[] paddedRef="GCGTCGCAGTCTTAAGGCCCCGCCTTTTCAGACAGCTTCCGCTGGGCCTGGGCCGCTGCGGGGCGGTCACGGCCCCTTTAAGCCTGAGCCCCGCCCCCTGGCTCCCCGCCCCCTCTTCTCCCCTCCCCCAAGCCAGCACCTGGTGCCCCGGCGGGTCGTGCGGCGCGGCGCTCCGCGGTGAGCGCCTGACCCCGAGGGGGCCCGGGGCCGCGTCCCTGGGCCCTCCCCACCCTTGCGGTGGCCTCGCGGGTCCCAGGGGCGGGGCTGGAGCGGCAGCAGGGCCGGGGAGATGGGCGGTGGGGAGCGCGGGAGGGACCGGGCCGAGCCGGGGGAAGGGCTCCGGTGACT".getBytes();
byte[] paddedHap="GCGTCGCAGTCTTAAGGCCCCGCCTTTTCAGACAGCTTCCGCTGGGCCTGGGCCGCTGCGGGGCGGTCACGGCCCCTTTAAGCCTGAGCCCCGCCCCCTGGCTCCCCGCCCCCTCTTCTCCCCTCCCCCAAGCCAGCACCTGGTGCCCCGGCGGGTCGTGCGGCGCGGCGCTCCGCGGTGAGCGCCTGACCCCGA--GGGCC---------------GGGCCCTCCCCACCCTTGCGGTGGCCTCGCGGGTCCCAGGGGCGGGGCTGGAGCGGCAGCAGGGCCGGGGAGATGGGCGGTGGGGAGCGCGGGAGGGACCGGGCCGAGCCGGGGGAAGGGCTCCGGTGACT".replace("-","").getBytes();
byte[] notPaddedRef= "CTTTAAGCCTGAGCCCCGCCCCCTGGCTCCCCGCCCCCTCTTCTCCCCTCCCCCAAGCCAGCACCTGGTGCCCCGGCGGGTCGTGCGGCGCGGCGCTCCGCGGTGAGCGCCTGACCCCGAGGGGGCCCGGGGCCGCGTCCCTGGGCCCTCCCCACCCTTGCGGTGGCCTCGCGGGTCCCAGGGGCGGGGCTGGAGCGGCAGCAGGGCCGGGGAGATGGGCGGTGGGGAGCGCGGGAGGGA".getBytes();
byte[] notPaddedHap= "CTTTAAGCCTGAGCCCCGCCCCCTGGCTCCCCGCCCCCTCTTCTCCCCTCCCCCAAGCCAGCACCTGGTGCCCCGGCGGGTCGTGCGGCGCGGCGCTCCGCGGTGAGCGCCTGACCCCGA---------GGGCC--------GGGCCCTCCCCACCCTTGCGGTGGCCTCGCGGGTCCCAGGGGCGGGGCTGGAGCGGCAGCAGGGCCGGGGAGATGGGCGGTGGGGAGCGCGGGAGGGA".replace("-","").getBytes();
//a simplified version of the getCigar routine in the haplotype caller to align these
final String SW_PAD = "NNNNNNNNNN";
final String paddedsRef = SW_PAD + new String(paddedRef) + SW_PAD;
final String paddedsHap = SW_PAD + new String(paddedHap) + SW_PAD;
final String notPaddedsRef = SW_PAD + new String(notPaddedRef) + SW_PAD;
final String notpaddedsHap = SW_PAD + new String(notPaddedHap) + SW_PAD;
final SmithWaterman paddedAlignment = new SWPairwiseAlignment( paddedsRef.getBytes(), paddedsHap.getBytes(), CigarUtils.NEW_SW_PARAMETERS );
final SmithWaterman notPaddedAlignment = new SWPairwiseAlignment( notPaddedsRef.getBytes(), notpaddedsHap.getBytes(), CigarUtils.NEW_SW_PARAMETERS );
//Now verify that the two sequences have the same alignment and not match positions.
Cigar rawPadded = paddedAlignment.getCigar();
Cigar notPadded= notPaddedAlignment.getCigar();
List<CigarElement> paddedC=rawPadded.getCigarElements();
List<CigarElement> notPaddedC=notPadded.getCigarElements();
Assert.assertEquals(paddedC.size(),notPaddedC.size());
for(int i=0;i<notPaddedC.size();i++)
{
CigarElement pc=paddedC.get(i);
CigarElement npc=notPaddedC.get(i);
if(pc.getOperator()== CigarOperator.M && npc.getOperator()==CigarOperator.M)
{
continue;
}
int l1=pc.getLength();
int l2=npc.getLength();
Assert.assertEquals(l1,l2);
Assert.assertEquals(pc.getOperator(),npc.getOperator());
}
}
}

View File

@ -172,7 +172,7 @@ public class CigarUtils {
// used in the bubble state machine to apply Smith-Waterman to the bubble sequence
// these values were chosen via optimization against the NA12878 knowledge base
public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -15.0, -26.0, -1.1, 0.00001);
public static final Parameters NEW_SW_PARAMETERS = new Parameters(200, -150, -260, -11);
private final static String SW_PAD = "NNNNNNNNNN";

View File

@ -121,35 +121,29 @@ public final class GlobalEdgeGreedySWPairwiseAlignment extends SWPairwiseAlignme
final byte[] refToAlign = Utils.trimArray(reference, forwardEdgeMatch, reverseEdgeMatch);
final byte[] altToAlign = Utils.trimArray(alternate, forwardEdgeMatch, reverseEdgeMatch);
final double[] sw = new double[(sizeOfRefToAlign+1)*(sizeOfAltToAlign+1)];
final int[][] sw = new int[(sizeOfRefToAlign+1)][(sizeOfAltToAlign+1)];
if ( keepScoringMatrix ) SW = sw;
final int[] btrack = new int[(sizeOfRefToAlign+1)*(sizeOfAltToAlign+1)];
final int[][] btrack = new int[(sizeOfRefToAlign+1)][(sizeOfAltToAlign+1)];
calculateMatrix(refToAlign, altToAlign, sw, btrack, OVERHANG_STRATEGY.INDEL);
if ( DEBUG_MODE ) {
System.out.println(new String(refToAlign) + " vs. " + new String(altToAlign));
debugMatrix(sw, sizeOfRefToAlign+1, sizeOfAltToAlign+1);
debugMatrix(sw);
System.out.println("----");
debugMatrix(btrack, sizeOfRefToAlign + 1, sizeOfAltToAlign + 1);
debugMatrix(btrack);
System.out.println();
}
alignmentResult = calculateCigar(forwardEdgeMatch, reverseEdgeMatch, sizeOfRefToAlign, sizeOfAltToAlign, sw, btrack);
alignmentResult = calculateCigar(forwardEdgeMatch, reverseEdgeMatch, sw, btrack);
}
private void debugMatrix(final double[] matrix, final int dim1, final int dim2) {
for ( int i = 0; i < dim1; i++ ) {
for ( int j = 0; j < dim2; j++ )
System.out.print(String.format("%.1f ", matrix[i * dim2 + j]));
System.out.println();
}
}
private void debugMatrix(final int[] matrix, final int dim1, final int dim2) {
for ( int i = 0; i < dim1; i++ ) {
for ( int j = 0; j < dim2; j++ )
System.out.print(matrix[i*dim2 + j] + " ");
private void debugMatrix(final int[][] matrix) {
for ( int i = 0; i < matrix.length; i++ ) {
int [] cur = matrix[i];
for ( int j = 0; j < cur.length; j++ )
System.out.print(cur[j] + " ");
System.out.println();
}
}
@ -194,17 +188,14 @@ public final class GlobalEdgeGreedySWPairwiseAlignment extends SWPairwiseAlignme
*
* @param matchingPrefix the prefix match size
* @param matchingSuffix the suffix match size
* @param refLength length of the reference sequence
* @param altLength length of the alternate sequence
* @param sw the Smith-Waterman matrix to use
* @param btrack the back track matrix to use
* @return non-null SWPairwiseAlignmentResult object
*/
protected SWPairwiseAlignmentResult calculateCigar(final int matchingPrefix, final int matchingSuffix,
final int refLength, final int altLength,
final double[] sw, final int[] btrack) {
final int[][] sw, final int[][] btrack) {
final SWPairwiseAlignmentResult SW_result = calculateCigar(refLength, altLength, sw, btrack, OVERHANG_STRATEGY.INDEL);
final SWPairwiseAlignmentResult SW_result = calculateCigar(sw, btrack, OVERHANG_STRATEGY.INDEL);
final LinkedList<CigarElement> lce = new LinkedList<CigarElement>(SW_result.cigar.getCigarElements());
if ( matchingPrefix > 0 )

View File

@ -35,11 +35,10 @@ package org.broadinstitute.gatk.utils.smithwaterman;
* Time: 12:03 PM
*/
public final class Parameters {
public final double w_match;
public final double w_mismatch;
public final double w_open;
public final double w_extend;
public final double epsilon;
public final int w_match;
public final int w_mismatch;
public final int w_open;
public final int w_extend;
/**
* Create a new set of SW parameters
@ -47,20 +46,17 @@ public final class Parameters {
* @param w_mismatch the mismatch penalty
* @param w_open the gap open penalty
* @param w_extend the gap extension penalty
* @param epsilon weight comparison error
*/
public Parameters(final double w_match, final double w_mismatch, final double w_open, final double w_extend, final double epsilon) {
public Parameters(final int w_match, final int w_mismatch, final int w_open, final int w_extend) {
if ( w_mismatch > 0 ) throw new IllegalArgumentException("w_mismatch must be <= 0 but got " + w_mismatch);
if ( w_open> 0 ) throw new IllegalArgumentException("w_open must be <= 0 but got " + w_open);
if ( w_extend> 0 ) throw new IllegalArgumentException("w_extend must be <= 0 but got " + w_extend);
if ( Double.isNaN(epsilon)) throw new IllegalArgumentException("epsilon cannot be a NaN");
if ( epsilon < 0.0 ) throw new IllegalArgumentException("epsilon cannot be negative");
this.w_match = w_match;
this.w_mismatch = w_mismatch;
this.w_open = w_open;
this.w_extend = w_extend;
this.epsilon = epsilon;
}
}

View File

@ -99,7 +99,7 @@ public class SWPairwiseAlignment implements SmithWaterman {
/**
* The SW scoring matrix, stored for debugging purposes if keepScoringMatrix is true
*/
protected double[] SW = null;
protected int[][] SW = null;
/**
* Only for testing purposes in the SWPairwiseAlignmentMain function
@ -113,10 +113,8 @@ public class SWPairwiseAlignment implements SmithWaterman {
* @deprecated in favor of constructors using the Parameter or ParameterSet class
*/
@Deprecated
public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2,
final double match, final double mismatch, final double open, final double extend,
final double epsilon ) {
this(seq1, seq2, new Parameters(match, mismatch, open, extend, epsilon));
public SWPairwiseAlignment(byte[] seq1, byte[] seq2, int match, int mismatch, int open, int extend ) {
this(seq1, seq2, new Parameters(match, mismatch, open, extend));
}
/**
@ -149,22 +147,6 @@ public class SWPairwiseAlignment implements SmithWaterman {
align(seq1, seq2);
}
/**
* Create a new SW pairwise aligner
*
* After creating the object the two sequences are aligned with an internal call to align(seq1, seq2, parameters, strategy)
*
* @param seq1 the first sequence we want to align
* @param seq2 the second sequence we want to align
* @param parameters the parameters to use
* @param strategy the overhang strategy to use
*/
public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final Parameters parameters, final OVERHANG_STRATEGY strategy) {
this(parameters);
overhang_strategy = strategy;
align(seq1, seq2);
}
/**
* Create a new SW pairwise aligner, without actually doing any alignment yet
*
@ -207,14 +189,14 @@ public class SWPairwiseAlignment implements SmithWaterman {
if ( reference == null || reference.length == 0 || alternate == null || alternate.length == 0 )
throw new IllegalArgumentException("Non-null, non-empty sequences are required for the Smith-Waterman calculation");
final int n = reference.length;
final int m = alternate.length;
double [] sw = new double[(n+1)*(m+1)];
final int n = reference.length+1;
final int m = alternate.length+1;
int[][] sw = new int[n][m];
if ( keepScoringMatrix ) SW = sw;
int [] btrack = new int[(n+1)*(m+1)];
int[][] btrack=new int[n][m];
calculateMatrix(reference, alternate, sw, btrack);
alignmentResult = calculateCigar(n, m, sw, btrack, overhang_strategy); // length of the segment (continuous matches, insertions or deletions)
alignmentResult = calculateCigar(sw, btrack, overhang_strategy); // length of the segment (continuous matches, insertions or deletions)
}
/**
@ -225,7 +207,7 @@ public class SWPairwiseAlignment implements SmithWaterman {
* @param sw the Smith-Waterman matrix to populate
* @param btrack the back track matrix to populate
*/
protected void calculateMatrix(final byte[] reference, final byte[] alternate, double[] sw, int[] btrack) {
protected void calculateMatrix(final byte[] reference, final byte[] alternate, final int[][] sw, final int[][] btrack) {
calculateMatrix(reference, alternate, sw, btrack, overhang_strategy);
}
@ -238,62 +220,54 @@ public class SWPairwiseAlignment implements SmithWaterman {
* @param btrack the back track matrix to populate
* @param overhang_strategy the strategy to use for dealing with overhangs
*/
protected void calculateMatrix(final byte[] reference, final byte[] alternate, double[] sw, int[] btrack, final OVERHANG_STRATEGY overhang_strategy) {
protected void calculateMatrix(final byte[] reference, final byte[] alternate, final int[][] sw, final int[][] btrack, final OVERHANG_STRATEGY overhang_strategy) {
if ( reference.length == 0 || alternate.length == 0 )
throw new IllegalArgumentException("Non-null, non-empty sequences are required for the Smith-Waterman calculation");
final int n = reference.length+1;
final int m = alternate.length+1;
final int ncol = sw[0].length;//alternate.length+1; formerly m
final int nrow = sw.length;// reference.length+1; formerly n
//final double MATRIX_MIN_CUTOFF=-1e100; // never let matrix elements drop below this cutoff
final double MATRIX_MIN_CUTOFF; // never let matrix elements drop below this cutoff
if ( cutoff ) MATRIX_MIN_CUTOFF = 0.0;
else MATRIX_MIN_CUTOFF = -1e100;
final int MATRIX_MIN_CUTOFF; // never let matrix elements drop below this cutoff
if ( cutoff ) MATRIX_MIN_CUTOFF = 0;
else MATRIX_MIN_CUTOFF = (int) -1e8;
final double[] best_gap_v = new double[m+1];
Arrays.fill(best_gap_v, -1.0e40);
final int[] gap_size_v = new int[m+1];
final double[] best_gap_h = new double[n+1];
Arrays.fill(best_gap_h,-1.0e40);
final int[] gap_size_h = new int[n+1];
int lowInitValue=Integer.MIN_VALUE/2;
final int[] best_gap_v = new int[ncol+1];
Arrays.fill(best_gap_v, lowInitValue);
final int[] gap_size_v = new int[ncol+1];
final int[] best_gap_h = new int[nrow+1];
Arrays.fill(best_gap_h,lowInitValue);
final int[] gap_size_h = new int[nrow+1];
// we need to initialize the SW matrix with gap penalties if we want to keep track of indels at the edges of alignments
if ( overhang_strategy == OVERHANG_STRATEGY.INDEL || overhang_strategy == OVERHANG_STRATEGY.LEADING_INDEL ) {
// initialize the first row
sw[1] = parameters.w_open;
double currentValue = parameters.w_open;
for ( int i = 2; i < m; i++ ) {
int[] topRow=sw[0];
topRow[1]=parameters.w_open;
int currentValue = parameters.w_open;
for ( int i = 2; i < topRow.length; i++ ) {
currentValue += parameters.w_extend;
sw[i] = currentValue;
topRow[i]=currentValue;
}
// initialize the first column
sw[m] = parameters.w_open;
sw[1][0]=parameters.w_open;
currentValue = parameters.w_open;
for ( int i = 2; i < n; i++ ) {
for ( int i = 2; i < sw.length; i++ ) {
currentValue += parameters.w_extend;
sw[i*m] = currentValue;
sw[i][0]=currentValue;
}
}
// build smith-waterman matrix and keep backtrack info:
for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop
byte a_base = reference[i-1]; // letter in a at the current pos
final int row_offset = row_offset_1 + m;
// On the entrance into the loop, row_offset_1 is the (linear) offset
// of the first element of row (i-1) and row_offset is the linear offset of the
// start of row i
for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) {
// data_offset_1 is linearized offset of element [i-1][j-1]
int[] curRow=sw[0];
for ( int i = 1; i <sw.length ; i++ ) {
final byte a_base = reference[i-1]; // letter in a at the current pos
final int[] lastRow=curRow;
curRow=sw[i];
final int[] curBackTrackRow=btrack[i];
for ( int j = 1; j < curRow.length; j++) {
final byte b_base = alternate[j-1]; // letter in b at the current pos
// in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base);
final double step_diag = sw[data_offset_1] + wd(a_base,b_base);
final int step_diag = lastRow[j-1] + wd(a_base,b_base);
// optimized "traversal" of all the matrix cells above the current one (i.e. traversing
// all 'step down' events that would end in the current cell. The optimized code
@ -301,11 +275,9 @@ public class SWPairwiseAlignment implements SmithWaterman {
// the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!!
// if a gap (length 1) was just opened above, this is the cost of arriving to the current cell:
double prev_gap = sw[data_offset_1+1]+parameters.w_open;
int prev_gap = lastRow[j]+parameters.w_open;
best_gap_v[j] += parameters.w_extend; // for the gaps that were already opened earlier, extending them by 1 costs w_extend
if ( compareScores(prev_gap,best_gap_v[j]) > 0 ) {
if ( prev_gap > best_gap_v[j] ) {
// opening a gap just before the current cell results in better score than extending by one
// the best previously opened gap. This will hold for ALL cells below: since any gap
// once opened always costs w_extend to extend by another base, we will always get a better score
@ -317,7 +289,7 @@ public class SWPairwiseAlignment implements SmithWaterman {
gap_size_v[j]++;
}
final double step_down = best_gap_v[j] ;
final int step_down = best_gap_v[j] ;
final int kd = gap_size_v[j];
// optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing
@ -325,11 +297,9 @@ public class SWPairwiseAlignment implements SmithWaterman {
// does exactly the same thing as the commented out loop below. IMPORTANT:
// the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!!
final int data_offset = row_offset + j; // linearized offset of element [i][j]
prev_gap = sw[data_offset-1]+parameters.w_open; // what would it cost us to open length 1 gap just to the left from current cell
prev_gap =curRow[j-1] + parameters.w_open; // what would it cost us to open length 1 gap just to the left from current cell
best_gap_h[i] += parameters.w_extend; // previous best gap would cost us that much if extended by another base
if ( compareScores(prev_gap,best_gap_h[i]) > 0 ) {
if ( prev_gap > best_gap_h[i] ) {
// newly opened gap is better (score-wise) than any previous gap with the same row index i; since
// gap penalty is linear with k, this new gap location is going to remain better than any previous ones
best_gap_h[i] = prev_gap;
@ -338,34 +308,26 @@ public class SWPairwiseAlignment implements SmithWaterman {
gap_size_h[i]++;
}
final double step_right = best_gap_h[i];
final int step_right = best_gap_h[i];
final int ki = gap_size_h[i];
if ( compareScores(step_down, step_right) > 0 ) {
if ( compareScores(step_down,step_diag) > 0) {
sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_down);
btrack[data_offset] = kd ; // positive=vertical
} else {
sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag);
btrack[data_offset] = 0; // 0 = diagonal
}
} else {
// step_down <= step_right
if ( compareScores(step_right, step_diag) > 0 ) {
sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_right);
btrack[data_offset] = -ki; // negative = horizontal
} else {
sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag);
btrack[data_offset] = 0; // 0 = diagonal
}
//priority here will be step diagonal, step right, step down
final boolean diagHighestOrEqual = (step_diag >= step_down)
&& (step_diag >= step_right);
if ( diagHighestOrEqual ) {
curRow[j]=Math.max(MATRIX_MIN_CUTOFF,step_diag);
curBackTrackRow[j]=0;
}
else if(step_right>=step_down) { //moving right is the highest
curRow[j]=Math.max(MATRIX_MIN_CUTOFF,step_right);
curBackTrackRow[j]=-ki; // negative = horizontal
}
else {
curRow[j]=Math.max(MATRIX_MIN_CUTOFF,step_down);
curBackTrackRow[j]= kd; // positive=vertical
}
}
// IMPORTANT, IMPORTANT, IMPORTANT:
// note that we update this (secondary) outer loop variable here,
// so that we DO NOT need to update it
// in the for() statement itself.
row_offset_1 = row_offset;
}
}
@ -381,40 +343,22 @@ public class SWPairwiseAlignment implements SmithWaterman {
}
}
/**
* Compares alternative (sub)alignments scores considering
* a round-off error: {@link #parameters#epsilon}.
*
* @param score1 first score.
* @param score2 second score.
*
* @return -1 if {@code score1} is less than {@code score2}, 0 if the are the same, 1 if {@code score2} is
* greater than {@code score1}.
*/
private int compareScores(final double score1, final double score2) {
if (score1 == score2)
return 0;
else if (score1 < score2)
return score2 - score1 > parameters.epsilon ? -1 : 0;
else
return score1 - score2 > parameters.epsilon ? +1 : 0;
}
/**
* Calculates the CIGAR for the alignment from the back track matrix
*
* @param refLength length of the reference sequence
* @param altLength length of the alternate sequence
* @param sw the Smith-Waterman matrix to use
* @param btrack the back track matrix to use
* @param overhang_strategy the strategy to use for dealing with overhangs
* @return non-null SWPairwiseAlignmentResult object
*/
protected SWPairwiseAlignmentResult calculateCigar(final int refLength, final int altLength, final double[] sw, final int[] btrack, final OVERHANG_STRATEGY overhang_strategy) {
protected SWPairwiseAlignmentResult calculateCigar(final int[][] sw, final int[][] btrack, final OVERHANG_STRATEGY overhang_strategy) {
// p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
int p1 = 0, p2 = 0;
double maxscore = Double.NEGATIVE_INFINITY; // sw scores are allowed to be negative
int refLength = sw.length-1;
int altLength = sw[0].length-1;
int maxscore = Integer.MIN_VALUE; // sw scores are allowed to be negative
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// if we want to consider overhangs as legitimate operators, then just start from the corner of the matrix
@ -424,31 +368,34 @@ public class SWPairwiseAlignment implements SmithWaterman {
} else {
// look for the largest score on the rightmost column. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1, data_offset = altLength+1+altLength ; i < refLength+1 ; i++, data_offset += (altLength+1) ) {
// data_offset is the offset of [i][m]
//Note: this is not technically smith-waterman, as by only looking for max values on the right we are
//excluding high scoring local alignments
p2=altLength;
if ( compareScores(sw[data_offset],maxscore) >= 0 ) { // sw[data_offset] >= maxscore
p1 = i; p2 = altLength ; maxscore = sw[data_offset];
}
for(int i=1;i<sw.length;i++) {
final int curScore = sw[i][altLength];
if (curScore >= maxscore ) {
p1 = i;
maxscore = curScore;
}
}
// now look for a larger score on the bottom-most row
if ( overhang_strategy != OVERHANG_STRATEGY.LEADING_INDEL ) {
for ( int j = 1, data_offset = refLength*(altLength+1)+1 ; j < altLength+1 ; j++, data_offset++ ) {
final int[] bottomRow=sw[refLength];
for ( int j = 1 ; j < bottomRow.length; j++) {
int curScore=bottomRow[j];
// data_offset is the offset of [n][j]
final int scoreComparison = compareScores(sw[data_offset],maxscore);
if ( scoreComparison > 0 || scoreComparison == 0 && Math.abs(refLength-j) < Math.abs(p1 - p2)) {
if ( curScore > maxscore ||
(curScore == maxscore && Math.abs(refLength-j) < Math.abs(p1 - p2) ) ) {
p1 = refLength;
p2 = j ;
maxscore = sw[data_offset];
maxscore = curScore;
segment_length = altLength - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
}
}
final List<CigarElement> lce = new ArrayList<CigarElement>(5);
if ( segment_length > 0 && overhang_strategy == OVERHANG_STRATEGY.SOFTCLIP ) {
lce.add(makeElement(State.CLIP, segment_length));
segment_length = 0;
@ -458,14 +405,10 @@ public class SWPairwiseAlignment implements SmithWaterman {
// to that sequence
State state = State.MATCH;
int data_offset = p1*(altLength+1)+p2; // offset of element [p1][p2]
do {
int btr = btrack[data_offset];
int btr = btrack[p1][p2];
State new_state;
int step_length = 1;
if ( btr > 0 ) {
new_state = State.DELETION;
step_length = btr;
@ -476,9 +419,9 @@ public class SWPairwiseAlignment implements SmithWaterman {
// move to next best location in the sw matrix:
switch( new_state ) {
case MATCH: data_offset -= (altLength+2); p1--; p2--; break; // move back along the diag in the sw matrix
case INSERTION: data_offset -= step_length; p2 -= step_length; break; // move left
case DELETION: data_offset -= (altLength+1)*step_length; p1 -= step_length; break; // move up
case MATCH: p1--; p2--; break; // move back along the diag in the sw matrix
case INSERTION: p2 -= step_length; break; // move left
case DELETION: p1 -= step_length; break; // move up
}
// now let's see if the state actually changed:
@ -537,7 +480,8 @@ public class SWPairwiseAlignment implements SmithWaterman {
return new CigarElement(length, op);
}
private double wd(byte x, byte y) {
private int wd(final byte x, final byte y) {
return (x == y ? parameters.w_match : parameters.w_mismatch);
}

View File

@ -90,19 +90,16 @@ public class SWPairwiseAlignmentMain {
System.exit(1);
}
double w_match;
double w_mismatch;
double w_open;
double w_extend;
final int w_match, w_mismatch, w_open, w_extend;
w_match = (m == null ? 30.0 : m.doubleValue());
w_mismatch = (mm == null ? -10.0 : mm.doubleValue());
w_open = (open == null ? -10.0 : open.doubleValue());
w_extend = (ext == null ? -2.0 : ext.doubleValue());
w_match = (m == null ? 30 : m.intValue());
w_mismatch = (mm == null ? -10 : mm.intValue());
w_open = (open == null ? -10 : open.intValue());
w_extend = (ext == null ? -2 : ext.intValue());
SWPairwiseAlignment.keepScoringMatrix = true;
SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend, 0.0001);
SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend);
System.out.println("start="+a.getAlignmentStart2wrt1()+", cigar="+a.getCigar()+
" length1="+ref.length()+" length2="+read.length());
@ -117,19 +114,19 @@ public class SWPairwiseAlignmentMain {
}
}
private static void print(double[] s, byte[] a, byte[] b) {
private static void print(final int[][] s, final byte[] a, final byte[] b) {
int n = a.length+1;
int m = b.length+1;
System.out.print(" ");
for ( int j = 1 ; j < m ; j++) System.out.printf(" %5c",(char)b[j-1]) ;
System.out.println();
for ( int i = 0, row_offset = 0 ; i < n ; i++, row_offset+=m) {
for ( int i = 0 ; i < n ; i++) {
if ( i > 0 ) System.out.print((char)a[i-1]);
else System.out.print(' ');
System.out.print(" ");
for ( int j = 0; j < m ; j++ ) {
System.out.printf(" %5.1f",s[row_offset+j]);
System.out.printf(" %5.1f",s[i][j]);
}
System.out.println();
}

View File

@ -34,12 +34,12 @@ package org.broadinstitute.gatk.utils.smithwaterman;
*/
public enum SWParameterSet {
// match=1, mismatch = -1/3, gap=-(1+k/3)
ORIGINAL_DEFAULT(new Parameters(1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0,0.00001)),
ORIGINAL_DEFAULT(new Parameters(3,-1,-4,-3)),
/**
* A standard set of values for NGS alignments
*/
STANDARD_NGS(new Parameters(5.0, -10.0, -22.0, -1.2,0.00001));
STANDARD_NGS(new Parameters(25, -50, -110, -6));
protected Parameters parameters;