gatk-3.8/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java

838 lines
35 KiB
Java
Executable File

/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Mar 23, 2009
* Time: 1:54:54 PM
* To change this template use File | Settings | File Templates.
*/
public class SWPairwiseAlignment {
private int alignment_offset; // offset of s2 w/respect to s1
private Cigar alignmentCigar;
private final double w_match;
private final double w_mismatch;
private final double w_open;
private final double w_extend;
private static final int MSTATE = 0;
private static final int ISTATE = 1;
private static final int DSTATE = 2;
private static final int CLIP = 3;
private static boolean cutoff = false;
private static boolean DO_SOFTCLIP = true;
double[] SW;
// private double [] best_gap_v ;
// private int [] gap_size_v ;
// private double [] best_gap_h ;
// private int [] gap_size_h ;
// private static double [][] sw = new double[500][500];
// private static int [][] btrack = new int[500][500];
// ************************************************************************
// **** IMPORTANT NOTE: ****
// **** This class assumes that all bytes come from UPPERCASED chars! ****
// ************************************************************************
public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) {
w_match = match;
w_mismatch = mismatch;
w_open = open;
w_extend = extend;
align(seq1,seq2);
}
public SWPairwiseAlignment(byte[] seq1, byte[] seq2) {
this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3)
}
public Cigar getCigar() { return alignmentCigar ; }
public int getAlignmentStart2wrt1() { return alignment_offset; }
public void align(final byte[] a, final byte[] b) {
final int n = a.length;
final int m = b.length;
double [] sw = new double[(n+1)*(m+1)];
SW = sw;
int [] btrack = new int[(n+1)*(m+1)];
// best_gap_v = new double[m+1];
// Arrays.fill(best_gap_v,-1.0e40);
// gap_size_v = new int[m+1];
// best_gap_h = new double[n+1];
// Arrays.fill(best_gap_h,-1.0e40);
// gap_size_h = new int[n+1];
calculateMatrix(a, b, sw, btrack);
calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions)
}
private void calculateMatrix(final byte[] a, final byte[] b, double [] sw, int [] btrack ) {
final int n = a.length+1;
final int m = b.length+1;
//final double MATRIX_MIN_CUTOFF=-1e100; // never let matrix elements drop below this cutoff
final double MATRIX_MIN_CUTOFF; // never let matrix elements drop below this cutoff
if ( cutoff ) MATRIX_MIN_CUTOFF = 0.0;
else MATRIX_MIN_CUTOFF = -1e100;
double [] best_gap_v = new double[m+1];
Arrays.fill(best_gap_v,-1.0e40);
int [] gap_size_v = new int[m+1];
double [] best_gap_h = new double[n+1];
Arrays.fill(best_gap_h,-1.0e40);
int [] gap_size_h = new int[n+1];
// build smith-waterman matrix and keep backtrack info:
for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop
byte a_base = a[i-1]; // letter in a at the current pos
final int row_offset = row_offset_1 + m;
// On the entrance into the loop, row_offset_1 is the (linear) offset
// of the first element of row (i-1) and row_offset is the linear offset of the
// start of row i
for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) {
// data_offset_1 is linearized offset of element [i-1][j-1]
final byte b_base = b[j-1]; // letter in b at the current pos
// in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base);
double step_diag = sw[data_offset_1] + wd(a_base,b_base);
// optimized "traversal" of all the matrix cells above the current one (i.e. traversing
// all 'step down' events that would end in the current cell. The optimized code
// does exactly the same thing as the commented out loop below. IMPORTANT:
// the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!!
// if a gap (length 1) was just opened above, this is the cost of arriving to the current cell:
double prev_gap = sw[data_offset_1+1]+w_open;
best_gap_v[j] += w_extend; // for the gaps that were already opened earlier, extending them by 1 costs w_extend
if ( prev_gap > best_gap_v[j] ) {
// opening a gap just before the current cell results in better score than extending by one
// the best previously opened gap. This will hold for ALL cells below: since any gap
// once opened always costs w_extend to extend by another base, we will always get a better score
// by arriving to any cell below from the gap we just opened (prev_gap) rather than from the previous best gap
best_gap_v[j] = prev_gap;
gap_size_v[j] = 1; // remember that the best step-down gap from above has length 1 (we just opened it)
} else {
// previous best gap is still the best, even after extension by another base, so we just record that extension:
gap_size_v[j]++;
}
final double step_down = best_gap_v[j] ;
final int kd = gap_size_v[j];
/*
for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) {
// data_offset_k is linearized offset of element [i-k][j]
// in other words, trial = sw[i-k][j]+gap_penalty:
final double trial = sw[data_offset_k]+wk(k);
if ( step_down < trial ) {
step_down=trial;
kd = k;
}
}
*/
// optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing
// all 'step right' events that would end in the current cell. The optimized code
// does exactly the same thing as the commented out loop below. IMPORTANT:
// the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!!
final int data_offset = row_offset + j; // linearized offset of element [i][j]
prev_gap = sw[data_offset-1]+w_open; // what would it cost us to open length 1 gap just to the left from current cell
best_gap_h[i] += w_extend; // previous best gap would cost us that much if extended by another base
if ( prev_gap > best_gap_h[i] ) {
// newly opened gap is better (score-wise) than any previous gap with the same row index i; since
// gap penalty is linear with k, this new gap location is going to remain better than any previous ones
best_gap_h[i] = prev_gap;
gap_size_h[i] = 1;
} else {
gap_size_h[i]++;
}
final double step_right = best_gap_h[i];
final int ki = gap_size_h[i];
/*
for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) {
// data_offset is linearized offset of element [i][j-k]
// in other words, step_right=sw[i][j-k]+gap_penalty;
final double trial = sw[data_offset]+wk(k);
if ( step_right < trial ) {
step_right=trial;
ki = k;
}
}
final int data_offset = row_offset + j; // linearized offset of element [i][j]
*/
if ( step_down > step_right ) {
if ( step_down > step_diag ) {
sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_down);
btrack[data_offset] = kd ; // positive=vertical
} else {
sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag);
btrack[data_offset] = 0; // 0 = diagonal
}
} else {
// step_down <= step_right
if ( step_right > step_diag ) {
sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_right);
btrack[data_offset] = -ki; // negative = horizontal
} else {
sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag);
btrack[data_offset] = 0; // 0 = diagonal
}
}
// sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
}
// IMPORTANT, IMPORTANT, IMPORTANT:
// note that we update this (secondary) outer loop variable here,
// so that we DO NOT need to update it
// in the for() statement itself.
row_offset_1 = row_offset;
}
// print(sw,a,b);
}
private void calculateCigar(int n, int m, double [] sw, int [] btrack) {
// p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
//PrimitivePair.Int p = new PrimitivePair.Int();
int p1 = 0, p2 = 0;
double maxscore = 0.0;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1, data_offset = m+1+m ; i < n+1 ; i++, data_offset += (m+1) ) {
// data_offset is the offset of [i][m]
if ( sw[data_offset] >= maxscore ) {
p1 = i; p2 = m ; maxscore = sw[data_offset];
}
}
for ( int j = 1, data_offset = n*(m+1)+1 ; j < m+1 ; j++, data_offset++ ) {
// data_offset is the offset of [n][j]
if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) {
p1 = n;
p2 = j ;
// maxscore = sw[n][j];
maxscore = sw[data_offset];
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
// System.out.println(" Found max score="+maxscore+" at p1="+p1+ " p2="+p2);
List<CigarElement> lce = new ArrayList<CigarElement>(5);
if ( segment_length > 0 && DO_SOFTCLIP ) {
lce.add(makeElement(CLIP, segment_length));
segment_length = 0;
}
// we will be placing all insertions and deletions into sequence b, so the states are named w/regard
// to that sequence
int state = MSTATE;
int data_offset = p1*(m+1)+p2; // offset of element [p1][p2]
// System.out.println("Backtracking: starts at "+p1+":"+p2+" ("+sw[data_offset]+")");
do {
// int btr = btrack[p1][p2];
int btr = btrack[data_offset];
int new_state;
int step_length = 1;
// System.out.print(" backtrack value: "+btr);
if ( btr > 0 ) {
new_state = DSTATE;
step_length = btr;
} else if ( btr < 0 ) {
new_state = ISTATE;
step_length = (-btr);
} else new_state = MSTATE; // and step_length =1, already set above
// move to next best location in the sw matrix:
switch( new_state ) {
case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in th esw matrix
case ISTATE: data_offset -= step_length; p2 -= step_length; break; // move left
case DSTATE: data_offset -= (m+1)*step_length; p1 -= step_length; break; // move up
}
// System.out.println("; backtracked to p1="+p1+" p2="+p2);
/*
switch( new_state ) {
case MSTATE: System.out.println(" diag (match) to "+ sw[data_offset]); break; // equivalent to p1--; p2--
case ISTATE: System.out.println(" left (insertion, "+step_length+") to "+ sw[data_offset]); break; // equivalent to p2-=step_length;
case DSTATE: System.out.println(" up (deletion, "+step_length+") to "+ sw[data_offset]); break; // equivalent to p1 -= step_up
}
*/
// now let's see if the state actually changed:
if ( new_state == state ) segment_length+=step_length;
else {
// System.out.println(" emitting "+segment_length+makeElement(state,segment_length).getOperator().toString());
// state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
lce.add(makeElement(state, segment_length));
segment_length = step_length;
state = new_state;
}
// next condition is equivalent to while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2:
} while ( p1 > 0 && p2 > 0 );
// post-process the last segment we are still keeping;
// NOTE: if reads "overhangs" the ref on the left (i.e. if p2>0) we are counting
// those extra bases sticking out of the ref into the first cigar element if DO_SOFTCLIP is false;
// otherwise they will be softclipped. For instance,
// if read length is 5 and alignment starts at offset -2 (i.e. read starts before the ref, and only
// last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M if
// DO_SOFTCLIP is false or 2S3M if DO_SOFTCLIP is true.
// The consumers need to check for the alignment offset and deal with it properly.
if (DO_SOFTCLIP ) {
lce.add(makeElement(state, segment_length));
if ( p2> 0 ) lce.add(makeElement(CLIP, p2));
alignment_offset = p1 ;
} else {
lce.add(makeElement(state, segment_length + p2));
alignment_offset = p1 - p2;
}
Collections.reverse(lce);
alignmentCigar = new Cigar(lce);
}
private CigarElement makeElement(int state, int segment_length) {
CigarOperator o = null;
switch(state) {
case MSTATE: o = CigarOperator.M; break;
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
case CLIP: o = CigarOperator.S; break;
}
return new CigarElement(segment_length,o);
}
private double wd(byte x, byte y) {
return (x == y ? w_match : w_mismatch);
}
private double wk(int k) {
return w_open+(k-1)*w_extend; // gap
}
private void print(int[][] s) {
for ( int i = 0 ; i < s.length ; i++) {
for ( int j = 0; j < s[i].length ; j++ ) {
System.out.printf(" %4d",s[i][j]);
}
System.out.println();
}
}
private void print(double[][] s) {
for ( int i = 0 ; i < s.length ; i++) {
for ( int j = 0; j < s[i].length ; j++ ) {
System.out.printf(" %4g",s[i][j]);
}
System.out.println();
}
}
private void print(int[][] s, String a, String b) {
System.out.print(" ");
for ( int j = 1 ; j < s[0].length ; j++) System.out.printf(" %4c",b.charAt(j-1)) ;
System.out.println();
for ( int i = 0 ; i < s.length ; i++) {
if ( i > 0 ) System.out.print(a.charAt(i-1));
else System.out.print(' ');
System.out.print(" ");
for ( int j = 0; j < s[i].length ; j++ ) {
System.out.printf(" %4d",s[i][j]);
}
System.out.println();
}
}
private void print(double[][] s, String a, String b) {
System.out.print("");
for ( int j = 1 ; j < s[0].length ; j++) System.out.printf(" %4c",b.charAt(j-1)) ;
System.out.println();
for ( int i = 0 ; i < s.length ; i++) {
if ( i > 0 ) System.out.print(a.charAt(i-1));
else System.out.print(' ');
System.out.print(" ");
for ( int j = 0; j < s[i].length ; j++ ) {
System.out.printf(" %2.1f",s[i][j]);
}
System.out.println();
}
}
private void print(double[] s, byte[] a, byte[] b) {
int n = a.length+1;
int m = b.length+1;
System.out.print(" ");
for ( int j = 1 ; j < m ; j++) System.out.printf(" %5c",(char)b[j-1]) ;
System.out.println();
for ( int i = 0, row_offset = 0 ; i < n ; i++, row_offset+=m) {
if ( i > 0 ) System.out.print((char)a[i-1]);
else System.out.print(' ');
System.out.print(" ");
for ( int j = 0; j < m ; j++ ) {
System.out.printf(" %5.1f",s[row_offset+j]);
}
System.out.println();
}
}
static void printAlignment(SWPairwiseAlignment a, byte[] ref, byte[] read) {
printAlignment(a,ref,read,100);
}
static void printAlignment(SWPairwiseAlignment a, byte[] ref, byte[] read, int width) {
StringBuilder bread = new StringBuilder();
StringBuilder bref = new StringBuilder();
StringBuilder match = new StringBuilder();
int i = 0;
int j = 0;
final int offset = a.getAlignmentStart2wrt1();
Cigar cigar = a.getCigar();
if ( ! DO_SOFTCLIP ) {
// we need to go through all the hassle below only if we do not do softclipping;
// otherwise offset is never negative
if ( offset < 0 ) {
for ( ; j < (-offset) ; j++ ) {
bread.append((char)read[j]);
bref.append(' ');
match.append(' ');
}
// at negative offsets, our cigar's first element carries overhanging bases
// that we have just printed above. Tweak the first element to
// exclude those bases. Here we create a new list of cigar elements, so the original
// list/original cigar are unchanged (they are unmodifiable anyway!)
List<CigarElement> tweaked = new ArrayList<CigarElement>();
tweaked.addAll(cigar.getCigarElements());
tweaked.set(0,new CigarElement(cigar.getCigarElement(0).getLength()+offset,
cigar.getCigarElement(0).getOperator()));
cigar = new Cigar(tweaked);
}
}
if ( offset > 0 ) { // note: the way this implementation works, cigar will ever start from S *only* if read starts before the ref, i.e. offset = 0
for ( ; i < a.getAlignmentStart2wrt1() ; i++ ) {
bref.append((char)ref[i]);
bread.append(' ');
match.append(' ');
}
}
for ( CigarElement e : cigar.getCigarElements() ) {
switch (e.getOperator()) {
case M :
for ( int z = 0 ; z < e.getLength() ; z++, i++, j++ ) {
bref.append((i<ref.length)?(char)ref[i]:' ');
bread.append((j < read.length)?(char)read[j]:' ');
match.append( ( i<ref.length && j < read.length ) ? (ref[i] == read[j] ? '.':'*' ) : ' ' );
}
break;
case I :
for ( int z = 0 ; z < e.getLength(); z++, j++ ) {
bref.append('-');
bread.append((char)read[j]);
match.append('I');
}
break;
case S :
for ( int z = 0 ; z < e.getLength(); z++, j++ ) {
bref.append(' ');
bread.append((char)read[j]);
match.append('S');
}
break;
case D:
for ( int z = 0 ; z < e.getLength(); z++ , i++ ) {
bref.append((char)ref[i]);
bread.append('-');
match.append('D');
}
break;
default:
throw new StingException("Unexpected Cigar element:" + e.getOperator());
}
}
for ( ; i < ref.length; i++ ) bref.append((char)ref[i]);
for ( ; j < read.length; j++ ) bread.append((char)read[j]);
int pos = 0 ;
int maxlength = Math.max(match.length(),Math.max(bread.length(),bref.length()));
while ( pos < maxlength ) {
print_cautiously(match,pos,width);
print_cautiously(bread,pos,width);
print_cautiously(bref,pos,width);
System.out.println();
pos += width;
}
}
/** String builder's substring is extremely stupid: instead of trimming and/or returning an empty
* string when one end/both ends of the interval are out of range, it crashes with an
* exception. This utility function simply prints the substring if the interval is within the index range
* or trims accordingly if it is not.
* @param s
* @param start
* @param width
*/
private static void print_cautiously(StringBuilder s, int start, int width) {
if ( start >= s.length() ) {
System.out.println();
return;
}
int end = Math.min(start+width,s.length());
System.out.println(s.substring(start,end));
}
// BELOW: main() method for testing; old implementations of the core methods are commented out below;
// uncomment everything through the end of the file if benchmarking of new vs old implementations is needed.
public static void main(String argv[]) {
// String ref="CACGAGCATATGTGTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTATGTGTTCACATGCATGTGTGTCT";
// String read = "GCATATGTTTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTGTGTTCACATGCATGTG";
String ref = null;
String read = null;
Map<String,List<String>> args = processArgs(argv);
List<String> l = args.get("SEQ");
args.remove("SEQ");
if ( l == null ) {
System.err.println("SEQ argument is missing. Two input sequences must be provided");
System.exit(1);
}
if ( l.size() != 2 ) {
System.err.println("Two input sequences (SEQ arguments) must be provided. Found "+l.size()+" instead");
System.exit(1);
}
ref = l.get(0);
read = l.get(1);
Double m = extractSingleDoubleArg("MATCH",args);
Double mm = extractSingleDoubleArg("MISMATCH",args);
Double open = extractSingleDoubleArg("OPEN",args);
Double ext = extractSingleDoubleArg("EXTEND",args);
Boolean reverse = extractSingleBooleanArg("REVERSE",args);
if ( reverse != null && reverse.booleanValue() == true ) {
ref = Utils.reverse(ref);
read = Utils.reverse(read);
}
Boolean print_mat = extractSingleBooleanArg("PRINT_MATRIX",args);
Boolean cut = extractSingleBooleanArg("CUTOFF",args);
if ( cut != null ) SWPairwiseAlignment.cutoff = cut;
if ( args.size() != 0 ) {
System.err.println("Unknown argument on the command line: "+args.keySet().iterator().next());
System.exit(1);
}
double w_match;
double w_mismatch;
double w_open;
double w_extend;
w_match = (m == null ? 30.0 : m.doubleValue());
w_mismatch = (mm == null ? -10.0 : mm.doubleValue());
w_open = (open == null ? -10.0 : open.doubleValue());
w_extend = (ext == null ? -2.0 : ext.doubleValue());
SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend);
System.out.println("start="+a.getAlignmentStart2wrt1()+", cigar="+a.getCigar()+
" length1="+ref.length()+" length2="+read.length());
System.out.println();
printAlignment(a,ref.getBytes(),read.getBytes());
System.out.println();
if ( print_mat != null && print_mat == true ) {
a.print(a.SW,ref.getBytes(),read.getBytes());
}
}
static Pair<String,Integer> getArg(String prefix, String argv[], int i) {
String arg = null;
if ( argv[i].startsWith(prefix) ) {
arg = argv[i].substring(prefix.length());
if( arg.length() == 0 ) {
i++;
if ( i < argv.length ) arg = argv[i];
else {
System.err.println("No value found after " + prefix + " argument tag");
System.exit(1);
}
}
i++;
}
return new Pair<String,Integer>(arg,i);
}
static Map<String,List<String>> processArgs(String argv[]) {
Map<String,List<String>> args = new HashMap<String,List<String>>();
for ( int i = 0; i < argv.length ; i++ ) {
String arg = argv[i];
int pos = arg.indexOf('=');
if ( pos < 0 ) {
System.err.println("Argument "+arg+" is not of the form <ARG>=<VAL>");
System.exit(1);
}
String val = arg.substring(pos+1);
if ( val.length() == 0 ) {
// there was a space between '=' and the value
i++;
if ( i < argv.length ) val = argv[i];
else {
System.err.println("No value found after " + arg + " argument tag");
System.exit(1);
}
}
arg = arg.substring(0,pos);
List<String> l = args.get(arg);
if ( l == null ) {
l = new ArrayList<String>();
args.put(arg,l);
}
l.add(val);
}
return args;
}
static Double extractSingleDoubleArg(String argname, Map<String,List<String>> args) {
List<String> l = args.get(argname);
args.remove(argname);
if ( l == null ) return null;
if ( l.size() > 1 ) {
System.err.println("Only one "+argname+" argument is allowed");
System.exit(1);
}
double d=0;
try {
d = Double.parseDouble(l.get(0));
} catch ( NumberFormatException e) {
System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+")");
System.exit(1);
}
System.out.println("Argument "+argname+" set to "+d);
return new Double(d);
}
static Boolean extractSingleBooleanArg(String argname, Map<String,List<String>> args) {
List<String> l = args.get(argname);
args.remove(argname);
if ( l == null ) return null;
if ( l.size() > 1 ) {
System.err.println("Only one "+argname+" argument is allowed");
System.exit(1);
}
if ( l.get(0).equals("true") ) return new Boolean(true);
if ( l.get(0).equals("false") ) return new Boolean(false);
System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+"); true/false are allowed");
System.exit(1);
return null;
}
/* ##############################################
public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend, boolean runOld ) {
w_match = match;
w_mismatch = mismatch;
w_open = open;
w_extend = extend;
if ( runOld ) align_old(seq1,seq2);
else align(seq1,seq2);
}
public SWPairwiseAlignment(byte[] seq1, byte[] seq2, boolean runOld) {
this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0,runOld); // match=1, mismatch = -1/3, gap=-(1+k/3)
}
public void align_old(final byte[] a, final byte[] b) {
final int n = a.length;
final int m = b.length;
double [] sw = new double[(n+1)*(m+1)];
int [] btrack = new int[(n+1)*(m+1)];
calculateMatrix_old(a, b, sw, btrack);
calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions)
}
private void calculateMatrix_old(final byte[] a, final byte[] b, double [] sw, int [] btrack ) {
final int n = a.length+1;
final int m = b.length+1;
// build smith-waterman matrix and keep backtrack info:
for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop
byte a_base = a[i-1]; // letter in a at the current pos
final int row_offset = row_offset_1 + m;
// On the entrance into the loop, row_offset_1 is the (linear) offset
// of the first element of row (i-1) and row_offset is the linear offset of the
// start of row i
for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) {
// data_offset_1 is linearized offset of element [i-1][j-1]
final byte b_base = b[j-1]; // letter in b at the current pos
// in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base);
double step_diag = sw[data_offset_1] + wd(a_base,b_base);
int kd = 0;
double step_down = 0;
for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) {
// data_offset_k is linearized offset of element [i-k][j]
// in other words, trial = sw[i-k][j]+gap_penalty:
final double trial = sw[data_offset_k]+wk(k);
if ( step_down < trial ) {
step_down=trial;
kd = k;
}
}
int ki = 0;
// optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing
// all 'step right' events that would end in the current cell. The optimized code
// does exactly the same thing as the commented out loop below. IMPORTANT:
// the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!!
double step_right = 0;
for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) {
// data_offset is linearized offset of element [i][j-k]
// in other words, step_right=sw[i][j-k]+gap_penalty;
final double trial = sw[data_offset]+wk(k);
if ( step_right < trial ) {
step_right=trial;
ki = k;
}
}
final int data_offset = row_offset + j; // linearized offset of element [i][j]
if ( step_down > step_right ) {
if ( step_down > step_diag ) {
sw[data_offset] = Math.max(0,step_down);
btrack[data_offset] = kd ; // positive=vertical
} else {
sw[data_offset] = Math.max(0,step_diag);
btrack[data_offset] = 0; // 0 = diagonal
}
} else {
// step_down <= step_right
if ( step_right > step_diag ) {
sw[data_offset] = Math.max(0,step_right);
btrack[data_offset] = -ki; // negative = horizontal
} else {
sw[data_offset] = Math.max(0,step_diag);
btrack[data_offset] = 0; // 0 = diagonal
}
}
// sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
}
// IMPORTANT, IMPORTANT, IMPORTANT:
// note that we update this (secondary) outer loop variable here,
// so that we DO NOT need to update it
// in the for() statement itself.
row_offset_1 = row_offset;
}
// print(sw,a,b);
}
#####################
END COMMENTED OUT SECTION
*/
}