diff --git a/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java
new file mode 100644
index 000000000..5a72eec3c
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/indels/AlignmentUtils.java
@@ -0,0 +1,178 @@
+package org.broadinstitute.sting.playground.indels;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import edu.mit.broad.picard.reference.ReferenceSequence;
+import org.broadinstitute.sting.playground.utils.CountedObject;
+
+import java.util.*;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Mar 25, 2009
+ * Time: 12:15:38 AM
+ * To change this template use File | Settings | File Templates.
+ */
+public class AlignmentUtils {
+
+ /** Computes number of mismatches in the read alignment to the refence ref
+ * specified in the record r. Indels are completely ignored by this method:
+ * only base mismatches in the alignment segments where both sequences are present are counted.
+ * @param r
+ * @param ref
+ * @return
+ */
+ public static int numMismatches(SAMRecord r, ReferenceSequence ref) {
+ if ( r.getReadUnmappedFlag() ) return 1000000;
+ int i_ref = r.getAlignmentStart()-1; // position on the ref
+ int i_read = 0; // position on the read
+ int mm_count = 0; // number of mismatches
+ Cigar c = r.getCigar();
+ for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
+ CigarElement ce = c.getCigarElement(k);
+ switch( ce.getOperator() ) {
+ case M:
+ for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
+ if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) != 'N' ) continue; // do not count N's ?
+ if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
+ Character.toUpperCase((char)ref.getBases()[i_ref]) ) mm_count++;
+ }
+ break;
+ case I: i_read += ce.getLength(); break;
+ case D: i_ref += ce.getLength(); break;
+ default: throw new RuntimeException("Unrecognized cigar element");
+ }
+
+ }
+ return mm_count;
+ }
+
+ public static int numMismatches(SAMRecord r, String ref) {
+ if ( r.getReadUnmappedFlag() ) return 1000000;
+ int i_ref = r.getAlignmentStart()-1; // position on the ref
+ int i_read = 0; // position on the read
+ int mm_count = 0; // number of mismatches
+ Cigar c = r.getCigar();
+ for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
+ CigarElement ce = c.getCigarElement(k);
+ switch( ce.getOperator() ) {
+ case M:
+ for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
+ if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ?
+ if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
+ Character.toUpperCase(ref.charAt(i_ref)) ) mm_count++;
+ }
+ break;
+ case I: i_read += ce.getLength(); break;
+ case D: i_ref += ce.getLength(); break;
+ default: throw new RuntimeException("Unrecognized cigar element");
+ }
+
+ }
+ return mm_count;
+ }
+
+ /** Reads through the alignment cigar and returns all indels found in the alignment as a collection
+ * of Indel objects.
+ * @param c
+ * @param start
+ * @return
+ */
+ public static Collection extractIndels(final Cigar c, final int start) {
+ //
+ // firstpos,lastpos span of the indel will be interpreted as follows:
+ // any alignment that ends strictly before firstpos or starts strictly after lastpos
+ // on the *reference* (not inclusive!) does not overlap with an indel; in the case of
+ // insertion it will result in firstpos > lastpos!
+ // lastpos
+ // | firstpos
+ // | |
+ // v v
+ // ---------III----- Ref Insertion: bases I are not in the ref; any alignment that starts
+ // after lastpos or ends before firstpos *on the reference*
+ // is completely over the reference bases to the right or to
+ // the left, respectively, of the insertion site
+ //
+ // firstpos
+ // | lastpos
+ // | |
+ // v v
+ //------------------ Ref Deletion: any alignment that ends before firstpos or starts after lastpos
+ // -----DDD--- alignment on the reference does not overlap with the deletion
+ int runninglength = start; // position on the original reference; start = alignment start position
+
+ List indels = new ArrayList(4);
+
+ if ( c.numCigarElements() <= 1 ) return indels; // most of the reads have no indels, save a few cycles by returning early
+
+ for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
+
+ final CigarElement ce = c.getCigarElement(i);
+ Indel curr_indel = null;
+
+ switch(ce.getOperator()) {
+ case I: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.I); break;
+ case D: curr_indel = new Indel(runninglength, ce.getLength(), Indel.IndelType.D);
+ runninglength += ce.getLength();
+ break;
+ case M: runninglength += ce.getLength(); break; // advance along the gapless block in the alignment
+ default :
+ throw new IllegalArgumentException("Unexpected operator in cigar string");
+ }
+
+ if ( curr_indel == null ) continue; // element was not an indel, go grab next element...
+
+ indels.add(curr_indel); // this is a new indel. Add it.
+ } // end for loop over all alignment cigar elements
+
+ return indels;
+ } // end extractIndels() method
+
+ /** Reads through the alignment specified in the record and returns all indels found in the alignment as a collection
+ * of Indel objects. If read is not mapped, silently returns an empty collection.
+ * @param r
+ * @return
+ */
+ public static Collection extractIndels(SAMRecord r) {
+ if ( r.getReadUnmappedFlag() ) return new ArrayList();
+ return extractIndels(r.getCigar(), r.getAlignmentStart());
+ }
+
+ /** Extracts indels from the specified record (@see #extractIndels()) and updates the provided tree object.
+ * Multiple occurences of the same indel (i.e. with support from multiple reads) will be grouped together
+ * in one counted object held by the set.
+ *
+ * @param r
+ * @param t
+ */
+ public static void collectAndCountIndels(SAMRecord r, TreeSet > t) {
+ Collection indels = AlignmentUtils.extractIndels(r);
+ for ( Indel ind : indels ) {
+ CountedObject ci = new CountedObject(ind);
+ CountedObject found = t.floor(ci);
+
+ if ( ci.equals( found ) ) found.increment(); // we did find our indel, advance the counter
+ else t.add(ci); // this is a new indel. Add it.
+
+ }
+
+ }
+
+ public static String toString(Cigar cig) {
+ StringBuilder b = new StringBuilder();
+
+ for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) {
+ char c='?';
+ switch ( cig.getCigarElement(i).getOperator() ) {
+ case M : c = 'M'; break;
+ case D : c = 'D'; break;
+ case I : c = 'I'; break;
+ }
+ b.append(c);
+ b.append(cig.getCigarElement(i).getLength());
+ }
+ return b.toString();
+ }
+}
diff --git a/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java b/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java
index 75ed67954..518d4bd71 100644
--- a/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java
@@ -132,7 +132,7 @@ public class ConsensusSequence {
*/
public char baseAt(int offset) {
assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds";
- int [] cov = coverage.get(offset+(int)referencePos);
+ int [] cov = coverage.get(offset-startOffset);
int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
int bmax = 0;
char base = 'N';
@@ -153,7 +153,7 @@ public class ConsensusSequence {
*/
public Pair baseWithCountAt(int offset) {
assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds";
- int [] cov = coverage.get(offset+(int)referencePos);
+ int [] cov = coverage.get(offset-startOffset);
int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
int bmax = 0;
char base = 'N';
@@ -174,7 +174,7 @@ public class ConsensusSequence {
*/
public int coverageAt(int offset) {
if ( offset < startOffset || offset >= startOffset + coverage.size() ) return 0;
- int [] cov = coverage.get(offset+(int)referencePos);
+ int [] cov = coverage.get(offset-startOffset);
return cov[0]+cov[1]+cov[2]+cov[3];
}
diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java
index 7116b4264..16d466213 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java
@@ -181,33 +181,7 @@ public class IndelInspector extends CommandLineProgram {
return errs;
}
- private static int numMismatchesDirect(SAMRecord r, ReferenceSequence ref) {
- if ( r.getReadUnmappedFlag() ) return 1000000;
- int i_ref = r.getAlignmentStart()-1; // position on the ref
- int i_read = 0; // position on the read
- int mm_count = 0; // number of mismatches
- Cigar c = r.getCigar();
- for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
- CigarElement ce = c.getCigarElement(k);
- switch( ce.getOperator() ) {
- case M:
- for ( int l = 0 ; l < ce.getLength() ; l++ ) {
- if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
- Character.toUpperCase((char)ref.getBases()[i_ref]) ) mm_count++;
- i_ref++;
- i_read++;
- }
- break;
- case I: i_read += ce.getLength(); break;
- case D: i_ref += ce.getLength(); break;
- default: throw new RuntimeException("Unrecognized cigar element");
- }
-
- }
- return mm_count;
- }
-
- /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
+ /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
private static int numMismatches(SAMRecord r) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1
diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java
index 2caa4cc6a..37f0965a5 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/IndelRecordPileCollector.java
@@ -191,8 +191,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
* @param r
* @throws RuntimeException
*/
- @Override
- public void receive(final SAMRecord r) throws RuntimeException {
+ public void receive(final SAMRecord r) throws RuntimeException {
if ( r.getReadUnmappedFlag() ) return; // read did not align, nothing to do
diff --git a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java
index 6586d6301..94dbd6c33 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java
@@ -151,8 +151,20 @@ public class MultipleAlignment implements Iterable {
return alignment_offsets.get(index.get(id));
}
+ /** Returns external id of the read the offsets of this multiple alignment are based upon (i.e. all the offsets
+ * are specified wrt the base read).
+ * @return
+ */
+ public int getBaseReadId() { return ext_ids.get(0); }
-
+ /** Returns offset of the read specified by its external id wrt the start of the consensus sequence in this
+ * multiple alignment (consenus sequence is a major vote union of all the reads in this alignment).
+ * @param id
+ * @return
+ */
+ public int getOffsetWrtConsensus(int id) {
+ return getOffsetById (id)- consensus.getStartOffset();
+ }
/** Returns true if the alignment already contains sequence with the specified id.
*
@@ -235,49 +247,16 @@ public class MultipleAlignment implements Iterable {
if ( seqs.size() == 0 ) return b.toString();
-// int first_offset = 0; // offset of the first sequence wrt the start of the MSA
final int first_offset = -consensus.getStartOffset();
-// int msa_length = 0;
-// for ( int i = 0 ; i < seqs.size() ; i++ ) {
-// if ( -alignment_offsets.get(i) > first_offset ) first_offset = -alignment_offsets.get(i);
-// msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length );
-// }
-// msa_length += first_offset;
final int msa_length = consensus.length();
-// int [] cov = new int[4];
-// char[] bases = { 'A' , 'C', 'G', 'T' };
char[][] consensusString = new char[4][msa_length];
- for ( int i = -first_offset ; i < msa_length - first_offset ; i++ ) {
-// cov[0] = cov[1] = cov[2] = cov[3] = 0;
-// for ( int j = 0 ; j < seqs.size(); j++ ) {
-// // offset of the sequence j wrt start of the msa region
-// int seq_offset = first_offset + alignment_offsets.get(j);
-// if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i
-// int base = -1;
-// switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) {
-// case 'A': base = 0; break;
-// case 'C': base = 1 ; break;
-// case 'G': base = 2 ; break;
-// case 'T': base = 3 ; break;
-// }
-// if ( base >= 0 ) cov[base]++;
-// }
-// int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
-// int bmax = 0;
-// int mm = 0;
+ for ( int i = 0 ; i < msa_length ; i++ ) {
Pair base = consensus.baseWithCountAt(i-first_offset);
consensusString[3][i] = base.first;
-// for ( int z = 0; z < 4 ; z++ ) {
-// if ( cov[z] > bmax ) {
-// bmax = cov[z];
-// consensus[3][i] = bases[z];
-// mm = total_cov - bmax;
-// }
-// }
- int mm = consensus.coverageAt(i) - base.second;
+ int mm = consensus.coverageAt(i-first_offset) - base.second;
if ( mm > 0 ) {
consensusString[2][i] = '*';
if ( mm > 9 ) consensusString[0][i] = Character.forDigit(mm/10,10);
@@ -307,6 +286,10 @@ public class MultipleAlignment implements Iterable {
return b.toString();
}
+ public String getConsensus() {
+ return consensus.getSequence();
+ }
+
public String toString() { return toString(true); }
public int size() { return seqs.size(); }
@@ -338,7 +321,6 @@ public class MultipleAlignment implements Iterable {
}
- @Override
public Iterator iterator() {
return sequenceIdIterator();
}
diff --git a/java/src/org/broadinstitute/sting/playground/indels/PairwiseAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/PairwiseAlignment.java
index 153d10dd0..23bc37005 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/PairwiseAlignment.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/PairwiseAlignment.java
@@ -128,10 +128,26 @@ public class PairwiseAlignment {
// lets extend these kmer matches and count mismatches:
for ( Integer trial_offset : offsets ) {
- int mm_cnt = countMismatches(is1.getSequence(), is2.getSequence(), trial_offset,next_mm);
- if ( mm_cnt < best_mm ) {
+ int mm_cnt = countMismatches(is1.getSequence(), is2.getSequence(), trial_offset,next_mm+1);
+// if ( (i1==4||i1==8) && i2==18) {
+// if ( i1== 18 ) System.out.print("to " + i2+" : ");
+// else System.out.print("to " + i1+" : ");
+// System.out.println("offset="+trial_offset.toString()+
+// "; mm=" + countMismatches(is1.getSequence(),is2.getSequence(),trial_offset)+
+// "(mm_cnt="+mm_cnt+")"+
+// "; dist="+distance(is1.getSequence(),is2.getSequence(),trial_offset)+
+// "; overlap="+getOverlap(is1.getSequence(),is2.getSequence(),trial_offset));
+// }
+ // save current offset if alignment at this offset has fewer mismatches tham everything we've
+ // seen so far, or if it has same number of mismatches but has larger overlap (i.e. distance
+ // between sequences is smaller)
+ if ( mm_cnt < best_mm ||
+ ( ( mm_cnt == best_mm ) &&
+ getOverlap(is1.getSequence(),is2.getSequence(),alignment_offset) <
+ 0.8*getOverlap(is1.getSequence(),is2.getSequence(),trial_offset) ) ) {
+// if ( (i1==4||i1==8) && i2==18) System.out.println("Saved offset "+trial_offset.toString());
alignment_offset = trial_offset;
- next_mm = best_mm;
+ next_mm = best_mm;
best_mm = mm_cnt;
} else {
if ( mm_cnt < next_mm ) next_mm = mm_cnt;
diff --git a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java
index 4b4712a37..4a071d506 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java
@@ -1,9 +1,13 @@
package org.broadinstitute.sting.playground.indels;
import net.sf.samtools.SAMRecord;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.TextCigarCodec;
import java.util.*;
import org.broadinstitute.sting.utils.PrimitivePair;
+import org.broadinstitute.sting.playground.utils.CountedObject;
public class PileBuilder implements RecordPileReceiver {
@@ -84,28 +88,35 @@ public class PileBuilder implements RecordPileReceiver {
public void receive(Collection c) {
IndexedSequence[] seqs = new IndexedSequence[c.size()];
int i = 0;
- int left = 1000000000;
- int right = 0;
+ int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref
+ int stopOnRef = 0; // absolute stop (rightmost) position of the pile of reads on the ref (rightmost alignment end)
for ( SAMRecord r : c ) {
seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize);
- left = Math.min(left, r.getAlignmentStart() );
- right = Math.max(right,r.getAlignmentEnd());
+ startOnRef = Math.min(startOnRef, r.getAlignmentStart() );
+ stopOnRef = Math.max(stopOnRef,r.getAlignmentEnd());
}
- SequencePile originalAligns = new SequencePile(referenceSequence.substring(left-1,right));
+ // part of the reference covered by original alignments:
+ String pileRef = referenceSequence.substring(startOnRef-1,stopOnRef);
+
+ int totalMismatches = 0; // total mismatches across all reads
+ TreeSet< CountedObject > all_indels = new TreeSet< CountedObject >();
+
+ SequencePile originalAligns = new SequencePile(pileRef);
for ( SAMRecord r : c ) {
originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(),
- r.getCigar(), r.getAlignmentStart() - left );
+ r.getCigar(), r.getAlignmentStart() - startOnRef );
+ totalMismatches += AlignmentUtils.numMismatches(r,referenceSequence);
+ AlignmentUtils.collectAndCountIndels(r,all_indels);
}
+
System.out.println("\n#############################################################################");
System.out.println("ORIGINAL ALIGNMENT: \n");
- originalAligns.colorprint(true);
+ originalAligns.dotprint(true);
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
List piles = doMultipleAlignment2(seqs);
-
-
// System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
// System.out.print("; diameter of PILE1: "+ diameter(alignments1));
// System.out.println("; diameter of PILE2: "+ diameter(alignments2));
@@ -120,10 +131,68 @@ public class PileBuilder implements RecordPileReceiver {
d.set(n,m,distance(piles.get(n), piles.get(m)));
}
}
+
+ int new_mismatches = 0 ; // number of mismatches after re-alignment:
+ TreeSet< CountedObject > new_indels = new TreeSet< CountedObject >(); // new indels after realignment
+ int shifted_reads = 0;
+
+ List as_list = (List)c; // ugly hack; need this to access records by ids
+
System.out.println(d.format("%8.4g"));
for ( int n = 0 ; n < piles.size() ; n++ ) {
- System.out.println("PILE " + n +"\n" +piles.get(n).toString());
+// SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),2.0,-10.0,-2.0,-1.0);
+ SWPairwiseAlignment consToRef = new SWPairwiseAlignment(pileRef,piles.get(n).getConsensus(),3.0,-1.0,-4,-0.5);
+
+ MultipleAlignment ma = piles.get(n);
+ for ( Integer id : ma ) {
+ SAMRecord r = as_list.get(id);
+ int cons_offset = ma.getOffsetWrtConsensus(id); // offset of the read 'id' wrt multiple alignment's full consensus seq
+ int ref_offset = cons_offset + startOnRef + consToRef.getAlignmentStart2wrt1();
+ if ( ref_offset != r.getAlignmentStart()) shifted_reads++;
+ Cigar cig = buildCigar(cons_offset, r.getReadLength(), consToRef.getCigar());
+ System.out.println(AlignmentUtils.toString(cig));
+ }
+
+ System.out.println("PILE " + n + " to REF ("+ (consToRef.getCigar().numCigarElements()-1)/2 +" indels):");
+ System.out.println(consToRef.toString());
+ System.out.println("PILE " + n +" (READS):\n" +piles.get(n).toString());
}
+
+ }
+
+ /** Assuming that a read of length l has a gapless, fully consumed align starting at s (ZERO-based) to some sequence X,
+ * and that sequence's alignment to some reference Y is described by baseCigar, builds a cigar for the direct
+ * alignment of the read to Y (i.e. if the alignment of X to Y contains indel(s) and the read spans them, the
+ * indels will be inserted into the new cigar for read-Y alignment).
+ * @param s
+ * @param l
+ * @param baseCigar
+ * @return
+ */
+ private Cigar buildCigar(int s, int l, Cigar baseCigar) {
+ int readpos = 0;
+ int refpos = 0;
+
+ List lce = new ArrayList(5); // enough to keep 2 indels. should cover 99.999% of cases...
+
+ CigarElement celem = null;
+ int i = 0;
+ while ( refpos <= s ) {
+ celem = baseCigar.getCigarElement(i);
+ refpos+=celem.getLength();
+ i++;
+ }
+ // we now sit on cigar element that contains start s, and refpos points to the end of that element; i points to next element
+
+ lce.add( new CigarElement(Math.min(refpos-s,l),celem.getOperator()) );
+
+ while ( refpos < s+l ) {
+ celem = baseCigar.getCigarElement(i);
+ lce.add( new CigarElement(Math.min(celem.getLength(),l - refpos), celem.getOperator()) );
+ refpos += celem.getLength();
+ i++;
+ }
+ return new Cigar(lce);
}
public void initPairwiseAlignments( IndexedSequence [] seqs ) {
@@ -308,7 +377,7 @@ public class PileBuilder implements RecordPileReceiver {
return d_av;
}
- /** Computes average distance from sequence i to multiple alignment a for the specified offset of i wrt a
+ /** Computes average distance from sequence i to multiple alignment a for the specified offset of 'i' wrt 'a'
* and returns that distance and pair of sequence indices, on which the specified offset is realized
* @param a
* @param i
diff --git a/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java
index 76604016c..16079c8e5 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/SWPairwiseAlignment.java
@@ -24,20 +24,33 @@ public class SWPairwiseAlignment {
private int alignment_offset; // offset of s2 w/respect to s1
private Cigar alignmentCigar;
+ private double w_match;
+ private double w_mismatch;
+ private double w_open;
+ private double w_extend;
+
private int best_mm; // mismatch count
private static final int IMPOSSIBLE = 1000000000;
private static final int MSTATE = 0;
private static final int ISTATE = 1;
private static final int DSTATE = 2;
- public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2 ) {
+ public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2, double match, double mismatch, double open, double extend ) {
s1 = seq1;
s2 = seq2;
i1 = id1;
i2 = id2;
+ w_match = match;
+ w_mismatch = mismatch;
+ w_open = open;
+ w_extend = extend;
best_mm = IMPOSSIBLE;
//next_mm = IMPOSSIBLE;
- align2(s1,s2);
+ align4(s1,s2);
+ }
+
+ public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2) {
+ this(seq1,seq2,id1,id2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3)
}
/** Initializes the alignment with pair of sequences (that will be immediately aligned) and
@@ -48,6 +61,10 @@ public class SWPairwiseAlignment {
this(seq1,seq2,-1,-1);
}
+ public SWPairwiseAlignment(String seq1, String seq2, double match, double mismatch, double open, double extend) {
+ this(seq1,seq2,-1,-1,match,mismatch,open, extend);
+ }
+
public Cigar getCigar() { return alignmentCigar ; }
public int getAlignmentStart2wrt1() { return alignment_offset; }
@@ -258,6 +275,8 @@ public class SWPairwiseAlignment {
}
+
+
/** Allows for separate gap opening and extension penalties, with backtracking.
*
* @param a
@@ -405,6 +424,143 @@ public void align3(String a, String b) {
alignment_offset = p.first - p.second;
}
+ public void align4(String a, String b) {
+ int n = a.length();
+ int m = b.length();
+ double [][] sw = new double[n+1][m+1];
+
+ int [][] btrack = new int[n+1][m+1];
+
+ // build smith-waterman matrix and keep backtrack info:
+ for ( int i = 1 ; i < n+1 ; i++ ) {
+ char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos
+ for ( int j = 1 ; j < m+1 ; j++ ) {
+ char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos
+ double step_diag = sw[i-1][j-1] + wd(a_base,b_base);
+ double step_down = 0.0 ;
+ int kd = 0;
+ for ( int k = 1 ; k < i ; k++ ) {
+ if ( step_down < sw[i-k][j]+wk(a_base,'-',k) ) {
+ step_down=sw[i-k][j]+wk(a_base,'-',k);
+ kd = k;
+ }
+ }
+
+ double step_right = 0;
+ int ki = 0;
+ for ( int k = 1 ; k < j ; k++ ) {
+ if ( step_right < sw[i][j-k]+wk('-',b_base,k) ) {
+ step_right=sw[i][j-k]+wk('-',b_base,k);
+ ki = k;
+ }
+ }
+
+ if ( step_down > step_right ) {
+ if ( step_down > step_diag ) {
+ sw[i][j] = Math.max(0,step_down);
+ btrack[i][j] = kd; // positive=vertical
+ }
+ else {
+ sw[i][j] = Math.max(0,step_diag);
+ btrack[i][j] = 0; // 0 = diagonal
+ }
+ } else {
+ // step_down < step_right
+ if ( step_right > step_diag ) {
+ sw[i][j] = Math.max(0,step_right);
+ btrack[i][j] = -ki; // negative = horizontal
+ } else {
+ sw[i][j] = Math.max(0,step_diag);
+ btrack[i][j] = 0; // 0 = diagonal
+ }
+ }
+ sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
+ }
+ }
+
+// print(sw,a,b);
+
+ PrimitivePair.Int p = new PrimitivePair.Int();
+ double maxscore = 0.0;
+ // look for largest score. we use >= combined with the traversal direction
+ // to ensure that if two scores are equal, the one closer to diagonal gets picked
+ for ( int i = 1 ; i < n+1 ; i++ ) {
+ if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; }
+ }
+ for ( int j = 1 ; j < m+1 ; j++ ) {
+ if ( sw[n][j] > maxscore ||
+ sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) {
+ p.first = n;
+ p.second = j ;
+ maxscore = sw[n][j];
+ }
+ }
+
+ // System.out.println("\ni="+p.first+"; j="+p.second);
+
+ // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
+
+
+ // we will be placing all insertions and deletions into sequence b, so the state are named w/regard
+ // to that sequence
+
+ int state = MSTATE;
+ int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
+
+ double [] scores = new double[3];
+
+ List lce = new ArrayList(5);
+
+ do {
+
+ int btr = btrack[p.first][p.second];
+ int step_left = ( btr < 0 ? -btr : 1);
+ int step_up = ( btr > 0 ? btr : 1 );
+
+ int new_state = -1;
+ if ( btr > 0 ) new_state = DSTATE;
+ else if ( btr < 0 ) new_state = ISTATE;
+ else new_state = MSTATE;
+
+ int step_length = 1;
+
+ // move to next best location in the sw matrix:
+ switch( new_state ) {
+ case MSTATE: p.first--; p.second--; break;
+ case ISTATE: p.second-=step_left; step_length = step_left; break;
+ case DSTATE: p.first-=step_up; step_length = step_up; break;
+ }
+
+ // now let's see if the state actually changed:
+ if ( new_state == state ) segment_length+=step_length;
+ else {
+ // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
+ CigarOperator o=null;
+ switch(state) {
+ case MSTATE: o = CigarOperator.M; break;
+ case ISTATE: o = CigarOperator.I; break;
+ case DSTATE: o = CigarOperator.D; break;
+ }
+ CigarElement e = new CigarElement(segment_length,o);
+ lce.add(e);
+ segment_length = step_length;
+ state = new_state;
+ }
+ } while ( sw[p.first][p.second] != 0 );
+
+ // post-process the last segment we are still keeping
+ CigarOperator o=null;
+ switch(state) {
+ case MSTATE: o = CigarOperator.M; break;
+ case ISTATE: o = CigarOperator.I; break;
+ case DSTATE: o = CigarOperator.D; break;
+ }
+ CigarElement e = new CigarElement(segment_length,o);
+ lce.add(e);
+ Collections.reverse(lce);
+ alignmentCigar = new Cigar(lce);
+ alignment_offset = p.first - p.second;
+ }
private int w(char x, char y) {
if ( x == y ) return 2; // match
@@ -413,12 +569,12 @@ public void align3(String a, String b) {
}
private double wd ( char x, char y ) {
- if ( x== y ) return 2.0;
- else return -1.0;
+ if ( x== y ) return w_match;
+ else return w_mismatch;
}
private double wk(char x, char y, int k) {
- return -2.0-k/6; // gap
+ return w_open+(k-1)*w_extend; // gap
// return -1.0 ; // no extension penalty
// return -1.0-Math.log(k+1); // weak extension penalty
}
@@ -507,6 +663,7 @@ public void align3(String a, String b) {
}
public String toString() {
+ StringBuilder bmm = new StringBuilder();
StringBuilder b1 = new StringBuilder();
StringBuilder b2 = new StringBuilder();
@@ -516,12 +673,14 @@ public void align3(String a, String b) {
for ( ; pos2 < -alignment_offset ; pos2++ ) {
b1.append(' ');
b2.append(s2.charAt(pos2));
+ bmm.append(' ');
}
// now pos2 = -alignment_offset;
} else {
for ( ; pos1 < alignment_offset ; pos1++ ) {
b2.append(' ');
b1.append(s1.charAt(pos1));
+ bmm.append(' ');
}
// now pos1 = alignment_offset
}
@@ -530,73 +689,85 @@ public void align3(String a, String b) {
CigarElement ce = getCigar().getCigarElement(i) ;
switch( ce.getOperator() ) {
case M:
- int z = ce.getLength();
- b1.append(s1, pos1, pos1+z);
- b2.append(s2, pos2, pos2+z);
- pos1+=z; pos2+=z;
+ for ( int k = 0 ; k < ce.getLength() ; k++ ) {
+ if ( Character.toUpperCase(s1.charAt(pos1)) != Character.toUpperCase(s2.charAt(pos2)) ) bmm.append('*');
+ else bmm.append(' ');
+ b1.append(s1.charAt(pos1++));
+ b2.append(s2.charAt(pos2++));
+ }
break;
case I:
for ( int k = 0 ; k < ce.getLength() ; k++ ) {
b1.append('+');
+ bmm.append('+');
b2.append(s2.charAt(pos2++));
}
break;
case D:
for ( int k = 0 ; k < ce.getLength() ; k++ ) {
b1.append(s1.charAt(pos1++));
+ bmm.append('-');
b2.append('-');
}
break;
}
}
+ bmm.append('\n');
b1.append(s1,pos1,s1.length());
+ bmm.append(b1);
+ bmm.append('\n');
b2.append(s2,pos2,s2.length());
- b1.append('\n');
- b1.append(b2);
- b1.append('\n');
- return b1.toString();
+ bmm.append(b2);
+ bmm.append('\n');
+ return bmm.toString();
}
public static void testMe() {
-// String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
-// String s2 = "TGTATATAGGGTAAGG";
+ /* String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
+ String s2 = "TGTATATAGGGTAAGG";
-// String s1 = "GGTAAGGC";
-// String s2 = "GGTCTCAA";
-
-// String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
-// String s2 = "TGTTAGGGTCTCAAGG";
-
-// String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
-// String s2 = "TAGGGTAAGGCTGATCCATGTACCG" ;
-
- String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
- String s2 = "CCGTATCATTACCTGGTGTATATAGG";
-
-// String s1 = "GGTGTATATAGGGT" ;
-// String s2 = "TGTTAGGG";
testMe(s1,s2);
+
+ s1 = "GGTAAGGC";
+ s2 = "GGTCTCAA";
+
+ testMe(s1,s2);
+
+ s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
+ s2 = "TGTTAGGGTCTCAAGG";
+ testMe(s1,s2);
+
+
+ s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
+ s2 = "TAGGGTAAGGCTGATCCATGTACCG" ;
+ testMe(s1,s2);
+
+ s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT";
+ s2 = "CCGTATCATTACCTGGTGTATATAGG";
+ testMe(s1,s2);
+
+ s1 = "GGTGTATATAGGGT" ;
+ s2 = "TGTTAGGG";
+ testMe(s1,s2);
+
+ s1 = "AGACAGAGAGAAGG";
+ s2 = "AGACAGAGAAGG";
+ testMe(s1,s2);
+ */
+ String s1 = "CCAGCACACAGGTATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA";
+ String s2 = "CCAGCACACATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA";
+ testMe(s1,s2);
}
public static void testMe(String s1, String s2) {
- SWPairwiseAlignment swpa = new SWPairwiseAlignment(s1,s2);
+ SWPairwiseAlignment swpa = new SWPairwiseAlignment(s1,s2,3.0,-1.0,-4,-0.5);
-
- for ( int i = 0 ; i < swpa.getCigar().numCigarElements() ; i++ ) {
- char c=' ';
- switch ( swpa.getCigar().getCigarElement(i).getOperator() ) {
- case M : c = 'M'; break;
- case D : c = 'D'; break;
- case I : c = 'I'; break;
- }
- System.out.print(c);
- System.out.print(swpa.getCigar().getCigarElement(i).getLength());
- }
- SequencePile sp = new SequencePile(s1);
- sp.addAlignedSequence(s2,false,swpa.getCigar(),swpa.getAlignmentStart2wrt1());
- System.out.println();
- System.out.println(sp.format());
+ System.out.println(AlignmentUtils.toString(swpa.getCigar()));
+// SequencePile sp = new SequencePile(s1);
+// sp.addAlignedSequence(s2,false,swpa.getCigar(),swpa.getAlignmentStart2wrt1());
+// System.out.println();
+// System.out.println(sp.format());
System.out.println("--------\n"+swpa.toString());
diff --git a/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java b/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java
index 38a94ce4e..a5984dbe8 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/SequencePile.java
@@ -6,13 +6,16 @@ import net.sf.samtools.*;
public class SequencePile {
private List mSeqGrid;
- private StringBuffer mRefGrid;
+ private StringBuilder mRefGrid;
+ private StringBuilder headerGrid;
private int mDepth;
private List mSeqRC;
public SequencePile(String ref) {
- mRefGrid = new StringBuffer( ref );
+ mRefGrid = new StringBuilder( ref );
+ headerGrid = new StringBuilder();
+ for ( int i = 0; i < ref.length(); i++ ) headerGrid.append(' ');
mSeqGrid = new ArrayList();
for ( int i = 0 ; i < mRefGrid.length(); i++ ) {
mSeqGrid.add(new MSAColumn());
@@ -39,19 +42,21 @@ public class SequencePile {
// alignedSeq = ReverseComplement(seq);
// } else alignedSeq = seq;
mSeqRC.add(isRC);
-
- int pos = 0; // actual position on the grid (reference can have insertions)
+
+ // will hold actual position on the grid; reference can have insertions on the grid,
+ // so position on the grid where we should start placing the read is not refpos!
+ int pos = 0;
for ( int i = 0 ; i < refpos ; i++ ) { // i is the position on the original reference
- // if we got some insertions on the reference prior to refpos, we need to take care of them:
+ // if we got some insertions on the reference prior to refpos, we need to count them in:
while( mRefGrid.charAt(pos) == '+' ) {
- mSeqGrid.get(pos).add(' ');
+ mSeqGrid.get(pos).add(' '); // add additional spaces in the line that will hold sequence seq
pos++;
}
mSeqGrid.get(pos).add(' '); // fill with ' ' to the left of the read
pos++;
}
- // we reached start position of the alignment
+ // we reached start position of the alignment on the reference grid
int readpos = 0; // position on the read
@@ -66,18 +71,17 @@ public class SequencePile {
if ( pos >= 0 ) {
if ( mRefGrid.charAt(pos) !='+' ) { // there was no insertion here yet: add it now!
mRefGrid.insert(pos, '+');
+ headerGrid.insert(pos,'+');
MSAColumn c = new MSAColumn();
+ // reads up to the previous depth (prior to adding current read) did not
+ // have an insertion here, so we insert '*' into all of them:
for ( int k = 0 ; k < mDepth ; k++ ) {
if ( mSeqGrid.get(pos-1).charAt(k) == ' ') c.add(' ');
else c.add('*');
}
- mSeqGrid.add(pos, c);
- }
- try {
- mSeqGrid.get(pos).add(alignedSeq.charAt(readpos));
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException(e.getMessage()+": "+seq);
+ mSeqGrid.add(pos, c); // finally, add the base from the current read
}
+ mSeqGrid.get(pos).add(alignedSeq.charAt(readpos));
}
readpos++;
pos++;
@@ -91,6 +95,7 @@ public class SequencePile {
}
if ( pos >= mRefGrid.length() ) break;
mSeqGrid.get(pos).add('-'); // mark deletion
+ headerGrid.setCharAt(pos,'-');
pos++;
}
break;
@@ -102,11 +107,10 @@ public class SequencePile {
pos++;
}
if ( pos >= mRefGrid.length() ) break;
- try {
- mSeqGrid.get(pos).add(alignedSeq.charAt(readpos));
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException(e.getMessage()+": "+seq);
- }
+ mSeqGrid.get(pos).add(alignedSeq.charAt(readpos));
+ if ( Character.toUpperCase(alignedSeq.charAt(readpos)) !=
+ Character.toUpperCase(mRefGrid.charAt(pos))
+ && headerGrid.charAt(pos)== ' ') headerGrid.setCharAt(pos,'*');
pos++;
readpos++;
}
@@ -158,35 +162,114 @@ public class SequencePile {
public void colorprint() { colorprint(false); }
- public void colorprint(boolean printId) {
- if ( printId ) System.out.print(" ");
- else System.out.print(" ");
- System.out.println(mRefGrid);
+ public void dotprint(boolean printId) {
- StringBuilder sb = new StringBuilder();
- java.util.Formatter frmt = new java.util.Formatter(sb);
-
+ String skip = null;
+ if ( printId ) skip = new String(" ");
+ else skip = new String(" ");
+
+ System.out.print(formatHeader(skip));
+ System.out.print(skip);
+ System.out.println(mRefGrid);
+
+ try {
+ for ( int i = 0 ; i < mDepth; i++ ) {
+ if ( printId ) System.out.printf("%3d",i);
+ if ( mSeqRC.get(i).booleanValue() ) System.out.print("<-");
+ else System.out.print("->");
+ for ( int j = 0 ; j < mRefGrid.length() ; j++) {
+ char seqbase = mSeqGrid.get(j).charAt(i);
+ char refbase = mRefGrid.charAt(j);
+ if ( isBase(refbase) && isBase(seqbase) &&
+ Character.toUpperCase(refbase) ==
+ Character.toUpperCase(seqbase) ) {
+ if ( mSeqRC.get(i) ) System.out.print(',');
+ else System.out.print('.');
+ }
+ else System.out.print(seqbase);
+ }
+ System.out.print('\n');
+ }
+ } catch (Exception e) {}
+ }
+
+
+ public void colorprint(boolean printId) {
+
+ String skip = null;
+ if ( printId ) skip = new String(" ");
+ else skip = new String(" ");
+
+ System.out.print(formatHeader(skip));
+ System.out.print(skip);
+ System.out.println(mRefGrid);
+
try {
for ( int i = 0 ; i < mDepth; i++ ) {
- if ( printId ) {
- sb.delete(0,sb.length());
- frmt.format("%3d:", i);
- System.out.print(frmt.out().toString());
- }
+ if ( printId ) System.out.printf("%3d",i);
if ( mSeqRC.get(i).booleanValue() ) System.out.print("<-");
else System.out.print("->");
for ( int j = 0 ; j < mRefGrid.length() ; j++) {
char seqbase = mSeqGrid.get(j).charAt(i);
char refbase = mRefGrid.charAt(j);
- if ( isBase(refbase) && isBase(seqbase) && refbase != seqbase ) System.out.print("\033[31m"+seqbase+"\033[30m");
+ if ( isBase(refbase) && isBase(seqbase) &&
+ Character.toUpperCase(refbase) !=
+ Character.toUpperCase(seqbase) ) System.out.print("\033[31m"+seqbase+"\033[30m");
else System.out.print(seqbase);
}
System.out.print('\n');
}
} catch (Exception e) {}
}
-
+
+ private String formatHeader(String leadString) {
+ char [][] mm_strings = new char[2][mRefGrid.length()];
+ for ( int i = 0 ; i < mRefGrid.length() ; i++ ) {
+ int count = 0;
+ char refC = mRefGrid.charAt(i);
+ MSAColumn col = mSeqGrid.get(i);
+ if ( refC == '+' ) {
+ // count number of observations for insertion
+ for ( int j = 0 ; j < col.size() ; j++ ) {
+ if ( col.charAt(j) != '*' ) count++;
+ }
+ } else {
+ if ( headerGrid.charAt(i) == '-' ) {
+ // count number of observations for deletion
+ for ( int j = 0 ; j < col.size() ; j++ ) {
+ if ( col.charAt(j) == '-' ) count++;
+ }
+ } else {
+ if ( headerGrid.charAt(i) == '*') {
+ for ( int j = 0 ; j < col.size() ; j++ ) {
+ if ( col.charAt(j)!=' ' &&
+ Character.toUpperCase(col.charAt(j)) !=
+ Character.toUpperCase(refC) ) count++;
+ }
+ }
+ }
+ }
+ if ( count > 9 ) mm_strings[0][i] = Character.forDigit(count/10,10);
+ else mm_strings[0][i] = ' ';
+ if ( count > 0 ) mm_strings[1][i] = Character.forDigit(count%10,10);
+ else mm_strings[1][i] = ' ';
+ }
+
+ StringBuilder b = new StringBuilder();
+ b.append(leadString);
+ b.append(mm_strings[0]);
+ b.append('\n');
+ b.append(leadString);
+ b.append(mm_strings[1]);
+ b.append('\n');
+ b.append(leadString);
+ b.append(headerGrid);
+ b.append('\n');
+ return b.toString();
+ }
+
private boolean isBase(char b) {
+ b = Character.toUpperCase(b);
return ( b=='A' ||b == 'C' || b=='G' || b=='T' || b=='N');
}
}