diff --git a/playground/java/src/org/broadinstitute/sting/indels/ConsensusSequence.java b/playground/java/src/org/broadinstitute/sting/indels/ConsensusSequence.java
new file mode 100644
index 000000000..6cbd688f0
--- /dev/null
+++ b/playground/java/src/org/broadinstitute/sting/indels/ConsensusSequence.java
@@ -0,0 +1,122 @@
+package org.broadinstitute.sting.indels;
+
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Mar 21, 2009
+ * Time: 4:25:12 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class ConsensusSequence {
+ private List< int[] > coverage; // counts of observations of every nucleotide at each position
+ private long referencePos;// (arbitrary) reference position; when adding sequences, their offsets should be wrt this position
+ private int startOffset; // offset of the leftmost base of this consensus sequence wrt referencePos; negative=left, positive=right
+ private final static int NUMBINS = 4;
+
+ public ConsensusSequence(int refPos) {
+ coverage = new ArrayList< int[] >();
+ referencePos = refPos;
+ startOffset = 0;
+ }
+
+ public ConsensusSequence() {
+ this(0);
+ }
+
+ /** Adds sequence se to the consensus, in the sense that all bases of seq are counted
+ * into observed coverage kept by the consensus. The position of the sequence is specified by the
+ * offset with respect to the fixed reference position of the consensus (the latter does not
+ * have to be consensus start), and if the sequence extends beyound the consensus on either end, the
+ * consensus will be extended appropriately to accomodate the full sequence.
+ * @param seq nucleotide sequence ('ACGT...')
+ * @param offset position of the start of the sequence relative to the fixed reference position of the consensus
+ */
+ public void addSequence(String seq, int offset) {
+ // if sequence starts before than the currently held consensus oes, extend consensus to the left
+ if ( offset < startOffset ) {
+ coverage.addAll(0,instantiateCoverageList(startOffset-offset));
+ startOffset = offset;
+ }
+ // if the sequence ends beyound the currently held consensus, extend consensus to the right
+ if ( offset + seq.length() > startOffset + coverage.size() ) {
+ coverage.addAll( instantiateCoverageList(offset+seq.length() - startOffset - coverage.size()) );
+ }
+
+ // count bases from the sequence into the coverage
+ int posOnConsensus = offset - startOffset;
+ for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
+ coverage.get(posOnConsensus)[baseToInt(seq.charAt(i))]++;
+ }
+ }
+
+ /** Removes sequence seq from the consensus. More exactly, 1 will be subtracted from current
+ * observation counts kept by the consensus for each observed base at every position of the sequence. The
+ * position of the sequence is specified by the offset with respect to the reference position
+ * of the consensus. NOTE: this method is unchecked and does not verify that the sequence being subtracted
+ * was indeed previously added to the consensus and/or that the consenus does accomodate full length of
+ * the sequence. If it is not the case, the results can be unpredictable or assert failure may occur.
+ *
+ * @param seq nucleotide sequence ('ACGT...')
+ * @param offset position of the start of the sequence relative to the fixed reference position of the consensus
+ */
+ public void removeSequence(String seq, int offset) {
+ assert offset >= startOffset :
+ "Attempt to remove from consensus a sequence that starts prior to consenus start";
+ assert (offset+seq.length() < startOffset + coverage.size()) :
+ "Attempt to remove from consensus a sequence that extends beyond consensus end";
+ // subtract sequence bases from the coverage
+ int posOnConsensus = offset - startOffset;
+ for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
+ coverage.get(posOnConsensus)[ baseToInt(seq.charAt(i)) ]--;
+ }
+ }
+
+ /** Returns the "distance" (score measuring the agreement) from the currently held consensus sequence to
+ * the specified sequence seq starting at position offset wrt consenus reference position.
+ * @param seq
+ * @param offset
+ * @return
+ */
+ public double distance(String seq, int offset) {
+ int posOnConsensus; // index into the currently held consensus sequence
+ int i ; // index into the passed sequence argument
+ if ( offset < startOffset ) {
+ posOnConsensus = 0;
+ i = startOffset - offset;
+ } else {
+ i = 0 ;
+ posOnConsensus = offset - startOffset;
+ }
+ // stop position on the passed sequence (can be less than sequence length if consensus stops prematurely)
+ int stop = Math.min(offset+seq.length(), startOffset+coverage.size() ) - offset;
+
+ for ( ; i < stop ; i++, posOnConsensus++ ) {
+ int base = baseToInt(seq.charAt(posOnConsensus));
+ int [] cov = coverage.get(posOnConsensus);
+ int totalcov = cov[0]+cov[1]+cov[2]+cov[3];
+
+ }
+ return 0.0;
+ }
+
+ private List instantiateCoverageList(int n) {
+ List< int[] > subseq = new ArrayList(n);
+ for ( int i = 0 ; i < n ; i++ ) subseq.add(new int[NUMBINS]);
+ return subseq;
+ }
+
+ private int baseToInt(char c) {
+ int base;
+ switch( Character.toUpperCase(c) ) {
+ case 'A': base = 0; break;
+ case 'C': base = 1; break;
+ case 'G': base = 2; break;
+ case 'T': base = 3; break;
+ default : throw new IllegalArgumentException("Sequence can contain only ACGT symbols");
+ }
+ return base;
+ }
+}
diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java
index 86aca048d..2d1e0e94a 100755
--- a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java
+++ b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java
@@ -60,8 +60,9 @@ public class IndelInspector extends CommandLineProgram {
ReferenceSequence contig_seq = null;
IndelRecordPileCollector col = null;
+ PileBuilder pileBuilder = new PileBuilder();
try {
- col = new IndelRecordPileCollector(new DiscardingReceiver(), new PileBuilder() );
+ col = new IndelRecordPileCollector(new DiscardingReceiver(), pileBuilder );
} catch(Exception e) { System.err.println(e.getMessage()); }
if ( col == null ) return 1;
@@ -78,6 +79,9 @@ public class IndelInspector extends CommandLineProgram {
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break;
if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) {
contig_seq = reference.get(r.getReferenceIndex());
+ String refstr = new String(contig_seq.getBases());
+ col.setReferenceSequence(refstr);
+ pileBuilder.setReferenceSequence(refstr);
System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
}
}
@@ -85,9 +89,12 @@ public class IndelInspector extends CommandLineProgram {
// if contig is specified and wqe did not reach it yet, skip the records until we reach that contig:
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
+
+ if ( location != null && r.getAlignmentEnd() < location.getStart() ) continue;
+
// if stop position is specified and we are past that, stop reading:
if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
-
+
if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY")==1 ) continue; // skip chrM and unplaced contigs for now
int err = -1;
diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java
index b86645403..a378bce11 100755
--- a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java
+++ b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java
@@ -89,6 +89,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest
private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there
+ private String referenceSequence;
+
public String memStatsString() {
String s = "mRecordPile: ";
return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef;
@@ -110,6 +112,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
}
defaultReceiver = rr;
indelPileReceiver = rp;
+ referenceSequence = null;
setWaitState();
}
@@ -124,7 +127,11 @@ public class IndelRecordPileCollector implements RecordReceiver {
avoiding_region = false;
mState = WAIT_STATE; // got to do this if we were in avoid_region state
}
-
+
+ public void setReferenceSequence(String contig) {
+ referenceSequence = contig;
+ }
+
/** A utility method: emits into nonindelReceiver and purges from the currently held SAM record pile
* all the consequtive records with alignment end positions less than or equal to the specified
* position pos, until the first record is encountered that does not meet this condition. Note that
diff --git a/playground/java/src/org/broadinstitute/sting/indels/Matrix.java b/playground/java/src/org/broadinstitute/sting/indels/Matrix.java
index aeca77dec..094f32fc1 100755
--- a/playground/java/src/org/broadinstitute/sting/indels/Matrix.java
+++ b/playground/java/src/org/broadinstitute/sting/indels/Matrix.java
@@ -1,22 +1,37 @@
package org.broadinstitute.sting.indels;
public class Matrix {
- private int size;
+ private int nRows;
+ private int nCols;
private Object [][] data;
-
- public Matrix(int n) {
- size = n;
- data = new Object[n][n];
+
+ /** Instantiates a generic matrix of objects with n rows and m cols.
+ *
+ * @param n number of rows
+ * @param m number of columns
+ */
+ public Matrix(int n, int m) {
+ nRows = n;
+ nCols = m;
+ data = new Object[n][m];
}
-
+
+ /** Instantiates a square n x n matrix of objects.
+ *
+ * @param n size of the matrix
+ */
+ public Matrix (int n) {
+ this(n,n);
+ }
+
@SuppressWarnings("unchecked")
public T get(int i, int j) {
- assert (i {
@@ -26,26 +27,6 @@ public class MultipleAlignment implements Iterable {
ext_ids.clear();
}
- public static Integer[] SortPermutation( List A )
- {
- final Object[] data = A.toArray();
-
- class comparator implements Comparator
- {
- public int compare(Integer a, Integer b)
- {
- return ((T)data[a]).compareTo(data[b]);
- }
- }
- Integer[] permutation = new Integer[A.size()];
- for (int i = 0; i < A.size(); i++)
- {
- permutation[i] = i;
- }
- Arrays.sort(permutation, new comparator());
- return permutation;
- }
-
/** Adds single sequence with id set to i. Pile must be empty, or IllegalStateException will be thrown
*
* @param seq sequence to add
@@ -108,7 +89,8 @@ public class MultipleAlignment implements Iterable {
alignment_offsets.add( -a.getBestOffset2wrt1() + alignment_offsets.get( second ));
}
}
-
+
+
/** Returns sequence associated with the specified external id, or null if sequence with this external id is not found in the pile
*
* @param id query id
@@ -154,7 +136,13 @@ public class MultipleAlignment implements Iterable {
alignment_offsets.add(off2+a.getOffsetById(id));
}
}
-
+
+
+ public void add(MultipleAlignment a, PairwiseAlignment p) {
+ if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set");
+ add(a,p,p.id1(),p.id2());
+ }
+
/** Returns true if the alignment already contains sequence with the specified id
*
* @param id
@@ -168,21 +156,20 @@ public class MultipleAlignment implements Iterable {
return PairwiseAlignment.countMismatches(getSequenceById(i), getSequenceById(j), getOffsetById(j)-getOffsetById(i));
}
- /** Returns the length of the overlapping region of sequences si and sj .
+ /** Returns the length of the overlapping region of the two sequences specified by their external ids i and j.
*
* @return overlap size
*/
public int getOverlap(int i, int j) {
- if ( ! contains(i) || ! contains(j) ) return -1;
+ if ( ! contains(i) || ! contains(j) ) throw new RuntimeException("Sequence with specified id is not in MSA pile");
int off = getOffsetById(j) - getOffsetById(i);
- if ( off >= 0 ) {
- return Math.min(getSequenceById(i).length()-off, getSequenceById(j).length());
- } else {
- return Math.min(getSequenceById(j).length()+off, getSequenceById(i).length());
- }
+ int L;
+ if ( off >= 0 ) L = Math.min(getSequenceById(i).length()-off, getSequenceById(j).length());
+ else L = Math.min(getSequenceById(j).length()+off, getSequenceById(i).length());
+ return ( L < 0 ? 0 : L );
}
- /** Given the two indices, one of which has to be already in the pile, returns the one that is not in the pile.
+ /** Given the two sequence ids, one of which has to be already in the pile, returns the one that is not in the pile.
*
* @param i sequence id
* @param j sequence id
@@ -199,13 +186,23 @@ public class MultipleAlignment implements Iterable {
}
}
- public String skipN(int n) {
+ /** Returns a string consisting of n spaces.
+ *
+ * @param n
+ * @return
+ */
+ private String skipN(int n) {
StringBuilder b=new StringBuilder();
for ( int k = 0 ; k < n ; k++ ) b.append(' ');
return b.toString();
}
- public void skipN(int n, StringBuilder b) {
+ /** Prints n spaces directly into the specified string builder.
+ *
+ * @param n
+ * @param b
+ */
+ private void skipN(int n, StringBuilder b) {
for ( int k = 0 ; k < n ; k++ ) b.append(' ');
}
@@ -220,12 +217,60 @@ public class MultipleAlignment implements Iterable {
if ( seqs.size() == 0 ) return b.toString();
int skip_first = 0;
+ int msa_length = 0;
for ( int i = 0 ; i < seqs.size() ; i++ ) {
if ( -alignment_offsets.get(i) > skip_first ) skip_first = -alignment_offsets.get(i);
+ msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length );
}
+ msa_length += skip_first;
+ int [] cov = new int[4];
+ char[] bases = { 'A' , 'C', 'G', 'T' };
+ char[][] consensus = new char[4][msa_length];
+
+ for ( int i = 0 ; i < msa_length ; i++ ) {
+ cov[0] = cov[1] = cov[2] = cov[3] = 0;
+ for ( int j = 0 ; j < seqs.size(); j++ ) {
+ // offset of the sequence j wrt start of the msa region
+ int seq_offset = skip_first + alignment_offsets.get(j);
+ if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i
+ int base = -1;
+ switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) {
+ case 'A': base = 0; break;
+ case 'C': base = 1 ; break;
+ case 'G': base = 2 ; break;
+ case 'T': base = 3 ; break;
+ }
+ if ( base >= 0 ) cov[base]++;
+ }
+ int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
+ int bmax = 0;
+ int mm = 0;
+ consensus[3][i] = 'N';
+ for ( int z = 0; z < 4 ; z++ ) {
+ if ( cov[z] > bmax ) {
+ bmax = cov[z];
+ consensus[3][i] = bases[z];
+ mm = total_cov - bmax;
+ }
+ }
+ if ( mm > 0 ) {
+ consensus[2][i] = '*';
+ if ( mm > 9 ) consensus[0][i] = Character.forDigit(mm/10,10);
+ else consensus[0][i] = ' ';
+ consensus[1][i] = Character.forDigit(mm%10,10);
+ } else {
+ consensus[0][i] = consensus[1][i] = consensus[2][i] = ' ';
+ }
+ }
+
+ b.append(" "); b.append(consensus[0]); b.append('\n');
+ b.append(" "); b.append(consensus[1]); b.append('\n');
+ b.append(" "); b.append(consensus[2]); b.append('\n');
+ b.append(" "); b.append(consensus[3]); b.append('\n');
+
Integer[] perm = null;
- if ( inorder ) perm = SortPermutation(alignment_offsets);
+ if ( inorder ) perm = Utils.SortPermutation(alignment_offsets);
for ( int i = 0 ; i < seqs.size() ; i++ ) {
int index = (inorder ? perm[i] : i);
@@ -248,8 +293,12 @@ public class MultipleAlignment implements Iterable {
*/
public Iterator sequenceIdIterator() { return index.keySet().iterator(); }
+ /** Returns an iterator over external seuqnce ids of the sequences stored in the pile, presenting them in
+ * the order of ascending alignment offsets.
+ * @return
+ */
public Iterator sequenceIdByOffsetIterator() {
- final Integer[] perm = SortPermutation(alignment_offsets);
+ final Integer[] perm = Utils.SortPermutation(alignment_offsets);
return new Iterator() {
private int i = 0;
public boolean hasNext() {
diff --git a/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java b/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java
index 87ceb3dee..14f411a0a 100755
--- a/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java
+++ b/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java
@@ -46,8 +46,26 @@ public class PairwiseAlignment {
* if s2 is shifted left (starts earlier) wrt s1
*/
public int getBestOffset2wrt1() { return alignment_offset; }
+
+ /** Returns offset of the sequence j wrt sequence i in the best pairwise alignment found.
+ *
+ * @param i extrenal id of a sequence, must be one of the sequences kept by this alignment
+ * @param j extrenal id of a sequence, must be one of the sequences kept by this alignment
+ * @return offset of 2nd arg (j) wrt to the first arg (i)
+ */
+ public int getBestOffset2wrt1(int i, int j ) {
+ if ( i == i1 && j == i2 ) return alignment_offset;
+ else if ( i == i2 && j == i1 ) return -alignment_offset;
+ throw new RuntimeException("Specified sequence id not found in the alignment");
+ }
+
public String getSequence1() { return s1; }
public String getSequence2() { return s2; }
+ public String getSequenceById(int i) {
+ if ( i == i1 ) return s1;
+ else if ( i == i2 ) return s2;
+ throw new RuntimeException("Specified sequence id not found in the alignment");
+ }
public int id1() { return i1;}
public int id2() { return i2;}
@@ -78,6 +96,16 @@ public class PairwiseAlignment {
return Math.min(s2.length()+alignment_offset, s1.length());
}
}
+
+ public static int getOverlap(String seq1, String seq2, int offset2wrt1) {
+ int L ;
+ if ( offset2wrt1 >= 0 ) {
+ L = Math.min(seq1.length()-offset2wrt1, seq2.length());
+ } else {
+ L = Math.min(seq2.length()+offset2wrt1, seq1.length());
+ }
+ return ( L < 0 ? 0 : L );
+ }
/** Returns true if at least one alignment, no matter how bad, was found between the two sequences
* (i.e. the sequences have at least one kmer in common).
@@ -116,7 +144,8 @@ public class PairwiseAlignment {
int pos2 = ( offset2wrt1 >= 0 ? 0 : -offset2wrt1 );
int cnt = 0;
while ( pos1 < seq1.length() && pos2 < seq2.length() ) {
- if ( seq1.charAt(pos1++) == seq2.charAt(pos2++) ) continue;
+ if ( Character.toUpperCase(seq1.charAt(pos1++)) ==
+ Character.toUpperCase(seq2.charAt(pos2++)) ) continue;
cnt++; // found mismatch
}
return cnt;
@@ -127,7 +156,8 @@ public class PairwiseAlignment {
int pos2 = ( offset2wrt1 >= 0 ? 0 : -offset2wrt1 );
int cnt = 0;
while ( pos1 < seq1.length() && pos2 < seq2.length() && cnt < maxerr ) {
- if ( seq1.charAt(pos1++) == seq2.charAt(pos2++) ) continue;
+ if ( Character.toUpperCase(seq1.charAt(pos1++)) ==
+ Character.toUpperCase(seq2.charAt(pos2++)) ) continue;
cnt++; // found mismatch
}
return cnt;
@@ -157,6 +187,14 @@ public class PairwiseAlignment {
return ( l / (double)L );
}
+ public static double distance(String seq1, String seq2, int offset2wrt1) {
+ int L = getOverlap(seq1,seq2,offset2wrt1);
+ if ( L <= 0 ) return 1e100;
+ int mm = countMismatches(seq1,seq2,offset2wrt1);
+ double l = ( mm == 0 ? 1.0 : (double)mm + Math.sqrt((double)mm) );
+ return ( l / (double) L );
+ }
+
}
diff --git a/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java b/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java
index 801f7c504..396d12a72 100755
--- a/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java
+++ b/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java
@@ -2,16 +2,26 @@ package org.broadinstitute.sting.indels;
import net.sf.samtools.SAMRecord;
-import java.util.Collection;
-import java.util.Iterator;
+import java.util.*;
public class PileBuilder implements RecordPileReceiver {
- private StrictlyUpperTriangularMatrix distances ;
+ private SymmetricMatrix distances ;
private Matrix alignments ;
private static final int KmerSize = 8;
private MultipleAlignment alignments1;
private MultipleAlignment alignments2;
+ private String referenceSequence;
+ private int reference_start;
+
+ private static class IntPair {
+ int first;
+ int second;
+ public IntPair(int i, int j) { set(i,j); }
+ public int first() { return first; }
+ public int second() { return second; }
+ public void set(int i, int j) { first = i; second = j; }
+ }
private static class SelectedPair {
private int i_;
@@ -46,32 +56,88 @@ public class PileBuilder implements RecordPileReceiver {
}
}
+
+ public class SelectedSequence {
+ private int id_;
+ private double d_;
+
+ private SelectedSequence(int i, double d) {
+ set(i,d);
+ }
+
+ private SelectedSequence() { this(-1,1e100) ; }
+ private void set(int i, double d) { id_ = i; d_ = d; }
+
+ public double d() { return d_;}
+ public int i() { return id_; }
+
+ }
- public PileBuilder() {}
+ public PileBuilder() {
+ referenceSequence = null;
+ reference_start = -1;
+ }
+
+ public void setReferenceSequence(String seq, int start) {
+ referenceSequence = seq;
+ reference_start = start;
+ }
+
+ public void setReferenceSequence(String seq) {
+ referenceSequence = seq;
+ reference_start = -1;
+ }
public void receive(Collection c) {
IndexedSequence[] seqs = new IndexedSequence[c.size()];
int i = 0;
+ int left = 1000000000;
+ int right = 0;
for ( SAMRecord r : c ) {
seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize);
+ left = Math.min(left, r.getAlignmentStart() );
+ right = Math.max(right,r.getAlignmentEnd());
}
- doMultipleAlignment(seqs);
- System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
- System.out.print("; diameter of PILE1: "+ diameter(alignments1));
- System.out.println("; diameter of PILE2: "+ diameter(alignments2));
- System.out.println("PILE 1: \n"+alignments1.toString());
- System.out.println("PILE 2: \n"+alignments2.toString());
+ SequencePile originalAligns = new SequencePile(referenceSequence.substring(left-1,right));
+ for ( SAMRecord r : c ) {
+ originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(),
+ r.getCigar(), r.getAlignmentStart() - left );
+ }
+ System.out.println("\n#############################################################################");
+ System.out.println("ORIGINAL ALIGNMENT: \n");
+ originalAligns.colorprint(true);
+ System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
+ List piles = doMultipleAlignment2(seqs);
+// System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
+// System.out.print("; diameter of PILE1: "+ diameter(alignments1));
+// System.out.println("; diameter of PILE2: "+ diameter(alignments2));
+
+// System.out.println("PILE 1: \n"+alignments1.toString());
+// System.out.println("PILE 2: \n"+alignments2.toString());
+
+ SymmetricMatrix d = new SymmetricMatrix(piles.size());
+ for ( int n = 0 ; n < piles.size() ; n++ ) {
+ d.set(n,n,diameter(piles.get(n)));
+ for ( int m = n+1 ; m < piles.size() ; m++ ) {
+ d.set(n,m,distance(piles.get(n), piles.get(m)));
+ }
+ }
+ System.out.println(d.format("%8.4g"));
+ for ( int n = 0 ; n < piles.size() ; n++ ) {
+ System.out.println("PILE " + n +"\n" +piles.get(n).toString());
+ }
}
public void initPairwiseAlignments( IndexedSequence [] seqs ) {
- distances = new StrictlyUpperTriangularMatrix( seqs.length );
+ distances = new SymmetricMatrix( seqs.length );
alignments = new Matrix( seqs.length );
for( int i = 0; i < seqs.length ; i++ ) {
for ( int j = i+1 ; j < seqs.length ; j++ ) {
PairwiseAlignment a = new PairwiseAlignment(seqs[i],seqs[j],i,j); // compute pairwise alignment
alignments.set(i, j, a); // save it
+ alignments.set(j, i, a); // save it
distances.set(i,j,a.distance());
}
}
@@ -146,21 +212,152 @@ public class PileBuilder implements RecordPileReceiver {
SelectedPair p = new SelectedPair(-1,-1,1e100);
for ( Integer id : a ) {
- for (int i = 0; i < id; i++) {
+ for (int i = 0; i < distances.size(); i++) {
if (a.contains(i)) continue; // a already contains both sequences (i,id)
double d = distances.get(i, id);
if (d < p.d() ) p.set(i,id,d);
}
- for (int j = id + 1; j < distances.size() ; j++) {
- if (a.contains(j)) continue; // a already contains both sequences (id, j)
- double d = distances.get(id, j);
- if (d < p.d()) p.set(id,j,d);
- }
}
return p;
}
-
+ public SelectedPair findClosestToPileAverage(MultipleAlignment a) {
+
+ SelectedPair p = new SelectedPair(-1,-1,1e100);
+
+ // currently, we compute the average distance from each sequence to the pile, but if the average
+ // distance is small enough, we will try to stitch that sequence to the pile based on the *best*
+ // available pairwise alignment, best_id will keep the id of that sequence from the pile that
+ // has the best alignment with the sequence that is the closest on average
+ int best_id=-1;
+
+ Set offsets = new HashSet(); // all putative offsets suggested by different p-wise alignments
+ for ( int i = 0 ; i < distances.size() ; i++ ) { // for each sequence i
+
+ if ( a.contains(i) ) continue; // sequence i is already in the pile, ignore it
+
+ offsets.clear();
+
+ for ( Integer id : a ) { // for all sequences from the msa pile
+ PairwiseAlignment pa = alignments.get(i,id);
+ if ( pa.getOverlap() <= 0 ) continue; // at this step we do not take into account sequences with no overlap
+ // alignment pa suggests this offset of i wrt the first sequence in the msa
+ offsets.add( pa.getBestOffset2wrt1(id,i)+a.getOffsetById(id));
+ }
+ // we got all suggested offsets; now lets recompute distances:
+
+ for( Integer off : offsets ) {
+ SelectedPair spo = averageDistanceForOffset(a,i,off);
+ if ( spo.d() < p.d() ) p.set(spo.i(),spo.j(),spo.d());
+ }
+ }
+ return p;
+ }
+
+ public Matrix averageClosestDistanceMatrix(List la, int n) {
+
+ Matrix mp = new Matrix(n);
+
+ for ( int i = 0 ; i < n ; i++ ) {
+ for ( int j = i + 1 ; j < n ; j++ ) {
+ mp.set(i,j, findBestAlignment(la.get(i),la.get(j)) );
+ mp.set(j,i, mp.get(i,j) );
+ }
+ }
+ return mp;
+ }
+
+ public SelectedPair findBestAlignment(MultipleAlignment a1, MultipleAlignment a2) {
+
+ Map all_offsets = new HashMap();
+ SelectedPair p = new SelectedPair(-1,-1,1e100);
+
+ for ( Integer id1 : a1 ) {
+ for ( Integer id2 : a2 ) {
+ PairwiseAlignment pa = alignments.get(id1,id2);
+ if ( pa.getOverlap() <= 0 ) continue; // id1 and id2 do not overlap and/or we don't have p-wise alignment
+
+ // record suggested offset of a2 wrt a1 (defined by their first sequences), and remember the
+ // pairwise alignment that suggested it
+ int suggested_offset = a1.getOffsetById(id1) + pa.getBestOffset2wrt1(id1,id2) - a2.getOffsetById(id2);
+ if ( ! all_offsets.containsKey(suggested_offset) ) {
+ all_offsets.put( suggested_offset , new IntPair(id1,id2)) ;
+ }
+ }
+ }
+ for ( Map.Entry offset_record : all_offsets.entrySet() ) {
+
+ double d = averageDistanceForOffset(a1,a2,offset_record.getKey());
+ if ( d < p.d() ) p.set(offset_record.getValue().first(),offset_record.getValue().second(),d);
+ }
+ return p;
+ }
+
+ public double averageDistanceForOffset(MultipleAlignment a1, MultipleAlignment a2, int offset) {
+ SelectedPair p = new SelectedPair();
+
+ double d_av = 0;
+ int nseq = 0;
+ int i1 = -1;
+ int i2 = -1;
+
+ for ( Integer id2 : a2 ) {
+ SelectedPair spo = averageDistanceForOffset(a1,id2,offset+a2.getOffsetById(id2));
+ if ( spo.d() > 1e99 ) continue;
+ nseq++;
+ d_av += spo.d();
+ }
+ if ( nseq == 0 ) return 1e100;
+ d_av /= nseq;
+ return d_av;
+ }
+
+ /** Computes average distance from sequence i to multiple alignment a for the specified offset of i wrt a
+ * and returns that distance and pair of sequence indices, on which the specified offset is realized
+ * @param a
+ * @param i
+ * @param offset
+ * @return
+ */
+ public SelectedPair averageDistanceForOffset(MultipleAlignment a, int i, int offset) {
+
+ SelectedPair p = new SelectedPair(-1,-1,1e100);
+
+ double d = 0; // will hold average distance
+ double dmin = 1e100; // used to find the nearest individual sequence in the pile
+ int nseq = 0; // number of sequences in the pile that have distance to sequence i defined
+ int best_id = -1;
+
+ for ( Integer id : a ) { // for all sequences from the msa pile
+ PairwiseAlignment pa = alignments.get(i,id);
+ int new_off = offset - a.getOffsetById(id); // offset of i wrt id as suggested by
+ double dist_for_off; // distance between i and id for the given offset off
+
+ // check if p-wise alignment has data for the specified offset:
+ boolean canuse = false;
+ if ( pa.alignmentExists() && pa.getBestOffset2wrt1(id,i) == new_off ) {
+ dist_for_off = distances.get(i,id);
+ canuse = true; // can use this alignment to stitch i to a
+ }
+ else {
+ // offset is different from what the pwise alignment suggests; recompute!
+ dist_for_off = PairwiseAlignment.distance(pa.getSequenceById(id),pa.getSequenceById(i),new_off);
+ }
+ if ( dist_for_off > 1e99 ) continue; // at this offset, i and id do not overlap, go check next id
+ d += dist_for_off;
+ nseq++;
+ if ( dist_for_off < dmin && canuse ) {
+ dmin = dist_for_off;
+ best_id = id;
+ }
+
+ }
+ if ( nseq == 0 ) return p;
+ d /= (double)nseq;
+ p.set(i,best_id,d);
+ return p;
+ }
+
/** Finds, among all sequences, the one farthest from the specified pile. Being
* the 'farthest' is defined as having the largest lower bound of the distances to all sequences in the pile.
*
@@ -178,7 +375,7 @@ public class PileBuilder implements RecordPileReceiver {
double d_min = 1e100; // smallest distance from sequence i to the pile
for ( Integer id : a ) {
- double d = ( i < id ? distances.get(i, id) : distances.get(id,i) );
+ double d = distances.get(i, id) ;
if (d < d_min ) d_min = d;
}
// d_min is the smallest distance from sequence i to pile a
@@ -195,8 +392,7 @@ public class PileBuilder implements RecordPileReceiver {
double d = 1e100;
for ( Integer id1 : a1 ) {
for ( Integer id2 : a2 ) {
- if ( id1 < id2 && distances.get(id1,id2) < d ) d = distances.get(id1,id2);
- if ( id1 > id2 && distances.get(id2,id1) < d ) d = distances.get(id2,id1);
+ if ( distances.get(id1,id2) < d ) d = distances.get(id1,id2);
}
}
return d;
@@ -217,7 +413,7 @@ public class PileBuilder implements RecordPileReceiver {
double d = 1e100; // will hold distance from id1 to its closest neighbor
for ( Integer id2 : a ) {
if ( id2 == id1 ) continue;
- double dpair = ( id1 < id2 ? distances.get(id1,id2) : distances.get(id2,id1) );
+ double dpair = distances.get(id1,id2) ;
d = Math.min(d,dpair);
}
// d = distance from id1 to its closest neighbor within the pile
@@ -240,7 +436,8 @@ public class PileBuilder implements RecordPileReceiver {
PileBuilder pb = new PileBuilder();
- pb.doMultipleAlignment(seqs);
+ //pb.doMultipleAlignment(seqs);
+ pb.doMultipleAlignment2(seqs);
System.out.print("Distance between final piles: "+pb.distance(pb.alignments1, pb.alignments2));
System.out.print("; diameter of PILE1: "+ pb.diameter(pb.alignments1));
System.out.println("; diameter of PILE2: "+ pb.diameter(pb.alignments2));
@@ -252,6 +449,8 @@ public class PileBuilder implements RecordPileReceiver {
public void doMultipleAlignment(IndexedSequence[] seqs) {
// two piles we are going to grow until all sequences are assigned to one of them.
// we intend to keep the piles disjoint, e.g. no sequence should be placed in both
+
+
MultipleAlignment pile1 = new MultipleAlignment();
MultipleAlignment pile2 = new MultipleAlignment();
@@ -260,7 +459,7 @@ public class PileBuilder implements RecordPileReceiver {
// all the pairwise alignments are computed and disjoint best and next-best pairs are found
- //System.out.println( distances.format("%8.4g "));
+// System.out.println( distances.format("%8.4g "));
SelectedPair pworst = findWorst();
@@ -268,13 +467,14 @@ public class PileBuilder implements RecordPileReceiver {
pile1.add(seqs[pworst.i()].getSequence(), pworst.i());
pile2.add(seqs[pworst.j()].getSequence(), pworst.j());
-/*
+
// initialize piles with best and next-best pairs
+/*
SelectedPair p_best = findClosestPair();
SelectedPair p_nextbest = findNextClosestPairAfter(p_best);
- pile1.add( alignments.get(p_best.i(), p_best.j()),p_best.i(),p_best.j());
- pile2.add( alignments.get(p_nextbest.i(), p_nextbest.j()),p_nextbest.i(),p_nextbest.j());
-*/
+ pile1.add( alignments.get(p_best.i(), p_best.j()));
+ pile2.add( alignments.get(p_nextbest.i(), p_nextbest.j()));
+*/
/*
System.out.println("Best pair ("+p_best.i() + "," + p_best.j()+", d="+p_best.d()+"):");
System.out.println(pile1.toString());
@@ -283,39 +483,44 @@ public class PileBuilder implements RecordPileReceiver {
*/
SelectedPair p1 = null;
SelectedPair p2 = null;
-
+
// grow piles hierarchical clustering-style
while ( pile1.size() + pile2.size() < seqs.length ) {
// find candidate sequences closest to each of the two piles
- p1 = findClosestToPile(pile1);
- p2 = findClosestToPile(pile2);
+
+// p1 = findClosestToPileAverage(pile1); // findClosestToPile(pile1);
+// p2 = findClosestToPileAverage(pile2); //findClosestToPile(pile2);
+ p1 = findClosestToPile(pile1); // findClosestToPile(pile1);
+ p2 = findClosestToPile(pile2); //findClosestToPile(pile2);
int id1_cand = pile1.selectExternal(p1.i(), p1.j()); // id of the sequence closest to the pile 1
int id2_cand = pile2.selectExternal(p2.i(), p2.j()); // id of the sequence closest to the pile 2
if ( pile2.contains(id1_cand) && pile1.contains(id2_cand)) {
// pile1 and pile 2 are mutually the closest, so we need to merge them.
// if piles are mutually the closest, then p1 and p2 are the same pair (id1, id2),
// so we just merge on one of the (redundant) instances:
- pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j());
+ pile1.add(pile2, alignments.get( p1.i(), p1.j()));
pile2.clear(); // need to reset pile 2 to something else
int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1
pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence
} else {
if ( p1.d() < p2.d() ) {
if ( pile2.contains(id1_cand) ) {
- pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j());
+ pile1.add(pile2, alignments.get( p1.i(), p1.j()));
pile2.clear(); // need to reset pile 2 to something else
int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1
pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence
} else pile1.add( alignments.get(p1.i(), p1.j()) );
} else {
if ( pile1.contains(id2_cand) ) {
- pile2.add(pile1, alignments.get( p2.i(), p2.j()),p2.i(), p2.j());
+ pile2.add(pile1, alignments.get( p2.i(), p2.j()));
pile1.clear(); // need to reset pile 2 to something else
int z = findFarthestFromPile(pile2); // get sequence farthest from merged pile 1
pile1.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence
} else pile2.add( alignments.get(p2.i(), p2.j()) );
}
}
+ System.out.println("PILE 1: \n"+pile1.toString());
+ System.out.println("PILE 2: \n"+pile2.toString());
} // end WHILE
alignments1 = pile1;
@@ -327,6 +532,56 @@ public class PileBuilder implements RecordPileReceiver {
}
*/
}
+
+public List doMultipleAlignment2(IndexedSequence[] seqs) {
+
+ initPairwiseAlignments(seqs);
+
+ List piles = new LinkedList();
+
+ int npiles = seqs.length;
+
+ for ( int i = 0 ; i < seqs.length ; i++ ) {
+ MultipleAlignment m = new MultipleAlignment();
+ m.add(seqs[i].getSequence(),i);
+ piles.add(m);
+ }
+
+ while ( npiles > 2 ) {
+ Matrix dist = averageClosestDistanceMatrix(piles,npiles);
+ int best_i = -1;
+ int best_j = -1;
+ int pile_i = -1;
+ int pile_j = -1;
+ double d = 1e100;
+ for ( int i = 0 ; i < npiles ; i++ ) {
+ for ( int j = i+1 ; j < npiles ; j++ ) {
+ SelectedPair p = dist.get(i,j);
+ if ( p.d() < d ) {
+ d = p.d();
+ pile_i = i;
+ pile_j = j;
+ best_i = p.i();
+ best_j = p.j();
+ }
+ }
+ }
+// System.out.println("joining pile "+pile_i +" and pile " + pile_j +" on seqs " + best_i +" and " + best_j );
+ // got the closest pair
+ piles.get(pile_i).add(piles.get(pile_j),alignments.get(best_i,best_j));
+// System.out.println("JOINED PILE: \n"+piles.get(pile_i).toString());
+ piles.remove(pile_j);
+ npiles--;
+ }
+
+ alignments1 = piles.get(0);
+ alignments2 = piles.get(1);
+
+
+// System.out.println("PILE 1: \n"+piles.get(0).toString());
+// System.out.println("PILE 2: \n"+piles.get(1).toString());
+ return piles;
+}
public static IndexedSequence[] testSet1(int K) {
IndexedSequence [] seqs = new IndexedSequence[9];
diff --git a/playground/java/src/org/broadinstitute/sting/indels/StrictlyUpperTriangularMatrix.java b/playground/java/src/org/broadinstitute/sting/indels/StrictlyUpperTriangularMatrix.java
index 37e346eef..54f927027 100755
--- a/playground/java/src/org/broadinstitute/sting/indels/StrictlyUpperTriangularMatrix.java
+++ b/playground/java/src/org/broadinstitute/sting/indels/StrictlyUpperTriangularMatrix.java
@@ -1,81 +1,23 @@
package org.broadinstitute.sting.indels;
-public class StrictlyUpperTriangularMatrix {
- private double [] data;
- private int size;
-
+public class StrictlyUpperTriangularMatrix extends SymmetricMatrix {
+
public StrictlyUpperTriangularMatrix(int dimension) {
+ super(dimension);
assert dimension >=2 : "Distance matrix can not be smaller than 2x2";
- if ( dimension % 2 == 0 ) {
- int k = dimension >> 1; // dimension/2
- data = new double[k*(dimension-1)];
- } else {
- int k = ( dimension -1 ) >> 1; // (dimension -1)/2
- data = new double[k*dimension];
- }
- size = dimension;
}
public double get(int i, int j) {
- assert (i < size) && (j= j ) return 0.0;
-
- // we are guaranteed now that i < j; now
- // translate i,j into the linear offset into our internal data array and return the value:
- return data[ linearOffset(i,j) ];
+ return super.get(i,j);
}
public void set(int i, int j, double value) {
- assert (i < size) && (j>= 1; // k/=2
- // now k is the offset of the first stored element in row i
- return ( k + (j - i - 1));
- }
+
private static void testMe() {
StrictlyUpperTriangularMatrix m = new StrictlyUpperTriangularMatrix(3);
diff --git a/playground/java/src/org/broadinstitute/sting/indels/SymmetricMatrix.java b/playground/java/src/org/broadinstitute/sting/indels/SymmetricMatrix.java
new file mode 100644
index 000000000..9a1ec0459
--- /dev/null
+++ b/playground/java/src/org/broadinstitute/sting/indels/SymmetricMatrix.java
@@ -0,0 +1,87 @@
+package org.broadinstitute.sting.indels;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Mar 22, 2009
+ * Time: 3:43:05 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class SymmetricMatrix {
+
+ protected double [] data;
+ protected int size;
+
+ public SymmetricMatrix(int dimension) {
+ assert dimension >= 0 : "Matrix size can not be negative";
+ if ( dimension % 2 == 0 ) {
+ int k = dimension >> 1; // dimension/2
+ data = new double[k*(dimension+1)];
+ } else {
+ int k = ( dimension + 1 ) >> 1; // (dimension + 1)/2
+ data = new double[k*dimension];
+ }
+ size = dimension;
+ }
+
+ public double get(int i, int j) {
+ assert (i < size) && ( j < size) : "Out of bound index into matrix detected";
+ if ( i >= j ) return data[linearOffset(j,i)]; // we store only the upper triangle in memory
+ return data[ linearOffset(i,j) ];
+ }
+
+ public void set(int i, int j, double value) {
+ assert (i < size) && (j < size) : "Out of bound index into matrix detected";
+
+ if ( i >= j ) data[ linearOffset(j,i) ] = value;
+ else data[ linearOffset(i,j) ] = value;
+ }
+
+ public int size() { return size; }
+
+ public int nRows() { return size; }
+ public int nCols() { return size; }
+
+ /** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!);
+ * each element is formatted according to a default format.
+ * @return
+ * @see #format(String f)
+ */
+ public String format() {
+ return format("%6.3f ");
+ }
+
+
+ /** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!);
+ * each element is formatted according to a specified format string (note: format string must include all desired
+ * whitespaces before and/or after an element, as this method itself does not add any spaces between the formatted elements).
+ * @return
+ * @see #format()
+ */
+ public String format(String f) {
+ StringBuilder b = new StringBuilder();
+ java.util.Formatter frm = new java.util.Formatter(b);
+ for ( int i = 0 ; i < size ; i++ ) {
+ for ( int j = 0 ; j < size ; j++ ) {
+ frm.format(f, get(i,j));
+ }
+ b.append('\n');
+ }
+ return b.toString();
+ }
+
+
+ /** Computes linear offset into the internal array that keeps actual data, given "row" and "column" indices
+ * into the matrix; this method is unchecked, but it expects i <= j otherwise the result is unspecified.
+ * @param i row index
+ * @param j column index
+ * @return linear offset into the data[] member of this class
+ */
+ private int linearOffset(int i, int j) {
+ int k = (( size << 1 ) - i + 1)*i; // [ 2*d - (i+1) ] * i
+ k >>= 1; // k/=2
+ // now k is the offset of the first stored element in row i
+ return ( k + j - i );
+ }
+
+}