diff --git a/playground/java/src/org/broadinstitute/sting/indels/ConsensusSequence.java b/playground/java/src/org/broadinstitute/sting/indels/ConsensusSequence.java new file mode 100644 index 000000000..6cbd688f0 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/indels/ConsensusSequence.java @@ -0,0 +1,122 @@ +package org.broadinstitute.sting.indels; + +import java.util.List; +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Mar 21, 2009 + * Time: 4:25:12 PM + * To change this template use File | Settings | File Templates. + */ +public class ConsensusSequence { + private List< int[] > coverage; // counts of observations of every nucleotide at each position + private long referencePos;// (arbitrary) reference position; when adding sequences, their offsets should be wrt this position + private int startOffset; // offset of the leftmost base of this consensus sequence wrt referencePos; negative=left, positive=right + private final static int NUMBINS = 4; + + public ConsensusSequence(int refPos) { + coverage = new ArrayList< int[] >(); + referencePos = refPos; + startOffset = 0; + } + + public ConsensusSequence() { + this(0); + } + + /** Adds sequence se to the consensus, in the sense that all bases of seq are counted + * into observed coverage kept by the consensus. The position of the sequence is specified by the + * offset with respect to the fixed reference position of the consensus (the latter does not + * have to be consensus start), and if the sequence extends beyound the consensus on either end, the + * consensus will be extended appropriately to accomodate the full sequence. + * @param seq nucleotide sequence ('ACGT...') + * @param offset position of the start of the sequence relative to the fixed reference position of the consensus + */ + public void addSequence(String seq, int offset) { + // if sequence starts before than the currently held consensus oes, extend consensus to the left + if ( offset < startOffset ) { + coverage.addAll(0,instantiateCoverageList(startOffset-offset)); + startOffset = offset; + } + // if the sequence ends beyound the currently held consensus, extend consensus to the right + if ( offset + seq.length() > startOffset + coverage.size() ) { + coverage.addAll( instantiateCoverageList(offset+seq.length() - startOffset - coverage.size()) ); + } + + // count bases from the sequence into the coverage + int posOnConsensus = offset - startOffset; + for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) { + coverage.get(posOnConsensus)[baseToInt(seq.charAt(i))]++; + } + } + + /** Removes sequence seq from the consensus. More exactly, 1 will be subtracted from current + * observation counts kept by the consensus for each observed base at every position of the sequence. The + * position of the sequence is specified by the offset with respect to the reference position + * of the consensus. NOTE: this method is unchecked and does not verify that the sequence being subtracted + * was indeed previously added to the consensus and/or that the consenus does accomodate full length of + * the sequence. If it is not the case, the results can be unpredictable or assert failure may occur. + * + * @param seq nucleotide sequence ('ACGT...') + * @param offset position of the start of the sequence relative to the fixed reference position of the consensus + */ + public void removeSequence(String seq, int offset) { + assert offset >= startOffset : + "Attempt to remove from consensus a sequence that starts prior to consenus start"; + assert (offset+seq.length() < startOffset + coverage.size()) : + "Attempt to remove from consensus a sequence that extends beyond consensus end"; + // subtract sequence bases from the coverage + int posOnConsensus = offset - startOffset; + for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) { + coverage.get(posOnConsensus)[ baseToInt(seq.charAt(i)) ]--; + } + } + + /** Returns the "distance" (score measuring the agreement) from the currently held consensus sequence to + * the specified sequence seq starting at position offset wrt consenus reference position. + * @param seq + * @param offset + * @return + */ + public double distance(String seq, int offset) { + int posOnConsensus; // index into the currently held consensus sequence + int i ; // index into the passed sequence argument + if ( offset < startOffset ) { + posOnConsensus = 0; + i = startOffset - offset; + } else { + i = 0 ; + posOnConsensus = offset - startOffset; + } + // stop position on the passed sequence (can be less than sequence length if consensus stops prematurely) + int stop = Math.min(offset+seq.length(), startOffset+coverage.size() ) - offset; + + for ( ; i < stop ; i++, posOnConsensus++ ) { + int base = baseToInt(seq.charAt(posOnConsensus)); + int [] cov = coverage.get(posOnConsensus); + int totalcov = cov[0]+cov[1]+cov[2]+cov[3]; + + } + return 0.0; + } + + private List instantiateCoverageList(int n) { + List< int[] > subseq = new ArrayList(n); + for ( int i = 0 ; i < n ; i++ ) subseq.add(new int[NUMBINS]); + return subseq; + } + + private int baseToInt(char c) { + int base; + switch( Character.toUpperCase(c) ) { + case 'A': base = 0; break; + case 'C': base = 1; break; + case 'G': base = 2; break; + case 'T': base = 3; break; + default : throw new IllegalArgumentException("Sequence can contain only ACGT symbols"); + } + return base; + } +} diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java index 86aca048d..2d1e0e94a 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java +++ b/playground/java/src/org/broadinstitute/sting/indels/IndelInspector.java @@ -60,8 +60,9 @@ public class IndelInspector extends CommandLineProgram { ReferenceSequence contig_seq = null; IndelRecordPileCollector col = null; + PileBuilder pileBuilder = new PileBuilder(); try { - col = new IndelRecordPileCollector(new DiscardingReceiver(), new PileBuilder() ); + col = new IndelRecordPileCollector(new DiscardingReceiver(), pileBuilder ); } catch(Exception e) { System.err.println(e.getMessage()); } if ( col == null ) return 1; @@ -78,6 +79,9 @@ public class IndelInspector extends CommandLineProgram { if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break; if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) { contig_seq = reference.get(r.getReferenceIndex()); + String refstr = new String(contig_seq.getBases()); + col.setReferenceSequence(refstr); + pileBuilder.setReferenceSequence(refstr); System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString()); } } @@ -85,9 +89,12 @@ public class IndelInspector extends CommandLineProgram { // if contig is specified and wqe did not reach it yet, skip the records until we reach that contig: if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue; + + if ( location != null && r.getAlignmentEnd() < location.getStart() ) continue; + // if stop position is specified and we are past that, stop reading: if ( location != null && r.getAlignmentStart() > location.getStop() ) break; - + if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY")==1 ) continue; // skip chrM and unplaced contigs for now int err = -1; diff --git a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java index b86645403..a378bce11 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java +++ b/playground/java/src/org/broadinstitute/sting/indels/IndelRecordPileCollector.java @@ -89,6 +89,8 @@ public class IndelRecordPileCollector implements RecordReceiver { private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there + private String referenceSequence; + public String memStatsString() { String s = "mRecordPile: "; return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef; @@ -110,6 +112,7 @@ public class IndelRecordPileCollector implements RecordReceiver { } defaultReceiver = rr; indelPileReceiver = rp; + referenceSequence = null; setWaitState(); } @@ -124,7 +127,11 @@ public class IndelRecordPileCollector implements RecordReceiver { avoiding_region = false; mState = WAIT_STATE; // got to do this if we were in avoid_region state } - + + public void setReferenceSequence(String contig) { + referenceSequence = contig; + } + /** A utility method: emits into nonindelReceiver and purges from the currently held SAM record pile * all the consequtive records with alignment end positions less than or equal to the specified * position pos, until the first record is encountered that does not meet this condition. Note that diff --git a/playground/java/src/org/broadinstitute/sting/indels/Matrix.java b/playground/java/src/org/broadinstitute/sting/indels/Matrix.java index aeca77dec..094f32fc1 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/Matrix.java +++ b/playground/java/src/org/broadinstitute/sting/indels/Matrix.java @@ -1,22 +1,37 @@ package org.broadinstitute.sting.indels; public class Matrix { - private int size; + private int nRows; + private int nCols; private Object [][] data; - - public Matrix(int n) { - size = n; - data = new Object[n][n]; + + /** Instantiates a generic matrix of objects with n rows and m cols. + * + * @param n number of rows + * @param m number of columns + */ + public Matrix(int n, int m) { + nRows = n; + nCols = m; + data = new Object[n][m]; } - + + /** Instantiates a square n x n matrix of objects. + * + * @param n size of the matrix + */ + public Matrix (int n) { + this(n,n); + } + @SuppressWarnings("unchecked") public T get(int i, int j) { - assert (i { @@ -26,26 +27,6 @@ public class MultipleAlignment implements Iterable { ext_ids.clear(); } - public static Integer[] SortPermutation( List A ) - { - final Object[] data = A.toArray(); - - class comparator implements Comparator - { - public int compare(Integer a, Integer b) - { - return ((T)data[a]).compareTo(data[b]); - } - } - Integer[] permutation = new Integer[A.size()]; - for (int i = 0; i < A.size(); i++) - { - permutation[i] = i; - } - Arrays.sort(permutation, new comparator()); - return permutation; - } - /** Adds single sequence with id set to i. Pile must be empty, or IllegalStateException will be thrown * * @param seq sequence to add @@ -108,7 +89,8 @@ public class MultipleAlignment implements Iterable { alignment_offsets.add( -a.getBestOffset2wrt1() + alignment_offsets.get( second )); } } - + + /** Returns sequence associated with the specified external id, or null if sequence with this external id is not found in the pile * * @param id query id @@ -154,7 +136,13 @@ public class MultipleAlignment implements Iterable { alignment_offsets.add(off2+a.getOffsetById(id)); } } - + + + public void add(MultipleAlignment a, PairwiseAlignment p) { + if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set"); + add(a,p,p.id1(),p.id2()); + } + /** Returns true if the alignment already contains sequence with the specified id * * @param id @@ -168,21 +156,20 @@ public class MultipleAlignment implements Iterable { return PairwiseAlignment.countMismatches(getSequenceById(i), getSequenceById(j), getOffsetById(j)-getOffsetById(i)); } - /** Returns the length of the overlapping region of sequences si and sj . + /** Returns the length of the overlapping region of the two sequences specified by their external ids i and j. * * @return overlap size */ public int getOverlap(int i, int j) { - if ( ! contains(i) || ! contains(j) ) return -1; + if ( ! contains(i) || ! contains(j) ) throw new RuntimeException("Sequence with specified id is not in MSA pile"); int off = getOffsetById(j) - getOffsetById(i); - if ( off >= 0 ) { - return Math.min(getSequenceById(i).length()-off, getSequenceById(j).length()); - } else { - return Math.min(getSequenceById(j).length()+off, getSequenceById(i).length()); - } + int L; + if ( off >= 0 ) L = Math.min(getSequenceById(i).length()-off, getSequenceById(j).length()); + else L = Math.min(getSequenceById(j).length()+off, getSequenceById(i).length()); + return ( L < 0 ? 0 : L ); } - /** Given the two indices, one of which has to be already in the pile, returns the one that is not in the pile. + /** Given the two sequence ids, one of which has to be already in the pile, returns the one that is not in the pile. * * @param i sequence id * @param j sequence id @@ -199,13 +186,23 @@ public class MultipleAlignment implements Iterable { } } - public String skipN(int n) { + /** Returns a string consisting of n spaces. + * + * @param n + * @return + */ + private String skipN(int n) { StringBuilder b=new StringBuilder(); for ( int k = 0 ; k < n ; k++ ) b.append(' '); return b.toString(); } - public void skipN(int n, StringBuilder b) { + /** Prints n spaces directly into the specified string builder. + * + * @param n + * @param b + */ + private void skipN(int n, StringBuilder b) { for ( int k = 0 ; k < n ; k++ ) b.append(' '); } @@ -220,12 +217,60 @@ public class MultipleAlignment implements Iterable { if ( seqs.size() == 0 ) return b.toString(); int skip_first = 0; + int msa_length = 0; for ( int i = 0 ; i < seqs.size() ; i++ ) { if ( -alignment_offsets.get(i) > skip_first ) skip_first = -alignment_offsets.get(i); + msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length ); } + msa_length += skip_first; + int [] cov = new int[4]; + char[] bases = { 'A' , 'C', 'G', 'T' }; + char[][] consensus = new char[4][msa_length]; + + for ( int i = 0 ; i < msa_length ; i++ ) { + cov[0] = cov[1] = cov[2] = cov[3] = 0; + for ( int j = 0 ; j < seqs.size(); j++ ) { + // offset of the sequence j wrt start of the msa region + int seq_offset = skip_first + alignment_offsets.get(j); + if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i + int base = -1; + switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) { + case 'A': base = 0; break; + case 'C': base = 1 ; break; + case 'G': base = 2 ; break; + case 'T': base = 3 ; break; + } + if ( base >= 0 ) cov[base]++; + } + int total_cov = cov[0] + cov[1] + cov[2] + cov[3]; + int bmax = 0; + int mm = 0; + consensus[3][i] = 'N'; + for ( int z = 0; z < 4 ; z++ ) { + if ( cov[z] > bmax ) { + bmax = cov[z]; + consensus[3][i] = bases[z]; + mm = total_cov - bmax; + } + } + if ( mm > 0 ) { + consensus[2][i] = '*'; + if ( mm > 9 ) consensus[0][i] = Character.forDigit(mm/10,10); + else consensus[0][i] = ' '; + consensus[1][i] = Character.forDigit(mm%10,10); + } else { + consensus[0][i] = consensus[1][i] = consensus[2][i] = ' '; + } + } + + b.append(" "); b.append(consensus[0]); b.append('\n'); + b.append(" "); b.append(consensus[1]); b.append('\n'); + b.append(" "); b.append(consensus[2]); b.append('\n'); + b.append(" "); b.append(consensus[3]); b.append('\n'); + Integer[] perm = null; - if ( inorder ) perm = SortPermutation(alignment_offsets); + if ( inorder ) perm = Utils.SortPermutation(alignment_offsets); for ( int i = 0 ; i < seqs.size() ; i++ ) { int index = (inorder ? perm[i] : i); @@ -248,8 +293,12 @@ public class MultipleAlignment implements Iterable { */ public Iterator sequenceIdIterator() { return index.keySet().iterator(); } + /** Returns an iterator over external seuqnce ids of the sequences stored in the pile, presenting them in + * the order of ascending alignment offsets. + * @return + */ public Iterator sequenceIdByOffsetIterator() { - final Integer[] perm = SortPermutation(alignment_offsets); + final Integer[] perm = Utils.SortPermutation(alignment_offsets); return new Iterator() { private int i = 0; public boolean hasNext() { diff --git a/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java b/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java index 87ceb3dee..14f411a0a 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java +++ b/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java @@ -46,8 +46,26 @@ public class PairwiseAlignment { * if s2 is shifted left (starts earlier) wrt s1 */ public int getBestOffset2wrt1() { return alignment_offset; } + + /** Returns offset of the sequence j wrt sequence i in the best pairwise alignment found. + * + * @param i extrenal id of a sequence, must be one of the sequences kept by this alignment + * @param j extrenal id of a sequence, must be one of the sequences kept by this alignment + * @return offset of 2nd arg (j) wrt to the first arg (i) + */ + public int getBestOffset2wrt1(int i, int j ) { + if ( i == i1 && j == i2 ) return alignment_offset; + else if ( i == i2 && j == i1 ) return -alignment_offset; + throw new RuntimeException("Specified sequence id not found in the alignment"); + } + public String getSequence1() { return s1; } public String getSequence2() { return s2; } + public String getSequenceById(int i) { + if ( i == i1 ) return s1; + else if ( i == i2 ) return s2; + throw new RuntimeException("Specified sequence id not found in the alignment"); + } public int id1() { return i1;} public int id2() { return i2;} @@ -78,6 +96,16 @@ public class PairwiseAlignment { return Math.min(s2.length()+alignment_offset, s1.length()); } } + + public static int getOverlap(String seq1, String seq2, int offset2wrt1) { + int L ; + if ( offset2wrt1 >= 0 ) { + L = Math.min(seq1.length()-offset2wrt1, seq2.length()); + } else { + L = Math.min(seq2.length()+offset2wrt1, seq1.length()); + } + return ( L < 0 ? 0 : L ); + } /** Returns true if at least one alignment, no matter how bad, was found between the two sequences * (i.e. the sequences have at least one kmer in common). @@ -116,7 +144,8 @@ public class PairwiseAlignment { int pos2 = ( offset2wrt1 >= 0 ? 0 : -offset2wrt1 ); int cnt = 0; while ( pos1 < seq1.length() && pos2 < seq2.length() ) { - if ( seq1.charAt(pos1++) == seq2.charAt(pos2++) ) continue; + if ( Character.toUpperCase(seq1.charAt(pos1++)) == + Character.toUpperCase(seq2.charAt(pos2++)) ) continue; cnt++; // found mismatch } return cnt; @@ -127,7 +156,8 @@ public class PairwiseAlignment { int pos2 = ( offset2wrt1 >= 0 ? 0 : -offset2wrt1 ); int cnt = 0; while ( pos1 < seq1.length() && pos2 < seq2.length() && cnt < maxerr ) { - if ( seq1.charAt(pos1++) == seq2.charAt(pos2++) ) continue; + if ( Character.toUpperCase(seq1.charAt(pos1++)) == + Character.toUpperCase(seq2.charAt(pos2++)) ) continue; cnt++; // found mismatch } return cnt; @@ -157,6 +187,14 @@ public class PairwiseAlignment { return ( l / (double)L ); } + public static double distance(String seq1, String seq2, int offset2wrt1) { + int L = getOverlap(seq1,seq2,offset2wrt1); + if ( L <= 0 ) return 1e100; + int mm = countMismatches(seq1,seq2,offset2wrt1); + double l = ( mm == 0 ? 1.0 : (double)mm + Math.sqrt((double)mm) ); + return ( l / (double) L ); + } + } diff --git a/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java b/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java index 801f7c504..396d12a72 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java +++ b/playground/java/src/org/broadinstitute/sting/indels/PileBuilder.java @@ -2,16 +2,26 @@ package org.broadinstitute.sting.indels; import net.sf.samtools.SAMRecord; -import java.util.Collection; -import java.util.Iterator; +import java.util.*; public class PileBuilder implements RecordPileReceiver { - private StrictlyUpperTriangularMatrix distances ; + private SymmetricMatrix distances ; private Matrix alignments ; private static final int KmerSize = 8; private MultipleAlignment alignments1; private MultipleAlignment alignments2; + private String referenceSequence; + private int reference_start; + + private static class IntPair { + int first; + int second; + public IntPair(int i, int j) { set(i,j); } + public int first() { return first; } + public int second() { return second; } + public void set(int i, int j) { first = i; second = j; } + } private static class SelectedPair { private int i_; @@ -46,32 +56,88 @@ public class PileBuilder implements RecordPileReceiver { } } + + public class SelectedSequence { + private int id_; + private double d_; + + private SelectedSequence(int i, double d) { + set(i,d); + } + + private SelectedSequence() { this(-1,1e100) ; } + private void set(int i, double d) { id_ = i; d_ = d; } + + public double d() { return d_;} + public int i() { return id_; } + + } - public PileBuilder() {} + public PileBuilder() { + referenceSequence = null; + reference_start = -1; + } + + public void setReferenceSequence(String seq, int start) { + referenceSequence = seq; + reference_start = start; + } + + public void setReferenceSequence(String seq) { + referenceSequence = seq; + reference_start = -1; + } public void receive(Collection c) { IndexedSequence[] seqs = new IndexedSequence[c.size()]; int i = 0; + int left = 1000000000; + int right = 0; for ( SAMRecord r : c ) { seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize); + left = Math.min(left, r.getAlignmentStart() ); + right = Math.max(right,r.getAlignmentEnd()); } - doMultipleAlignment(seqs); - System.out.print("Distance between final piles: "+distance(alignments1, alignments2)); - System.out.print("; diameter of PILE1: "+ diameter(alignments1)); - System.out.println("; diameter of PILE2: "+ diameter(alignments2)); - System.out.println("PILE 1: \n"+alignments1.toString()); - System.out.println("PILE 2: \n"+alignments2.toString()); + SequencePile originalAligns = new SequencePile(referenceSequence.substring(left-1,right)); + for ( SAMRecord r : c ) { + originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), + r.getCigar(), r.getAlignmentStart() - left ); + } + System.out.println("\n#############################################################################"); + System.out.println("ORIGINAL ALIGNMENT: \n"); + originalAligns.colorprint(true); + System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ; + List piles = doMultipleAlignment2(seqs); +// System.out.print("Distance between final piles: "+distance(alignments1, alignments2)); +// System.out.print("; diameter of PILE1: "+ diameter(alignments1)); +// System.out.println("; diameter of PILE2: "+ diameter(alignments2)); + +// System.out.println("PILE 1: \n"+alignments1.toString()); +// System.out.println("PILE 2: \n"+alignments2.toString()); + + SymmetricMatrix d = new SymmetricMatrix(piles.size()); + for ( int n = 0 ; n < piles.size() ; n++ ) { + d.set(n,n,diameter(piles.get(n))); + for ( int m = n+1 ; m < piles.size() ; m++ ) { + d.set(n,m,distance(piles.get(n), piles.get(m))); + } + } + System.out.println(d.format("%8.4g")); + for ( int n = 0 ; n < piles.size() ; n++ ) { + System.out.println("PILE " + n +"\n" +piles.get(n).toString()); + } } public void initPairwiseAlignments( IndexedSequence [] seqs ) { - distances = new StrictlyUpperTriangularMatrix( seqs.length ); + distances = new SymmetricMatrix( seqs.length ); alignments = new Matrix( seqs.length ); for( int i = 0; i < seqs.length ; i++ ) { for ( int j = i+1 ; j < seqs.length ; j++ ) { PairwiseAlignment a = new PairwiseAlignment(seqs[i],seqs[j],i,j); // compute pairwise alignment alignments.set(i, j, a); // save it + alignments.set(j, i, a); // save it distances.set(i,j,a.distance()); } } @@ -146,21 +212,152 @@ public class PileBuilder implements RecordPileReceiver { SelectedPair p = new SelectedPair(-1,-1,1e100); for ( Integer id : a ) { - for (int i = 0; i < id; i++) { + for (int i = 0; i < distances.size(); i++) { if (a.contains(i)) continue; // a already contains both sequences (i,id) double d = distances.get(i, id); if (d < p.d() ) p.set(i,id,d); } - for (int j = id + 1; j < distances.size() ; j++) { - if (a.contains(j)) continue; // a already contains both sequences (id, j) - double d = distances.get(id, j); - if (d < p.d()) p.set(id,j,d); - } } return p; } - + public SelectedPair findClosestToPileAverage(MultipleAlignment a) { + + SelectedPair p = new SelectedPair(-1,-1,1e100); + + // currently, we compute the average distance from each sequence to the pile, but if the average + // distance is small enough, we will try to stitch that sequence to the pile based on the *best* + // available pairwise alignment, best_id will keep the id of that sequence from the pile that + // has the best alignment with the sequence that is the closest on average + int best_id=-1; + + Set offsets = new HashSet(); // all putative offsets suggested by different p-wise alignments + for ( int i = 0 ; i < distances.size() ; i++ ) { // for each sequence i + + if ( a.contains(i) ) continue; // sequence i is already in the pile, ignore it + + offsets.clear(); + + for ( Integer id : a ) { // for all sequences from the msa pile + PairwiseAlignment pa = alignments.get(i,id); + if ( pa.getOverlap() <= 0 ) continue; // at this step we do not take into account sequences with no overlap + // alignment pa suggests this offset of i wrt the first sequence in the msa + offsets.add( pa.getBestOffset2wrt1(id,i)+a.getOffsetById(id)); + } + // we got all suggested offsets; now lets recompute distances: + + for( Integer off : offsets ) { + SelectedPair spo = averageDistanceForOffset(a,i,off); + if ( spo.d() < p.d() ) p.set(spo.i(),spo.j(),spo.d()); + } + } + return p; + } + + public Matrix averageClosestDistanceMatrix(List la, int n) { + + Matrix mp = new Matrix(n); + + for ( int i = 0 ; i < n ; i++ ) { + for ( int j = i + 1 ; j < n ; j++ ) { + mp.set(i,j, findBestAlignment(la.get(i),la.get(j)) ); + mp.set(j,i, mp.get(i,j) ); + } + } + return mp; + } + + public SelectedPair findBestAlignment(MultipleAlignment a1, MultipleAlignment a2) { + + Map all_offsets = new HashMap(); + SelectedPair p = new SelectedPair(-1,-1,1e100); + + for ( Integer id1 : a1 ) { + for ( Integer id2 : a2 ) { + PairwiseAlignment pa = alignments.get(id1,id2); + if ( pa.getOverlap() <= 0 ) continue; // id1 and id2 do not overlap and/or we don't have p-wise alignment + + // record suggested offset of a2 wrt a1 (defined by their first sequences), and remember the + // pairwise alignment that suggested it + int suggested_offset = a1.getOffsetById(id1) + pa.getBestOffset2wrt1(id1,id2) - a2.getOffsetById(id2); + if ( ! all_offsets.containsKey(suggested_offset) ) { + all_offsets.put( suggested_offset , new IntPair(id1,id2)) ; + } + } + } + for ( Map.Entry offset_record : all_offsets.entrySet() ) { + + double d = averageDistanceForOffset(a1,a2,offset_record.getKey()); + if ( d < p.d() ) p.set(offset_record.getValue().first(),offset_record.getValue().second(),d); + } + return p; + } + + public double averageDistanceForOffset(MultipleAlignment a1, MultipleAlignment a2, int offset) { + SelectedPair p = new SelectedPair(); + + double d_av = 0; + int nseq = 0; + int i1 = -1; + int i2 = -1; + + for ( Integer id2 : a2 ) { + SelectedPair spo = averageDistanceForOffset(a1,id2,offset+a2.getOffsetById(id2)); + if ( spo.d() > 1e99 ) continue; + nseq++; + d_av += spo.d(); + } + if ( nseq == 0 ) return 1e100; + d_av /= nseq; + return d_av; + } + + /** Computes average distance from sequence i to multiple alignment a for the specified offset of i wrt a + * and returns that distance and pair of sequence indices, on which the specified offset is realized + * @param a + * @param i + * @param offset + * @return + */ + public SelectedPair averageDistanceForOffset(MultipleAlignment a, int i, int offset) { + + SelectedPair p = new SelectedPair(-1,-1,1e100); + + double d = 0; // will hold average distance + double dmin = 1e100; // used to find the nearest individual sequence in the pile + int nseq = 0; // number of sequences in the pile that have distance to sequence i defined + int best_id = -1; + + for ( Integer id : a ) { // for all sequences from the msa pile + PairwiseAlignment pa = alignments.get(i,id); + int new_off = offset - a.getOffsetById(id); // offset of i wrt id as suggested by + double dist_for_off; // distance between i and id for the given offset off + + // check if p-wise alignment has data for the specified offset: + boolean canuse = false; + if ( pa.alignmentExists() && pa.getBestOffset2wrt1(id,i) == new_off ) { + dist_for_off = distances.get(i,id); + canuse = true; // can use this alignment to stitch i to a + } + else { + // offset is different from what the pwise alignment suggests; recompute! + dist_for_off = PairwiseAlignment.distance(pa.getSequenceById(id),pa.getSequenceById(i),new_off); + } + if ( dist_for_off > 1e99 ) continue; // at this offset, i and id do not overlap, go check next id + d += dist_for_off; + nseq++; + if ( dist_for_off < dmin && canuse ) { + dmin = dist_for_off; + best_id = id; + } + + } + if ( nseq == 0 ) return p; + d /= (double)nseq; + p.set(i,best_id,d); + return p; + } + /** Finds, among all sequences, the one farthest from the specified pile. Being * the 'farthest' is defined as having the largest lower bound of the distances to all sequences in the pile. * @@ -178,7 +375,7 @@ public class PileBuilder implements RecordPileReceiver { double d_min = 1e100; // smallest distance from sequence i to the pile for ( Integer id : a ) { - double d = ( i < id ? distances.get(i, id) : distances.get(id,i) ); + double d = distances.get(i, id) ; if (d < d_min ) d_min = d; } // d_min is the smallest distance from sequence i to pile a @@ -195,8 +392,7 @@ public class PileBuilder implements RecordPileReceiver { double d = 1e100; for ( Integer id1 : a1 ) { for ( Integer id2 : a2 ) { - if ( id1 < id2 && distances.get(id1,id2) < d ) d = distances.get(id1,id2); - if ( id1 > id2 && distances.get(id2,id1) < d ) d = distances.get(id2,id1); + if ( distances.get(id1,id2) < d ) d = distances.get(id1,id2); } } return d; @@ -217,7 +413,7 @@ public class PileBuilder implements RecordPileReceiver { double d = 1e100; // will hold distance from id1 to its closest neighbor for ( Integer id2 : a ) { if ( id2 == id1 ) continue; - double dpair = ( id1 < id2 ? distances.get(id1,id2) : distances.get(id2,id1) ); + double dpair = distances.get(id1,id2) ; d = Math.min(d,dpair); } // d = distance from id1 to its closest neighbor within the pile @@ -240,7 +436,8 @@ public class PileBuilder implements RecordPileReceiver { PileBuilder pb = new PileBuilder(); - pb.doMultipleAlignment(seqs); + //pb.doMultipleAlignment(seqs); + pb.doMultipleAlignment2(seqs); System.out.print("Distance between final piles: "+pb.distance(pb.alignments1, pb.alignments2)); System.out.print("; diameter of PILE1: "+ pb.diameter(pb.alignments1)); System.out.println("; diameter of PILE2: "+ pb.diameter(pb.alignments2)); @@ -252,6 +449,8 @@ public class PileBuilder implements RecordPileReceiver { public void doMultipleAlignment(IndexedSequence[] seqs) { // two piles we are going to grow until all sequences are assigned to one of them. // we intend to keep the piles disjoint, e.g. no sequence should be placed in both + + MultipleAlignment pile1 = new MultipleAlignment(); MultipleAlignment pile2 = new MultipleAlignment(); @@ -260,7 +459,7 @@ public class PileBuilder implements RecordPileReceiver { // all the pairwise alignments are computed and disjoint best and next-best pairs are found - //System.out.println( distances.format("%8.4g ")); +// System.out.println( distances.format("%8.4g ")); SelectedPair pworst = findWorst(); @@ -268,13 +467,14 @@ public class PileBuilder implements RecordPileReceiver { pile1.add(seqs[pworst.i()].getSequence(), pworst.i()); pile2.add(seqs[pworst.j()].getSequence(), pworst.j()); -/* + // initialize piles with best and next-best pairs +/* SelectedPair p_best = findClosestPair(); SelectedPair p_nextbest = findNextClosestPairAfter(p_best); - pile1.add( alignments.get(p_best.i(), p_best.j()),p_best.i(),p_best.j()); - pile2.add( alignments.get(p_nextbest.i(), p_nextbest.j()),p_nextbest.i(),p_nextbest.j()); -*/ + pile1.add( alignments.get(p_best.i(), p_best.j())); + pile2.add( alignments.get(p_nextbest.i(), p_nextbest.j())); +*/ /* System.out.println("Best pair ("+p_best.i() + "," + p_best.j()+", d="+p_best.d()+"):"); System.out.println(pile1.toString()); @@ -283,39 +483,44 @@ public class PileBuilder implements RecordPileReceiver { */ SelectedPair p1 = null; SelectedPair p2 = null; - + // grow piles hierarchical clustering-style while ( pile1.size() + pile2.size() < seqs.length ) { // find candidate sequences closest to each of the two piles - p1 = findClosestToPile(pile1); - p2 = findClosestToPile(pile2); + +// p1 = findClosestToPileAverage(pile1); // findClosestToPile(pile1); +// p2 = findClosestToPileAverage(pile2); //findClosestToPile(pile2); + p1 = findClosestToPile(pile1); // findClosestToPile(pile1); + p2 = findClosestToPile(pile2); //findClosestToPile(pile2); int id1_cand = pile1.selectExternal(p1.i(), p1.j()); // id of the sequence closest to the pile 1 int id2_cand = pile2.selectExternal(p2.i(), p2.j()); // id of the sequence closest to the pile 2 if ( pile2.contains(id1_cand) && pile1.contains(id2_cand)) { // pile1 and pile 2 are mutually the closest, so we need to merge them. // if piles are mutually the closest, then p1 and p2 are the same pair (id1, id2), // so we just merge on one of the (redundant) instances: - pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j()); + pile1.add(pile2, alignments.get( p1.i(), p1.j())); pile2.clear(); // need to reset pile 2 to something else int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1 pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence } else { if ( p1.d() < p2.d() ) { if ( pile2.contains(id1_cand) ) { - pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j()); + pile1.add(pile2, alignments.get( p1.i(), p1.j())); pile2.clear(); // need to reset pile 2 to something else int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1 pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence } else pile1.add( alignments.get(p1.i(), p1.j()) ); } else { if ( pile1.contains(id2_cand) ) { - pile2.add(pile1, alignments.get( p2.i(), p2.j()),p2.i(), p2.j()); + pile2.add(pile1, alignments.get( p2.i(), p2.j())); pile1.clear(); // need to reset pile 2 to something else int z = findFarthestFromPile(pile2); // get sequence farthest from merged pile 1 pile1.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence } else pile2.add( alignments.get(p2.i(), p2.j()) ); } } + System.out.println("PILE 1: \n"+pile1.toString()); + System.out.println("PILE 2: \n"+pile2.toString()); } // end WHILE alignments1 = pile1; @@ -327,6 +532,56 @@ public class PileBuilder implements RecordPileReceiver { } */ } + +public List doMultipleAlignment2(IndexedSequence[] seqs) { + + initPairwiseAlignments(seqs); + + List piles = new LinkedList(); + + int npiles = seqs.length; + + for ( int i = 0 ; i < seqs.length ; i++ ) { + MultipleAlignment m = new MultipleAlignment(); + m.add(seqs[i].getSequence(),i); + piles.add(m); + } + + while ( npiles > 2 ) { + Matrix dist = averageClosestDistanceMatrix(piles,npiles); + int best_i = -1; + int best_j = -1; + int pile_i = -1; + int pile_j = -1; + double d = 1e100; + for ( int i = 0 ; i < npiles ; i++ ) { + for ( int j = i+1 ; j < npiles ; j++ ) { + SelectedPair p = dist.get(i,j); + if ( p.d() < d ) { + d = p.d(); + pile_i = i; + pile_j = j; + best_i = p.i(); + best_j = p.j(); + } + } + } +// System.out.println("joining pile "+pile_i +" and pile " + pile_j +" on seqs " + best_i +" and " + best_j ); + // got the closest pair + piles.get(pile_i).add(piles.get(pile_j),alignments.get(best_i,best_j)); +// System.out.println("JOINED PILE: \n"+piles.get(pile_i).toString()); + piles.remove(pile_j); + npiles--; + } + + alignments1 = piles.get(0); + alignments2 = piles.get(1); + + +// System.out.println("PILE 1: \n"+piles.get(0).toString()); +// System.out.println("PILE 2: \n"+piles.get(1).toString()); + return piles; +} public static IndexedSequence[] testSet1(int K) { IndexedSequence [] seqs = new IndexedSequence[9]; diff --git a/playground/java/src/org/broadinstitute/sting/indels/StrictlyUpperTriangularMatrix.java b/playground/java/src/org/broadinstitute/sting/indels/StrictlyUpperTriangularMatrix.java index 37e346eef..54f927027 100755 --- a/playground/java/src/org/broadinstitute/sting/indels/StrictlyUpperTriangularMatrix.java +++ b/playground/java/src/org/broadinstitute/sting/indels/StrictlyUpperTriangularMatrix.java @@ -1,81 +1,23 @@ package org.broadinstitute.sting.indels; -public class StrictlyUpperTriangularMatrix { - private double [] data; - private int size; - +public class StrictlyUpperTriangularMatrix extends SymmetricMatrix { + public StrictlyUpperTriangularMatrix(int dimension) { + super(dimension); assert dimension >=2 : "Distance matrix can not be smaller than 2x2"; - if ( dimension % 2 == 0 ) { - int k = dimension >> 1; // dimension/2 - data = new double[k*(dimension-1)]; - } else { - int k = ( dimension -1 ) >> 1; // (dimension -1)/2 - data = new double[k*dimension]; - } - size = dimension; } public double get(int i, int j) { - assert (i < size) && (j= j ) return 0.0; - - // we are guaranteed now that i < j; now - // translate i,j into the linear offset into our internal data array and return the value: - return data[ linearOffset(i,j) ]; + return super.get(i,j); } public void set(int i, int j, double value) { - assert (i < size) && (j>= 1; // k/=2 - // now k is the offset of the first stored element in row i - return ( k + (j - i - 1)); - } + private static void testMe() { StrictlyUpperTriangularMatrix m = new StrictlyUpperTriangularMatrix(3); diff --git a/playground/java/src/org/broadinstitute/sting/indels/SymmetricMatrix.java b/playground/java/src/org/broadinstitute/sting/indels/SymmetricMatrix.java new file mode 100644 index 000000000..9a1ec0459 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/indels/SymmetricMatrix.java @@ -0,0 +1,87 @@ +package org.broadinstitute.sting.indels; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Mar 22, 2009 + * Time: 3:43:05 PM + * To change this template use File | Settings | File Templates. + */ +public class SymmetricMatrix { + + protected double [] data; + protected int size; + + public SymmetricMatrix(int dimension) { + assert dimension >= 0 : "Matrix size can not be negative"; + if ( dimension % 2 == 0 ) { + int k = dimension >> 1; // dimension/2 + data = new double[k*(dimension+1)]; + } else { + int k = ( dimension + 1 ) >> 1; // (dimension + 1)/2 + data = new double[k*dimension]; + } + size = dimension; + } + + public double get(int i, int j) { + assert (i < size) && ( j < size) : "Out of bound index into matrix detected"; + if ( i >= j ) return data[linearOffset(j,i)]; // we store only the upper triangle in memory + return data[ linearOffset(i,j) ]; + } + + public void set(int i, int j, double value) { + assert (i < size) && (j < size) : "Out of bound index into matrix detected"; + + if ( i >= j ) data[ linearOffset(j,i) ] = value; + else data[ linearOffset(i,j) ] = value; + } + + public int size() { return size; } + + public int nRows() { return size; } + public int nCols() { return size; } + + /** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!); + * each element is formatted according to a default format. + * @return + * @see #format(String f) + */ + public String format() { + return format("%6.3f "); + } + + + /** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!); + * each element is formatted according to a specified format string (note: format string must include all desired + * whitespaces before and/or after an element, as this method itself does not add any spaces between the formatted elements). + * @return + * @see #format() + */ + public String format(String f) { + StringBuilder b = new StringBuilder(); + java.util.Formatter frm = new java.util.Formatter(b); + for ( int i = 0 ; i < size ; i++ ) { + for ( int j = 0 ; j < size ; j++ ) { + frm.format(f, get(i,j)); + } + b.append('\n'); + } + return b.toString(); + } + + + /** Computes linear offset into the internal array that keeps actual data, given "row" and "column" indices + * into the matrix; this method is unchecked, but it expects i <= j otherwise the result is unspecified. + * @param i row index + * @param j column index + * @return linear offset into the data[] member of this class + */ + private int linearOffset(int i, int j) { + int k = (( size << 1 ) - i + 1)*i; // [ 2*d - (i+1) ] * i + k >>= 1; // k/=2 + // now k is the offset of the first stored element in row i + return ( k + j - i ); + } + +}