git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@146 348d0f76-0448-11de-a6fe-93d51630548a

This commit is contained in:
asivache 2009-03-23 05:46:09 +00:00
parent 046cecb067
commit 835e85374e
9 changed files with 667 additions and 145 deletions

View File

@ -0,0 +1,122 @@
package org.broadinstitute.sting.indels;
import java.util.List;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Mar 21, 2009
* Time: 4:25:12 PM
* To change this template use File | Settings | File Templates.
*/
public class ConsensusSequence {
private List< int[] > coverage; // counts of observations of every nucleotide at each position
private long referencePos;// (arbitrary) reference position; when adding sequences, their offsets should be wrt this position
private int startOffset; // offset of the leftmost base of this consensus sequence wrt referencePos; negative=left, positive=right
private final static int NUMBINS = 4;
public ConsensusSequence(int refPos) {
coverage = new ArrayList< int[] >();
referencePos = refPos;
startOffset = 0;
}
public ConsensusSequence() {
this(0);
}
/** Adds sequence <code>se</code> to the consensus, in the sense that all bases of seq are counted
* into observed coverage kept by the consensus. The position of the sequence is specified by the
* <code>offset</code> with respect to the fixed reference position of the consensus (the latter does not
* have to be consensus start), and if the sequence extends beyound the consensus on either end, the
* consensus will be extended appropriately to accomodate the full sequence.
* @param seq nucleotide sequence ('ACGT...')
* @param offset position of the start of the sequence relative to the fixed reference position of the consensus
*/
public void addSequence(String seq, int offset) {
// if sequence starts before than the currently held consensus oes, extend consensus to the left
if ( offset < startOffset ) {
coverage.addAll(0,instantiateCoverageList(startOffset-offset));
startOffset = offset;
}
// if the sequence ends beyound the currently held consensus, extend consensus to the right
if ( offset + seq.length() > startOffset + coverage.size() ) {
coverage.addAll( instantiateCoverageList(offset+seq.length() - startOffset - coverage.size()) );
}
// count bases from the sequence into the coverage
int posOnConsensus = offset - startOffset;
for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
coverage.get(posOnConsensus)[baseToInt(seq.charAt(i))]++;
}
}
/** Removes sequence <code>seq</code> from the consensus. More exactly, 1 will be subtracted from current
* observation counts kept by the consensus for each observed base at every position of the sequence. The
* position of the sequence is specified by the <code>offset</code> with respect to the reference position
* of the consensus. NOTE: this method is unchecked and does not verify that the sequence being subtracted
* was indeed previously added to the consensus and/or that the consenus does accomodate full length of
* the sequence. If it is not the case, the results can be unpredictable or assert failure may occur.
*
* @param seq nucleotide sequence ('ACGT...')
* @param offset position of the start of the sequence relative to the fixed reference position of the consensus
*/
public void removeSequence(String seq, int offset) {
assert offset >= startOffset :
"Attempt to remove from consensus a sequence that starts prior to consenus start";
assert (offset+seq.length() < startOffset + coverage.size()) :
"Attempt to remove from consensus a sequence that extends beyond consensus end";
// subtract sequence bases from the coverage
int posOnConsensus = offset - startOffset;
for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
coverage.get(posOnConsensus)[ baseToInt(seq.charAt(i)) ]--;
}
}
/** Returns the "distance" (score measuring the agreement) from the currently held consensus sequence to
* the specified sequence <code>seq</code> starting at position <code>offset</code> wrt consenus reference position.
* @param seq
* @param offset
* @return
*/
public double distance(String seq, int offset) {
int posOnConsensus; // index into the currently held consensus sequence
int i ; // index into the passed sequence argument
if ( offset < startOffset ) {
posOnConsensus = 0;
i = startOffset - offset;
} else {
i = 0 ;
posOnConsensus = offset - startOffset;
}
// stop position on the passed sequence (can be less than sequence length if consensus stops prematurely)
int stop = Math.min(offset+seq.length(), startOffset+coverage.size() ) - offset;
for ( ; i < stop ; i++, posOnConsensus++ ) {
int base = baseToInt(seq.charAt(posOnConsensus));
int [] cov = coverage.get(posOnConsensus);
int totalcov = cov[0]+cov[1]+cov[2]+cov[3];
}
return 0.0;
}
private List<int[]> instantiateCoverageList(int n) {
List< int[] > subseq = new ArrayList<int[] >(n);
for ( int i = 0 ; i < n ; i++ ) subseq.add(new int[NUMBINS]);
return subseq;
}
private int baseToInt(char c) {
int base;
switch( Character.toUpperCase(c) ) {
case 'A': base = 0; break;
case 'C': base = 1; break;
case 'G': base = 2; break;
case 'T': base = 3; break;
default : throw new IllegalArgumentException("Sequence can contain only ACGT symbols");
}
return base;
}
}

View File

@ -60,8 +60,9 @@ public class IndelInspector extends CommandLineProgram {
ReferenceSequence contig_seq = null;
IndelRecordPileCollector col = null;
PileBuilder pileBuilder = new PileBuilder();
try {
col = new IndelRecordPileCollector(new DiscardingReceiver(), new PileBuilder() );
col = new IndelRecordPileCollector(new DiscardingReceiver(), pileBuilder );
} catch(Exception e) { System.err.println(e.getMessage()); }
if ( col == null ) return 1;
@ -78,6 +79,9 @@ public class IndelInspector extends CommandLineProgram {
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == 1 ) break;
if ( location == null || GenomeLoc.compareContigs(cur_contig, location.getContig()) == 0 ) {
contig_seq = reference.get(r.getReferenceIndex());
String refstr = new String(contig_seq.getBases());
col.setReferenceSequence(refstr);
pileBuilder.setReferenceSequence(refstr);
System.out.println("loaded contig "+cur_contig+" (index="+r.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
}
}
@ -85,9 +89,12 @@ public class IndelInspector extends CommandLineProgram {
// if contig is specified and wqe did not reach it yet, skip the records until we reach that contig:
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
if ( location != null && r.getAlignmentEnd() < location.getStart() ) continue;
// if stop position is specified and we are past that, stop reading:
if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY")==1 ) continue; // skip chrM and unplaced contigs for now
int err = -1;

View File

@ -89,6 +89,8 @@ public class IndelRecordPileCollector implements RecordReceiver {
private RecordReceiver defaultReceiver; // we will send there records that do not overlap with regions of interest
private RecordPileReceiver indelPileReceiver; // piles over indel regions will be sent there
private String referenceSequence;
public String memStatsString() {
String s = "mRecordPile: ";
return s+mRecordPile.size() + " mAllIndels: "+mAllIndels.size() + " mLastContig=" +mLastContig + " mLastStartOnref="+mLastStartOnRef;
@ -110,6 +112,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
}
defaultReceiver = rr;
indelPileReceiver = rp;
referenceSequence = null;
setWaitState();
}
@ -124,7 +127,11 @@ public class IndelRecordPileCollector implements RecordReceiver {
avoiding_region = false;
mState = WAIT_STATE; // got to do this if we were in avoid_region state
}
public void setReferenceSequence(String contig) {
referenceSequence = contig;
}
/** A utility method: emits into nonindelReceiver and purges from the currently held SAM record pile
* all the consequtive records with alignment end positions less than or equal to the specified
* position <code>pos</code>, until the first record is encountered that does not meet this condition. Note that

View File

@ -1,22 +1,37 @@
package org.broadinstitute.sting.indels;
public class Matrix<T> {
private int size;
private int nRows;
private int nCols;
private Object [][] data;
public Matrix(int n) {
size = n;
data = new Object[n][n];
/** Instantiates a generic matrix of objects with n rows and m cols.
*
* @param n number of rows
* @param m number of columns
*/
public Matrix(int n, int m) {
nRows = n;
nCols = m;
data = new Object[n][m];
}
/** Instantiates a square n x n matrix of objects.
*
* @param n size of the matrix
*/
public Matrix (int n) {
this(n,n);
}
@SuppressWarnings("unchecked")
public T get(int i, int j) {
assert (i<size) && (j < size) : "Matrix index is out of bounds";
assert (i < nRows ) && (j < nCols) : "Matrix index is out of bounds";
return (T) data[i][j];
}
public void set(int i, int j, T value) {
assert (i<size) && (j < size) : "Matrix index is out of bounds";
assert ( i < nRows ) && ( j < nCols ) : "Matrix index is out of bounds";
data[i][j] = value;
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.indels;
import java.util.*;
import org.broadinstitute.sting.utils.Utils;
public class MultipleAlignment implements Iterable<Integer> {
@ -26,26 +27,6 @@ public class MultipleAlignment implements Iterable<Integer> {
ext_ids.clear();
}
public static <T extends Comparable> Integer[] SortPermutation( List<T> A )
{
final Object[] data = A.toArray();
class comparator implements Comparator<Integer>
{
public int compare(Integer a, Integer b)
{
return ((T)data[a]).compareTo(data[b]);
}
}
Integer[] permutation = new Integer[A.size()];
for (int i = 0; i < A.size(); i++)
{
permutation[i] = i;
}
Arrays.sort(permutation, new comparator());
return permutation;
}
/** Adds single sequence with id set to i. Pile must be empty, or IllegalStateException will be thrown
*
* @param seq sequence to add
@ -108,7 +89,8 @@ public class MultipleAlignment implements Iterable<Integer> {
alignment_offsets.add( -a.getBestOffset2wrt1() + alignment_offsets.get( second ));
}
}
/** Returns sequence associated with the specified external id, or null if sequence with this external id is not found in the pile
*
* @param id query id
@ -154,7 +136,13 @@ public class MultipleAlignment implements Iterable<Integer> {
alignment_offsets.add(off2+a.getOffsetById(id));
}
}
public void add(MultipleAlignment a, PairwiseAlignment p) {
if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set");
add(a,p,p.id1(),p.id2());
}
/** Returns true if the alignment already contains sequence with the specified id
*
* @param id
@ -168,21 +156,20 @@ public class MultipleAlignment implements Iterable<Integer> {
return PairwiseAlignment.countMismatches(getSequenceById(i), getSequenceById(j), getOffsetById(j)-getOffsetById(i));
}
/** Returns the length of the overlapping region of sequences si and sj .
/** Returns the length of the overlapping region of the two sequences specified by their external ids i and j.
*
* @return overlap size
*/
public int getOverlap(int i, int j) {
if ( ! contains(i) || ! contains(j) ) return -1;
if ( ! contains(i) || ! contains(j) ) throw new RuntimeException("Sequence with specified id is not in MSA pile");
int off = getOffsetById(j) - getOffsetById(i);
if ( off >= 0 ) {
return Math.min(getSequenceById(i).length()-off, getSequenceById(j).length());
} else {
return Math.min(getSequenceById(j).length()+off, getSequenceById(i).length());
}
int L;
if ( off >= 0 ) L = Math.min(getSequenceById(i).length()-off, getSequenceById(j).length());
else L = Math.min(getSequenceById(j).length()+off, getSequenceById(i).length());
return ( L < 0 ? 0 : L );
}
/** Given the two indices, one of which has to be already in the pile, returns the one that is not in the pile.
/** Given the two sequence ids, one of which has to be already in the pile, returns the one that is not in the pile.
*
* @param i sequence id
* @param j sequence id
@ -199,13 +186,23 @@ public class MultipleAlignment implements Iterable<Integer> {
}
}
public String skipN(int n) {
/** Returns a string consisting of n spaces.
*
* @param n
* @return
*/
private String skipN(int n) {
StringBuilder b=new StringBuilder();
for ( int k = 0 ; k < n ; k++ ) b.append(' ');
return b.toString();
}
public void skipN(int n, StringBuilder b) {
/** Prints n spaces directly into the specified string builder.
*
* @param n
* @param b
*/
private void skipN(int n, StringBuilder b) {
for ( int k = 0 ; k < n ; k++ ) b.append(' ');
}
@ -220,12 +217,60 @@ public class MultipleAlignment implements Iterable<Integer> {
if ( seqs.size() == 0 ) return b.toString();
int skip_first = 0;
int msa_length = 0;
for ( int i = 0 ; i < seqs.size() ; i++ ) {
if ( -alignment_offsets.get(i) > skip_first ) skip_first = -alignment_offsets.get(i);
msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length );
}
msa_length += skip_first;
int [] cov = new int[4];
char[] bases = { 'A' , 'C', 'G', 'T' };
char[][] consensus = new char[4][msa_length];
for ( int i = 0 ; i < msa_length ; i++ ) {
cov[0] = cov[1] = cov[2] = cov[3] = 0;
for ( int j = 0 ; j < seqs.size(); j++ ) {
// offset of the sequence j wrt start of the msa region
int seq_offset = skip_first + alignment_offsets.get(j);
if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i
int base = -1;
switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) {
case 'A': base = 0; break;
case 'C': base = 1 ; break;
case 'G': base = 2 ; break;
case 'T': base = 3 ; break;
}
if ( base >= 0 ) cov[base]++;
}
int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
int bmax = 0;
int mm = 0;
consensus[3][i] = 'N';
for ( int z = 0; z < 4 ; z++ ) {
if ( cov[z] > bmax ) {
bmax = cov[z];
consensus[3][i] = bases[z];
mm = total_cov - bmax;
}
}
if ( mm > 0 ) {
consensus[2][i] = '*';
if ( mm > 9 ) consensus[0][i] = Character.forDigit(mm/10,10);
else consensus[0][i] = ' ';
consensus[1][i] = Character.forDigit(mm%10,10);
} else {
consensus[0][i] = consensus[1][i] = consensus[2][i] = ' ';
}
}
b.append(" "); b.append(consensus[0]); b.append('\n');
b.append(" "); b.append(consensus[1]); b.append('\n');
b.append(" "); b.append(consensus[2]); b.append('\n');
b.append(" "); b.append(consensus[3]); b.append('\n');
Integer[] perm = null;
if ( inorder ) perm = SortPermutation(alignment_offsets);
if ( inorder ) perm = Utils.SortPermutation(alignment_offsets);
for ( int i = 0 ; i < seqs.size() ; i++ ) {
int index = (inorder ? perm[i] : i);
@ -248,8 +293,12 @@ public class MultipleAlignment implements Iterable<Integer> {
*/
public Iterator<Integer> sequenceIdIterator() { return index.keySet().iterator(); }
/** Returns an iterator over external seuqnce ids of the sequences stored in the pile, presenting them in
* the order of ascending alignment offsets.
* @return
*/
public Iterator<Integer> sequenceIdByOffsetIterator() {
final Integer[] perm = SortPermutation(alignment_offsets);
final Integer[] perm = Utils.SortPermutation(alignment_offsets);
return new Iterator<Integer>() {
private int i = 0;
public boolean hasNext() {

View File

@ -46,8 +46,26 @@ public class PairwiseAlignment {
* if s2 is shifted left (starts earlier) wrt s1
*/
public int getBestOffset2wrt1() { return alignment_offset; }
/** Returns offset of the sequence j wrt sequence i in the best pairwise alignment found.
*
* @param i extrenal id of a sequence, must be one of the sequences kept by this alignment
* @param j extrenal id of a sequence, must be one of the sequences kept by this alignment
* @return offset of 2nd arg (j) wrt to the first arg (i)
*/
public int getBestOffset2wrt1(int i, int j ) {
if ( i == i1 && j == i2 ) return alignment_offset;
else if ( i == i2 && j == i1 ) return -alignment_offset;
throw new RuntimeException("Specified sequence id not found in the alignment");
}
public String getSequence1() { return s1; }
public String getSequence2() { return s2; }
public String getSequenceById(int i) {
if ( i == i1 ) return s1;
else if ( i == i2 ) return s2;
throw new RuntimeException("Specified sequence id not found in the alignment");
}
public int id1() { return i1;}
public int id2() { return i2;}
@ -78,6 +96,16 @@ public class PairwiseAlignment {
return Math.min(s2.length()+alignment_offset, s1.length());
}
}
public static int getOverlap(String seq1, String seq2, int offset2wrt1) {
int L ;
if ( offset2wrt1 >= 0 ) {
L = Math.min(seq1.length()-offset2wrt1, seq2.length());
} else {
L = Math.min(seq2.length()+offset2wrt1, seq1.length());
}
return ( L < 0 ? 0 : L );
}
/** Returns true if at least one alignment, no matter how bad, was found between the two sequences
* (i.e. the sequences have at least one kmer in common).
@ -116,7 +144,8 @@ public class PairwiseAlignment {
int pos2 = ( offset2wrt1 >= 0 ? 0 : -offset2wrt1 );
int cnt = 0;
while ( pos1 < seq1.length() && pos2 < seq2.length() ) {
if ( seq1.charAt(pos1++) == seq2.charAt(pos2++) ) continue;
if ( Character.toUpperCase(seq1.charAt(pos1++)) ==
Character.toUpperCase(seq2.charAt(pos2++)) ) continue;
cnt++; // found mismatch
}
return cnt;
@ -127,7 +156,8 @@ public class PairwiseAlignment {
int pos2 = ( offset2wrt1 >= 0 ? 0 : -offset2wrt1 );
int cnt = 0;
while ( pos1 < seq1.length() && pos2 < seq2.length() && cnt < maxerr ) {
if ( seq1.charAt(pos1++) == seq2.charAt(pos2++) ) continue;
if ( Character.toUpperCase(seq1.charAt(pos1++)) ==
Character.toUpperCase(seq2.charAt(pos2++)) ) continue;
cnt++; // found mismatch
}
return cnt;
@ -157,6 +187,14 @@ public class PairwiseAlignment {
return ( l / (double)L );
}
public static double distance(String seq1, String seq2, int offset2wrt1) {
int L = getOverlap(seq1,seq2,offset2wrt1);
if ( L <= 0 ) return 1e100;
int mm = countMismatches(seq1,seq2,offset2wrt1);
double l = ( mm == 0 ? 1.0 : (double)mm + Math.sqrt((double)mm) );
return ( l / (double) L );
}
}

View File

@ -2,16 +2,26 @@ package org.broadinstitute.sting.indels;
import net.sf.samtools.SAMRecord;
import java.util.Collection;
import java.util.Iterator;
import java.util.*;
public class PileBuilder implements RecordPileReceiver {
private StrictlyUpperTriangularMatrix distances ;
private SymmetricMatrix distances ;
private Matrix<PairwiseAlignment> alignments ;
private static final int KmerSize = 8;
private MultipleAlignment alignments1;
private MultipleAlignment alignments2;
private String referenceSequence;
private int reference_start;
private static class IntPair {
int first;
int second;
public IntPair(int i, int j) { set(i,j); }
public int first() { return first; }
public int second() { return second; }
public void set(int i, int j) { first = i; second = j; }
}
private static class SelectedPair {
private int i_;
@ -46,32 +56,88 @@ public class PileBuilder implements RecordPileReceiver {
}
}
public class SelectedSequence {
private int id_;
private double d_;
private SelectedSequence(int i, double d) {
set(i,d);
}
private SelectedSequence() { this(-1,1e100) ; }
private void set(int i, double d) { id_ = i; d_ = d; }
public double d() { return d_;}
public int i() { return id_; }
}
public PileBuilder() {}
public PileBuilder() {
referenceSequence = null;
reference_start = -1;
}
public void setReferenceSequence(String seq, int start) {
referenceSequence = seq;
reference_start = start;
}
public void setReferenceSequence(String seq) {
referenceSequence = seq;
reference_start = -1;
}
public void receive(Collection<SAMRecord> c) {
IndexedSequence[] seqs = new IndexedSequence[c.size()];
int i = 0;
int left = 1000000000;
int right = 0;
for ( SAMRecord r : c ) {
seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize);
left = Math.min(left, r.getAlignmentStart() );
right = Math.max(right,r.getAlignmentEnd());
}
doMultipleAlignment(seqs);
System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
System.out.print("; diameter of PILE1: "+ diameter(alignments1));
System.out.println("; diameter of PILE2: "+ diameter(alignments2));
System.out.println("PILE 1: \n"+alignments1.toString());
System.out.println("PILE 2: \n"+alignments2.toString());
SequencePile originalAligns = new SequencePile(referenceSequence.substring(left-1,right));
for ( SAMRecord r : c ) {
originalAligns.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(),
r.getCigar(), r.getAlignmentStart() - left );
}
System.out.println("\n#############################################################################");
System.out.println("ORIGINAL ALIGNMENT: \n");
originalAligns.colorprint(true);
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
List<MultipleAlignment> piles = doMultipleAlignment2(seqs);
// System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
// System.out.print("; diameter of PILE1: "+ diameter(alignments1));
// System.out.println("; diameter of PILE2: "+ diameter(alignments2));
// System.out.println("PILE 1: \n"+alignments1.toString());
// System.out.println("PILE 2: \n"+alignments2.toString());
SymmetricMatrix d = new SymmetricMatrix(piles.size());
for ( int n = 0 ; n < piles.size() ; n++ ) {
d.set(n,n,diameter(piles.get(n)));
for ( int m = n+1 ; m < piles.size() ; m++ ) {
d.set(n,m,distance(piles.get(n), piles.get(m)));
}
}
System.out.println(d.format("%8.4g"));
for ( int n = 0 ; n < piles.size() ; n++ ) {
System.out.println("PILE " + n +"\n" +piles.get(n).toString());
}
}
public void initPairwiseAlignments( IndexedSequence [] seqs ) {
distances = new StrictlyUpperTriangularMatrix( seqs.length );
distances = new SymmetricMatrix( seqs.length );
alignments = new Matrix<PairwiseAlignment>( seqs.length );
for( int i = 0; i < seqs.length ; i++ ) {
for ( int j = i+1 ; j < seqs.length ; j++ ) {
PairwiseAlignment a = new PairwiseAlignment(seqs[i],seqs[j],i,j); // compute pairwise alignment
alignments.set(i, j, a); // save it
alignments.set(j, i, a); // save it
distances.set(i,j,a.distance());
}
}
@ -146,21 +212,152 @@ public class PileBuilder implements RecordPileReceiver {
SelectedPair p = new SelectedPair(-1,-1,1e100);
for ( Integer id : a ) {
for (int i = 0; i < id; i++) {
for (int i = 0; i < distances.size(); i++) {
if (a.contains(i)) continue; // a already contains both sequences (i,id)
double d = distances.get(i, id);
if (d < p.d() ) p.set(i,id,d);
}
for (int j = id + 1; j < distances.size() ; j++) {
if (a.contains(j)) continue; // a already contains both sequences (id, j)
double d = distances.get(id, j);
if (d < p.d()) p.set(id,j,d);
}
}
return p;
}
public SelectedPair findClosestToPileAverage(MultipleAlignment a) {
SelectedPair p = new SelectedPair(-1,-1,1e100);
// currently, we compute the average distance from each sequence to the pile, but if the average
// distance is small enough, we will try to stitch that sequence to the pile based on the *best*
// available pairwise alignment, best_id will keep the id of that sequence from the pile that
// has the best alignment with the sequence that is the closest on average
int best_id=-1;
Set<Integer> offsets = new HashSet<Integer>(); // all putative offsets suggested by different p-wise alignments
for ( int i = 0 ; i < distances.size() ; i++ ) { // for each sequence i
if ( a.contains(i) ) continue; // sequence i is already in the pile, ignore it
offsets.clear();
for ( Integer id : a ) { // for all sequences from the msa pile
PairwiseAlignment pa = alignments.get(i,id);
if ( pa.getOverlap() <= 0 ) continue; // at this step we do not take into account sequences with no overlap
// alignment pa suggests this offset of i wrt the first sequence in the msa
offsets.add( pa.getBestOffset2wrt1(id,i)+a.getOffsetById(id));
}
// we got all suggested offsets; now lets recompute distances:
for( Integer off : offsets ) {
SelectedPair spo = averageDistanceForOffset(a,i,off);
if ( spo.d() < p.d() ) p.set(spo.i(),spo.j(),spo.d());
}
}
return p;
}
public Matrix<SelectedPair> averageClosestDistanceMatrix(List<MultipleAlignment> la, int n) {
Matrix<SelectedPair> mp = new Matrix<SelectedPair>(n);
for ( int i = 0 ; i < n ; i++ ) {
for ( int j = i + 1 ; j < n ; j++ ) {
mp.set(i,j, findBestAlignment(la.get(i),la.get(j)) );
mp.set(j,i, mp.get(i,j) );
}
}
return mp;
}
public SelectedPair findBestAlignment(MultipleAlignment a1, MultipleAlignment a2) {
Map<Integer, IntPair > all_offsets = new HashMap<Integer, IntPair >();
SelectedPair p = new SelectedPair(-1,-1,1e100);
for ( Integer id1 : a1 ) {
for ( Integer id2 : a2 ) {
PairwiseAlignment pa = alignments.get(id1,id2);
if ( pa.getOverlap() <= 0 ) continue; // id1 and id2 do not overlap and/or we don't have p-wise alignment
// record suggested offset of a2 wrt a1 (defined by their first sequences), and remember the
// pairwise alignment that suggested it
int suggested_offset = a1.getOffsetById(id1) + pa.getBestOffset2wrt1(id1,id2) - a2.getOffsetById(id2);
if ( ! all_offsets.containsKey(suggested_offset) ) {
all_offsets.put( suggested_offset , new IntPair(id1,id2)) ;
}
}
}
for ( Map.Entry<Integer,IntPair> offset_record : all_offsets.entrySet() ) {
double d = averageDistanceForOffset(a1,a2,offset_record.getKey());
if ( d < p.d() ) p.set(offset_record.getValue().first(),offset_record.getValue().second(),d);
}
return p;
}
public double averageDistanceForOffset(MultipleAlignment a1, MultipleAlignment a2, int offset) {
SelectedPair p = new SelectedPair();
double d_av = 0;
int nseq = 0;
int i1 = -1;
int i2 = -1;
for ( Integer id2 : a2 ) {
SelectedPair spo = averageDistanceForOffset(a1,id2,offset+a2.getOffsetById(id2));
if ( spo.d() > 1e99 ) continue;
nseq++;
d_av += spo.d();
}
if ( nseq == 0 ) return 1e100;
d_av /= nseq;
return d_av;
}
/** Computes average distance from sequence i to multiple alignment a for the specified offset of i wrt a
* and returns that distance and pair of sequence indices, on which the specified offset is realized
* @param a
* @param i
* @param offset
* @return
*/
public SelectedPair averageDistanceForOffset(MultipleAlignment a, int i, int offset) {
SelectedPair p = new SelectedPair(-1,-1,1e100);
double d = 0; // will hold average distance
double dmin = 1e100; // used to find the nearest individual sequence in the pile
int nseq = 0; // number of sequences in the pile that have distance to sequence i defined
int best_id = -1;
for ( Integer id : a ) { // for all sequences from the msa pile
PairwiseAlignment pa = alignments.get(i,id);
int new_off = offset - a.getOffsetById(id); // offset of i wrt id as suggested by <offset>
double dist_for_off; // distance between i and id for the given offset off
// check if p-wise alignment has data for the specified offset:
boolean canuse = false;
if ( pa.alignmentExists() && pa.getBestOffset2wrt1(id,i) == new_off ) {
dist_for_off = distances.get(i,id);
canuse = true; // can use this alignment to stitch i to a
}
else {
// offset is different from what the pwise alignment suggests; recompute!
dist_for_off = PairwiseAlignment.distance(pa.getSequenceById(id),pa.getSequenceById(i),new_off);
}
if ( dist_for_off > 1e99 ) continue; // at this offset, i and id do not overlap, go check next id
d += dist_for_off;
nseq++;
if ( dist_for_off < dmin && canuse ) {
dmin = dist_for_off;
best_id = id;
}
}
if ( nseq == 0 ) return p;
d /= (double)nseq;
p.set(i,best_id,d);
return p;
}
/** Finds, among all sequences, the one farthest from the specified pile. Being
* the 'farthest' is defined as having the largest lower bound of the distances to all sequences in the pile.
*
@ -178,7 +375,7 @@ public class PileBuilder implements RecordPileReceiver {
double d_min = 1e100; // smallest distance from sequence i to the pile
for ( Integer id : a ) {
double d = ( i < id ? distances.get(i, id) : distances.get(id,i) );
double d = distances.get(i, id) ;
if (d < d_min ) d_min = d;
}
// d_min is the smallest distance from sequence i to pile a
@ -195,8 +392,7 @@ public class PileBuilder implements RecordPileReceiver {
double d = 1e100;
for ( Integer id1 : a1 ) {
for ( Integer id2 : a2 ) {
if ( id1 < id2 && distances.get(id1,id2) < d ) d = distances.get(id1,id2);
if ( id1 > id2 && distances.get(id2,id1) < d ) d = distances.get(id2,id1);
if ( distances.get(id1,id2) < d ) d = distances.get(id1,id2);
}
}
return d;
@ -217,7 +413,7 @@ public class PileBuilder implements RecordPileReceiver {
double d = 1e100; // will hold distance from id1 to its closest neighbor
for ( Integer id2 : a ) {
if ( id2 == id1 ) continue;
double dpair = ( id1 < id2 ? distances.get(id1,id2) : distances.get(id2,id1) );
double dpair = distances.get(id1,id2) ;
d = Math.min(d,dpair);
}
// d = distance from id1 to its closest neighbor within the pile
@ -240,7 +436,8 @@ public class PileBuilder implements RecordPileReceiver {
PileBuilder pb = new PileBuilder();
pb.doMultipleAlignment(seqs);
//pb.doMultipleAlignment(seqs);
pb.doMultipleAlignment2(seqs);
System.out.print("Distance between final piles: "+pb.distance(pb.alignments1, pb.alignments2));
System.out.print("; diameter of PILE1: "+ pb.diameter(pb.alignments1));
System.out.println("; diameter of PILE2: "+ pb.diameter(pb.alignments2));
@ -252,6 +449,8 @@ public class PileBuilder implements RecordPileReceiver {
public void doMultipleAlignment(IndexedSequence[] seqs) {
// two piles we are going to grow until all sequences are assigned to one of them.
// we intend to keep the piles disjoint, e.g. no sequence should be placed in both
MultipleAlignment pile1 = new MultipleAlignment();
MultipleAlignment pile2 = new MultipleAlignment();
@ -260,7 +459,7 @@ public class PileBuilder implements RecordPileReceiver {
// all the pairwise alignments are computed and disjoint best and next-best pairs are found
//System.out.println( distances.format("%8.4g "));
// System.out.println( distances.format("%8.4g "));
SelectedPair pworst = findWorst();
@ -268,13 +467,14 @@ public class PileBuilder implements RecordPileReceiver {
pile1.add(seqs[pworst.i()].getSequence(), pworst.i());
pile2.add(seqs[pworst.j()].getSequence(), pworst.j());
/*
// initialize piles with best and next-best pairs
/*
SelectedPair p_best = findClosestPair();
SelectedPair p_nextbest = findNextClosestPairAfter(p_best);
pile1.add( alignments.get(p_best.i(), p_best.j()),p_best.i(),p_best.j());
pile2.add( alignments.get(p_nextbest.i(), p_nextbest.j()),p_nextbest.i(),p_nextbest.j());
*/
pile1.add( alignments.get(p_best.i(), p_best.j()));
pile2.add( alignments.get(p_nextbest.i(), p_nextbest.j()));
*/
/*
System.out.println("Best pair ("+p_best.i() + "," + p_best.j()+", d="+p_best.d()+"):");
System.out.println(pile1.toString());
@ -283,39 +483,44 @@ public class PileBuilder implements RecordPileReceiver {
*/
SelectedPair p1 = null;
SelectedPair p2 = null;
// grow piles hierarchical clustering-style
while ( pile1.size() + pile2.size() < seqs.length ) {
// find candidate sequences closest to each of the two piles
p1 = findClosestToPile(pile1);
p2 = findClosestToPile(pile2);
// p1 = findClosestToPileAverage(pile1); // findClosestToPile(pile1);
// p2 = findClosestToPileAverage(pile2); //findClosestToPile(pile2);
p1 = findClosestToPile(pile1); // findClosestToPile(pile1);
p2 = findClosestToPile(pile2); //findClosestToPile(pile2);
int id1_cand = pile1.selectExternal(p1.i(), p1.j()); // id of the sequence closest to the pile 1
int id2_cand = pile2.selectExternal(p2.i(), p2.j()); // id of the sequence closest to the pile 2
if ( pile2.contains(id1_cand) && pile1.contains(id2_cand)) {
// pile1 and pile 2 are mutually the closest, so we need to merge them.
// if piles are mutually the closest, then p1 and p2 are the same pair (id1, id2),
// so we just merge on one of the (redundant) instances:
pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j());
pile1.add(pile2, alignments.get( p1.i(), p1.j()));
pile2.clear(); // need to reset pile 2 to something else
int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1
pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence
} else {
if ( p1.d() < p2.d() ) {
if ( pile2.contains(id1_cand) ) {
pile1.add(pile2, alignments.get( p1.i(), p1.j()),p1.i(), p1.j());
pile1.add(pile2, alignments.get( p1.i(), p1.j()));
pile2.clear(); // need to reset pile 2 to something else
int z = findFarthestFromPile(pile1); // get sequence farthest from merged pile 1
pile2.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence
} else pile1.add( alignments.get(p1.i(), p1.j()) );
} else {
if ( pile1.contains(id2_cand) ) {
pile2.add(pile1, alignments.get( p2.i(), p2.j()),p2.i(), p2.j());
pile2.add(pile1, alignments.get( p2.i(), p2.j()));
pile1.clear(); // need to reset pile 2 to something else
int z = findFarthestFromPile(pile2); // get sequence farthest from merged pile 1
pile1.add(seqs[z].getSequence(), z); // and reinitialize pile 2 with that sequence
} else pile2.add( alignments.get(p2.i(), p2.j()) );
}
}
System.out.println("PILE 1: \n"+pile1.toString());
System.out.println("PILE 2: \n"+pile2.toString());
} // end WHILE
alignments1 = pile1;
@ -327,6 +532,56 @@ public class PileBuilder implements RecordPileReceiver {
}
*/
}
public List<MultipleAlignment> doMultipleAlignment2(IndexedSequence[] seqs) {
initPairwiseAlignments(seqs);
List<MultipleAlignment> piles = new LinkedList<MultipleAlignment>();
int npiles = seqs.length;
for ( int i = 0 ; i < seqs.length ; i++ ) {
MultipleAlignment m = new MultipleAlignment();
m.add(seqs[i].getSequence(),i);
piles.add(m);
}
while ( npiles > 2 ) {
Matrix<SelectedPair> dist = averageClosestDistanceMatrix(piles,npiles);
int best_i = -1;
int best_j = -1;
int pile_i = -1;
int pile_j = -1;
double d = 1e100;
for ( int i = 0 ; i < npiles ; i++ ) {
for ( int j = i+1 ; j < npiles ; j++ ) {
SelectedPair p = dist.get(i,j);
if ( p.d() < d ) {
d = p.d();
pile_i = i;
pile_j = j;
best_i = p.i();
best_j = p.j();
}
}
}
// System.out.println("joining pile "+pile_i +" and pile " + pile_j +" on seqs " + best_i +" and " + best_j );
// got the closest pair
piles.get(pile_i).add(piles.get(pile_j),alignments.get(best_i,best_j));
// System.out.println("JOINED PILE: \n"+piles.get(pile_i).toString());
piles.remove(pile_j);
npiles--;
}
alignments1 = piles.get(0);
alignments2 = piles.get(1);
// System.out.println("PILE 1: \n"+piles.get(0).toString());
// System.out.println("PILE 2: \n"+piles.get(1).toString());
return piles;
}
public static IndexedSequence[] testSet1(int K) {
IndexedSequence [] seqs = new IndexedSequence[9];

View File

@ -1,81 +1,23 @@
package org.broadinstitute.sting.indels;
public class StrictlyUpperTriangularMatrix {
private double [] data;
private int size;
public class StrictlyUpperTriangularMatrix extends SymmetricMatrix {
public StrictlyUpperTriangularMatrix(int dimension) {
super(dimension);
assert dimension >=2 : "Distance matrix can not be smaller than 2x2";
if ( dimension % 2 == 0 ) {
int k = dimension >> 1; // dimension/2
data = new double[k*(dimension-1)];
} else {
int k = ( dimension -1 ) >> 1; // (dimension -1)/2
data = new double[k*dimension];
}
size = dimension;
}
public double get(int i, int j) {
assert (i < size) && (j<size) : "Out of bound index into distance matrix detected";
if ( i >= j ) return 0.0;
// we are guaranteed now that i < j; now
// translate i,j into the linear offset into our internal data array and return the value:
return data[ linearOffset(i,j) ];
return super.get(i,j);
}
public void set(int i, int j, double value) {
assert (i < size) && (j<size) : "Out of bound index into distance matrix detected";
assert i < j : "Only i < j elements can be set in strictly upper diagonal matrix" ;
// we are guaranteed now that i < j; now
// translate i,j into the linear offset into our internal data array and set the value
data[ linearOffset(i,j) ] = value;
super.set(i,j,value);
}
public int size() { return size; }
/** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!);
* each element is formatted according to a default format.
* @return
* @see format(String f)
*/
public String format() {
return format("%6.3f ");
}
/** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!);
* each element is formatted according to a specified format string (note: format string must include all desired
* whitespaces before and/or after an element, as this method itself does not add any spaces between the formatted elements).
* @return
* @see format()
*/
public String format(String f) {
StringBuilder b = new StringBuilder();
java.util.Formatter frm = new java.util.Formatter(b);
for ( int i = 0 ; i < size ; i++ ) {
for ( int j = 0 ; j < size ; j++ ) {
if ( i < j ) frm.format(f, get(i,j));
else frm.format(f, 0.0);
}
b.append('\n');
}
return b.toString();
}
/** Computes linear offset into the array that keeps actual data given "row" and "column" indices into the matrix
* this class represents; this method is unchecked, but it expects i < j otherwise the result will be incorrect.
* @param i row index
* @param j column index
* @return linear offset into the data[] member of this class
*/
private int linearOffset(int i, int j) {
int k = (( size << 1 ) - i - 1)*i; // [ 2*d - (i+1) ] * i
k >>= 1; // k/=2
// now k is the offset of the first stored element in row i
return ( k + (j - i - 1));
}
private static void testMe() {
StrictlyUpperTriangularMatrix m = new StrictlyUpperTriangularMatrix(3);

View File

@ -0,0 +1,87 @@
package org.broadinstitute.sting.indels;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Mar 22, 2009
* Time: 3:43:05 PM
* To change this template use File | Settings | File Templates.
*/
public class SymmetricMatrix {
protected double [] data;
protected int size;
public SymmetricMatrix(int dimension) {
assert dimension >= 0 : "Matrix size can not be negative";
if ( dimension % 2 == 0 ) {
int k = dimension >> 1; // dimension/2
data = new double[k*(dimension+1)];
} else {
int k = ( dimension + 1 ) >> 1; // (dimension + 1)/2
data = new double[k*dimension];
}
size = dimension;
}
public double get(int i, int j) {
assert (i < size) && ( j < size) : "Out of bound index into matrix detected";
if ( i >= j ) return data[linearOffset(j,i)]; // we store only the upper triangle in memory
return data[ linearOffset(i,j) ];
}
public void set(int i, int j, double value) {
assert (i < size) && (j < size) : "Out of bound index into matrix detected";
if ( i >= j ) data[ linearOffset(j,i) ] = value;
else data[ linearOffset(i,j) ] = value;
}
public int size() { return size; }
public int nRows() { return size; }
public int nCols() { return size; }
/** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!);
* each element is formatted according to a default format.
* @return
* @see #format(String f)
*/
public String format() {
return format("%6.3f ");
}
/** Returns ready-to-print string representing the full matrix (don't use for 1000x1000 matrices!!);
* each element is formatted according to a specified format string (note: format string must include all desired
* whitespaces before and/or after an element, as this method itself does not add any spaces between the formatted elements).
* @return
* @see #format()
*/
public String format(String f) {
StringBuilder b = new StringBuilder();
java.util.Formatter frm = new java.util.Formatter(b);
for ( int i = 0 ; i < size ; i++ ) {
for ( int j = 0 ; j < size ; j++ ) {
frm.format(f, get(i,j));
}
b.append('\n');
}
return b.toString();
}
/** Computes linear offset into the internal array that keeps actual data, given "row" and "column" indices
* into the matrix; this method is unchecked, but it expects i <= j otherwise the result is unspecified.
* @param i row index
* @param j column index
* @return linear offset into the data[] member of this class
*/
private int linearOffset(int i, int j) {
int k = (( size << 1 ) - i + 1)*i; // [ 2*d - (i+1) ] * i
k >>= 1; // k/=2
// now k is the offset of the first stored element in row i
return ( k + j - i );
}
}