a class that keeps a pile of reads and can perform some simple computations on them; does not perform multiple alignments (so far) - external tools do the job

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@71 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-03-16 21:58:05 +00:00
parent 6d481c64e7
commit 4972b03059
1 changed files with 229 additions and 0 deletions

View File

@ -0,0 +1,229 @@
package org.broadinstitute.sting.indels;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import com.sun.org.apache.bcel.internal.generic.RETURN;
public class MultipleAlignment implements Iterable<Integer> {
private static final int IMPOSSIBLE = 1000000000;
private Map<Integer,Integer> index;
private List<String> seqs;
private List<Integer> ext_ids;
private List<Integer> alignment_offsets; // offset of seqs[i] w/respect to seqs[0]
private int best_mm; // mismatch count
private int next_mm; // next-best mismatch count
public MultipleAlignment() {
index = new HashMap<Integer,Integer>();
seqs = new ArrayList<String>();
alignment_offsets = new ArrayList<Integer>();
ext_ids = new ArrayList<Integer>();
}
public void clear() {
seqs.clear();
index.clear();
alignment_offsets.clear();
ext_ids.clear();
}
/** Adds single sequence with id set to i. Pile must be empty, or IllegalStateException will be thrown
*
* @param seq sequence to add
* @param i id of the sequence (can be use later to query the pile)
*/
public void add( String seq, int i ) throws IllegalStateException {
if ( size() != 0 ) throw new IllegalStateException("Single sequence can be added to an empty pile only");
index.put(i,0);
ext_ids.add(i);
seqs.add(seq);
alignment_offsets.add(0);
}
public void add( PairwiseAlignment a) {
if ( a.id1() == -1 || a.id2() == -1 ) throw new IllegalArgumentException("Attempt to add pairwise alignemnt with sequence ids not properly set");
add(a,a.id1(),a.id2());
}
/** Adds pair of aligned sequences to the pile, with the external ids of the first and second sequences being i and j,
* respectively. Pairwise alignment can be always added to an empty pile. If the pile is non-empty and either both or
* none of the specified ids are already in the pile, an IllegalStateException will be thrown
* @param a
* @param i
* @param j
*/
public void add( PairwiseAlignment a, int i, int j ) throws IllegalStateException {
if ( seqs.size() == 0 ) {
index.put(i,0);
ext_ids.add(i);
seqs.add(a.getSequence1());
index.put(j,1);
ext_ids.add(j);
seqs.add(a.getSequence2());
alignment_offsets.add(0);
alignment_offsets.add(a.getBestOffset2wrt1());
return;
}
Integer first = index.get(i);
Integer second = index.get(j);
if ( first != null && second != null ) {
throw new IllegalStateException("Attempt to add pairwise alignment for two sequences that are already in the pile");
}
if ( first == null && second == null ) {
throw new IllegalStateException("Attempt to add pairwise alignment for two sequences none of which is already in the pile");
}
if ( second == null ) {
index.put(j,seqs.size());
seqs.add(a.getSequence2());
ext_ids.add(j);
alignment_offsets.add( a.getBestOffset2wrt1() + alignment_offsets.get( first ));
} else {
// first = null
index.put(i,seqs.size());
seqs.add(a.getSequence1());
ext_ids.add(i);
alignment_offsets.add( -a.getBestOffset2wrt1() + alignment_offsets.get( second ));
}
}
/** Returns sequence associated with the specified external id, or null if sequence with this external id is not found in the pile
*
* @param id query id
* @return sequence for specified id or null
*/
public String getSequenceById(int id) {
if ( ! contains(id)) return null;
return seqs.get(index.get(id));
}
/** Returns offset relative to the first sequence in the pile for sequence associated with the specified external id
*
* @param id query id
* @return offset for sequence with specified id
*/
public int getOffsetById(int id) {
//TODO: do something meaningful when id is not in the pile (exception?)
if ( ! contains(id)) return 0;
return alignment_offsets.get(index.get(id));
}
/** Adds pile of aligned sequences to this pile, stitching them together using specified pairwise alignment
* p of the sequences with external ids i and j. One of the indices i, j must be in this pile, and the other in the pile being added,
* otherwise an IllegalArgumentException is thrown. The piles can not overlap.
*/
public void add(MultipleAlignment a, PairwiseAlignment p, int i, int j) {
int off2; // offset of the first sequence in pile 'a' wrt the first sequence in this pile
if ( this.contains(i) ) {
if ( ! a.contains(j)) throw new IllegalArgumentException("Sequence is not in the pile");
off2 = getOffsetById(i)+p.getBestOffset2wrt1()-a.getOffsetById(j);
} else {
if ( this.contains(j)) {
if ( ! a.contains(i)) throw new IllegalArgumentException("Sequence is not in the pile");
off2 = getOffsetById(j)-p.getBestOffset2wrt1()-a.getOffsetById(i);
} else throw new IllegalArgumentException("Sequence is not in the pile");
}
// stitch sequences from a into this pile:
for ( Integer id : a ) {
if ( this.contains(id) ) throw new IllegalArgumentException("Attempt to add a pile that shares sequences with the current one");
index.put(id,seqs.size());
ext_ids.add(id);
seqs.add(a.getSequenceById(id));
alignment_offsets.add(off2+a.getOffsetById(id));
}
}
/** Returns true if the alignment already contains sequence with the specified id
*
* @param id
* @return
*/
public boolean contains(int id) {
return index.containsKey(id);
}
public int countMismatches(int i, int j) {
return PairwiseAlignment.countMismatches(getSequenceById(i), getSequenceById(j), getOffsetById(j)-getOffsetById(i));
}
/** Returns the length of the overlapping region of sequences si and sj .
*
* @return overlap size
*/
public int getOverlap(int i, int j) {
if ( ! contains(i) || ! contains(j) ) return -1;
int off = getOffsetById(j) - getOffsetById(i);
if ( off >= 0 ) {
return Math.min(getSequenceById(i).length()-off, getSequenceById(j).length());
} else {
return Math.min(getSequenceById(j).length()+off, getSequenceById(i).length());
}
}
/** Given the two indices, one of which has to be already in the pile, returns the one that is not in the pile.
*
* @param i sequence id
* @param j sequence id
* @return one of the input arguments that is not found in the pile
* @throws IllegalArgumentException when either both or none of the specified indices are in the pile
*/
public int selectExternal(int i, int j) {
if ( contains(i) ) {
if ( contains(j) ) throw new IllegalArgumentException("Can not select external when both indices are in the pile");
return j;
} else {
if ( ! contains(j) ) throw new IllegalArgumentException("Attempt to select external when both indices are not in the pile");
return i;
}
}
/** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately
* shifted and ready for printout;
*/
public String toString() {
StringBuilder b = new StringBuilder();
java.util.Formatter frmt = new java.util.Formatter(b);
if ( seqs.size() == 0 ) return b.toString();
int skip_first = 0;
for ( int i = 0 ; i < seqs.size() ; i++ ) {
if ( -alignment_offsets.get(i) > skip_first ) skip_first = -alignment_offsets.get(i);
}
frmt.format("%3d:", ext_ids.get(0));
for ( int k = 0 ; k < skip_first ; k++ ) b.append(' ');
b.append(seqs.get(0));
b.append('\n');
for ( int i = 1 ; i < seqs.size() ; i++ ) {
frmt.format("%3d:", ext_ids.get(i));
int skip = alignment_offsets.get(i) + skip_first ;
for ( int k = 0 ; k < skip ; k++ ) b.append(' ');
b.append(seqs.get(i));
b.append('\n');
}
// b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n");
return b.toString();
}
public int size() { return seqs.size(); }
/** Returns an iterator over the id's of the sequences currently stored in the pile
*
* @return
*/
public Iterator<Integer> sequenceIdIterator() { return index.keySet().iterator(); }
@Override
public Iterator<Integer> iterator() {
return sequenceIdIterator();
}
}