diff --git a/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java b/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java
index 2847972cd..75ed67954 100644
--- a/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/ConsensusSequence.java
@@ -51,7 +51,9 @@ public class ConsensusSequence {
// count bases from the sequence into the coverage
int posOnConsensus = offset - startOffset;
for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
- coverage.get(posOnConsensus)[baseToInt(seq.charAt(i))]++;
+ char base = Character.toUpperCase(seq.charAt(i));
+ if ( base == 'N') continue;
+ coverage.get(posOnConsensus)[baseToInt(base)]++;
}
}
@@ -73,10 +75,24 @@ public class ConsensusSequence {
// subtract sequence bases from the coverage
int posOnConsensus = offset - startOffset;
for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
- coverage.get(posOnConsensus)[ baseToInt(seq.charAt(i)) ]--;
+ char base = Character.toUpperCase(seq.charAt(i));
+ if ( base == 'N') continue;
+ coverage.get(posOnConsensus)[ baseToInt(base) ]--;
}
}
+ /** Returns offset of the start of consensus sequence with respect to the reference position the
+ * consensus is pinned to.
+ * @return
+ */
+ public int getStartOffset() { return startOffset; }
+
+ /** Returns the length (number of bases) of the consensus sequence.
+ *
+ * @return
+ */
+ public int length() { return coverage.size(); }
+
/** Returns the "distance" (score measuring the agreement) from the currently held consensus sequence to
* the specified sequence seq starting at position offset wrt consenus reference position.
* @param seq
@@ -97,7 +113,7 @@ public class ConsensusSequence {
int stop = Math.min(offset+seq.length(), startOffset+coverage.size() ) - offset;
for ( ; i < stop ; i++, posOnConsensus++ ) {
- int base = baseToInt(seq.charAt(posOnConsensus));
+ int base = baseToInt(Character.toUpperCase(seq.charAt(posOnConsensus)));
int [] cov = coverage.get(posOnConsensus);
int totalcov = cov[0]+cov[1]+cov[2]+cov[3];
@@ -109,7 +125,8 @@ public class ConsensusSequence {
* Specified offset must be within the span of currently held consensus sequence. Consensus base is the
* one with the maximum count of observations. If two different nucleotides were observed exactly the
* same number of times (and that number is greater than the number of observations for othe nucleotides),
- * the "lesser" one, (order being ACGT) will be returned.
+ * the "lesser" one, (order being ACGT) will be returned. If coverage at specified position is zero, 'N' will
+ * be returned.
* @param offset
* @return
*/
@@ -149,6 +166,29 @@ public class ConsensusSequence {
return new Pair(base,bmax);
}
+ /** Returns total coverage (all observations regardless of what base what observed) at position
+ * specified by offset with respect to the conensus' reference position. offset does not have to be within
+ * the bounds of the currently kept consensus sequence, if it falls outside, a 0 will be silently returned.
+ * @param offset
+ * @return
+ */
+ public int coverageAt(int offset) {
+ if ( offset < startOffset || offset >= startOffset + coverage.size() ) return 0;
+ int [] cov = coverage.get(offset+(int)referencePos);
+ return cov[0]+cov[1]+cov[2]+cov[3];
+ }
+
+ /** Returns consesus sequence as a astring of bases (ACGTN); N will be returned for positions with zero
+ * coverage.
+ * @return
+ */
+ public String getSequence() {
+ char [] b = new char[coverage.size()];
+ for ( int i = 0 ; i < b.length ; i++ ) {
+ b[i] = baseAt(i+startOffset);
+ }
+ return new String(b);
+ }
private List instantiateCoverageList(int n) {
List< int[] > subseq = new ArrayList(n);
@@ -163,7 +203,8 @@ public class ConsensusSequence {
case 'C': base = 1; break;
case 'G': base = 2; break;
case 'T': base = 3; break;
- default : throw new IllegalArgumentException("Sequence can contain only ACGT symbols");
+ case 'N': base = -1; break;
+ default : throw new IllegalArgumentException("Sequence can contain only ACGTN symbols");
}
return base;
}
diff --git a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java
index 893c1bedf..7116b4264 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/IndelInspector.java
@@ -36,7 +36,7 @@ public class IndelInspector extends CommandLineProgram {
protected int doWork() {
- System.out.println("I am at version 0.2");
+ System.out.println("I am at version 0.3");
GenomeLoc location = null;
if ( GENOME_LOCATION != null ) {
location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION);
diff --git a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java
index 88d759e9e..6586d6301 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/MultipleAlignment.java
@@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.indels;
import java.util.*;
import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.Pair;
public class MultipleAlignment implements Iterable {
@@ -12,12 +13,14 @@ public class MultipleAlignment implements Iterable {
private List alignment_offsets; // offset of seqs[i] w/respect to seqs[0] (i.e. in order the seqs were added)
private int best_mm; // mismatch count
private int next_mm; // next-best mismatch count
-
+ private ConsensusSequence consensus;
+
public MultipleAlignment() {
index = new HashMap();
seqs = new ArrayList();
alignment_offsets = new ArrayList();
ext_ids = new ArrayList();
+ consensus = new ConsensusSequence(); // we use reference position 0, e.g. we hook onto the first read in the pile
}
public void clear() {
@@ -31,37 +34,46 @@ public class MultipleAlignment implements Iterable {
*
* @param seq sequence to add
* @param i id of the sequence (can be use later to query the pile)
+ * @see #add(String,int,int)
*/
public void add( String seq, int i ) throws IllegalStateException {
if ( size() != 0 ) throw new IllegalStateException("Single sequence can be added to an empty pile only");
- index.put(i,0);
- ext_ids.add(i);
- seqs.add(seq);
- alignment_offsets.add(0);
+ add(seq,i,0);
}
+ /** Adds single sequence with id set to i and places it at the specified offset wrt the first sequence
+ * in this pile (i.e. wrt reference position 0).
+ *
+ * @param seq sequence to add
+ * @param i id of the sequence (can be use later to query the pile)
+ * @see #add(String,int)
+ */
+ public void add( String seq, int i, int offset ) throws IllegalStateException {
+ index.put(i,index.size());
+ ext_ids.add(i);
+ seqs.add(seq);
+ alignment_offsets.add(offset);
+ consensus.addSequence(seq,offset);
+ }
+
public void add( PairwiseAlignment a) {
if ( a.id1() == -1 || a.id2() == -1 ) throw new IllegalArgumentException("Attempt to add pairwise alignemnt with sequence ids not properly set");
add(a,a.id1(),a.id2());
}
/** Adds pair of aligned sequences to the pile, with the external ids of the first and second sequences being i and j,
- * respectively. Pairwise alignment can be always added to an empty pile. If the pile is non-empty and either both or
- * none of the specified ids are already in the pile, an IllegalStateException will be thrown
+ * respectively. Pairwise alignment can be always added to an empty pile. If the pile is non-empty, exactly
+ * one of the sequences held by the pair-wise alignment should be already in the pile; this sequence (and the
+ * pairwise alignment itself) will be used to stitch the other sequence to the pile. If either both or
+ * none of the specified ids are already in the pile, an IllegalStateException will be thrown.
* @param a
* @param i
* @param j
*/
public void add( PairwiseAlignment a, int i, int j ) throws IllegalStateException {
if ( seqs.size() == 0 ) {
- index.put(i,0);
- ext_ids.add(i);
- seqs.add(a.getSequence1());
- index.put(j,1);
- ext_ids.add(j);
- seqs.add(a.getSequence2());
- alignment_offsets.add(0);
- alignment_offsets.add(a.getBestOffset2wrt1());
+ add(a.getSequence1(),i,0);
+ add(a.getSequence2(),j,a.getBestOffset2wrt1());
return;
}
@@ -76,22 +88,49 @@ public class MultipleAlignment implements Iterable {
throw new IllegalStateException("Attempt to add pairwise alignment for two sequences none of which is already in the pile");
}
- if ( second == null ) {
- index.put(j,seqs.size());
- seqs.add(a.getSequence2());
- ext_ids.add(j);
- alignment_offsets.add( a.getBestOffset2wrt1() + alignment_offsets.get( first ));
- } else {
- // first = null
- index.put(i,seqs.size());
- seqs.add(a.getSequence1());
- ext_ids.add(i);
- alignment_offsets.add( -a.getBestOffset2wrt1() + alignment_offsets.get( second ));
- }
+ if ( second == null ) add(a.getSequence2(),j, a.getBestOffset2wrt1() + alignment_offsets.get( first ) );
+ else add(a.getSequence1(),i, -a.getBestOffset2wrt1() + alignment_offsets.get( second ) );
}
+ /** Adds another pile of aligned sequences to this pile, stitching them together using specified pairwise alignment
+ * p of the sequences with external ids i and j. One of the indices i, j must be in this pile, and the other in
+ * the pile being added, otherwise an IllegalArgumentException is thrown. Sequence id's i and j MUST be the ids
+ * of the first and second sequences in the pairwise alignment, in that order. Specified ids override
+ * ids, if any, set for the sequences in the pairwise alignment; it is not checked whether the specified and
+ * stored ids match. The piles can not overlap.
+ */
+ public void add(MultipleAlignment a, PairwiseAlignment p, int i, int j) {
+ int off2; // offset of the first sequence in pile 'a' wrt the first sequence in this pile
+ if ( this.contains(i) ) {
+ if ( ! a.contains(j)) throw new IllegalArgumentException("Sequence is not in the pile");
+ off2 = getOffsetById(i)+p.getBestOffset2wrt1()-a.getOffsetById(j);
+ } else {
+ if ( this.contains(j)) {
+ if ( ! a.contains(i)) throw new IllegalArgumentException("Sequence is not in the pile");
+ off2 = getOffsetById(j)-p.getBestOffset2wrt1()-a.getOffsetById(i);
+ } else throw new IllegalArgumentException("Sequence is not in the pile");
+ }
+ // stitch sequences from a into this pile:
+ for ( Integer id : a ) {
+ if ( this.contains(id) ) throw new IllegalArgumentException("Attempt to add a pile that shares sequences with the current one");
+ add(a.getSequenceById(id),id,off2+a.getOffsetById(id));
+ }
+ }
- /** Returns sequence associated with the specified external id, or null if sequence with this external id is not found in the pile
+
+ /** Adds another pile of aligned sequences (a) to this pile, stitching them together using specified
+ * pairwise alignment p. Sequence ids must be set in the pairwise alignment, and one of those ids
+ * must be in this pile, and the other in the pile 'a' being added, otherwise an IllegalArgumentException
+ * is thrown. If pairwise alignment does not have sequence ids set, IllegalArgumentException is thrown.
+ * The piles can not overlap.
+ */
+ public void add(MultipleAlignment a, PairwiseAlignment p) {
+ if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set");
+ add(a,p,p.id1(),p.id2());
+ }
+
+ /** Returns sequence associated with the specified external id, or null if sequence with this external id is
+ * not found in the pile
*
* @param id query id
* @return sequence for specified id or null
@@ -101,49 +140,21 @@ public class MultipleAlignment implements Iterable {
return seqs.get(index.get(id));
}
- /** Returns offset relative to the first sequence in the pile for sequence associated with the specified external id
+ /** Returns offset relative to the first sequence in the pile for sequence associated with the specified
+ * external id. If sequence with specified id is not found in the pile, RuntimeException is thrown.
*
* @param id query id
* @return offset for sequence with specified id
*/
public int getOffsetById(int id) {
- //TODO: do something meaningful when id is not in the pile (exception?)
- if ( ! contains(id)) return 0;
+ if ( ! contains(id) ) throw new RuntimeException("Specified id is not in the pile");
return alignment_offsets.get(index.get(id));
}
- /** Adds pile of aligned sequences to this pile, stitching them together using specified pairwise alignment
- * p of the sequences with external ids i and j. One of the indices i, j must be in this pile, and the other in the pile being added,
- * otherwise an IllegalArgumentException is thrown. The piles can not overlap.
- */
- public void add(MultipleAlignment a, PairwiseAlignment p, int i, int j) {
- int off2; // offset of the first sequence in pile 'a' wrt the first sequence in this pile
- if ( this.contains(i) ) {
- if ( ! a.contains(j)) throw new IllegalArgumentException("Sequence is not in the pile");
- off2 = getOffsetById(i)+p.getBestOffset2wrt1()-a.getOffsetById(j);
- } else {
- if ( this.contains(j)) {
- if ( ! a.contains(i)) throw new IllegalArgumentException("Sequence is not in the pile");
- off2 = getOffsetById(j)-p.getBestOffset2wrt1()-a.getOffsetById(i);
- } else throw new IllegalArgumentException("Sequence is not in the pile");
- }
- // stitch sequences from a into this pile:
- for ( Integer id : a ) {
- if ( this.contains(id) ) throw new IllegalArgumentException("Attempt to add a pile that shares sequences with the current one");
- index.put(id,seqs.size());
- ext_ids.add(id);
- seqs.add(a.getSequenceById(id));
- alignment_offsets.add(off2+a.getOffsetById(id));
- }
- }
- public void add(MultipleAlignment a, PairwiseAlignment p) {
- if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set");
- add(a,p,p.id1(),p.id2());
- }
- /** Returns true if the alignment already contains sequence with the specified id
+ /** Returns true if the alignment already contains sequence with the specified id.
*
* @param id
* @return
@@ -151,7 +162,15 @@ public class MultipleAlignment implements Iterable {
public boolean contains(int id) {
return index.containsKey(id);
}
-
+
+ /** Returns number of mismatches between sequences i and j (external ids) in the currently held multiple alignment.
+ * Will return 0 if sequences do not overlap. Will throw RuntimeException if any of the specified ids is not
+ * found in the current pile.
+ * @param i id of the first sequence
+ * @param j id of the second sequence
+ * @return mismatch count
+ *
+ * */
public int countMismatches(int i, int j) {
return PairwiseAlignment.countMismatches(getSequenceById(i), getSequenceById(j), getOffsetById(j)-getOffsetById(i));
}
@@ -216,58 +235,63 @@ public class MultipleAlignment implements Iterable {
if ( seqs.size() == 0 ) return b.toString();
- int skip_first = 0;
- int msa_length = 0;
- for ( int i = 0 ; i < seqs.size() ; i++ ) {
- if ( -alignment_offsets.get(i) > skip_first ) skip_first = -alignment_offsets.get(i);
- msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length );
- }
+// int first_offset = 0; // offset of the first sequence wrt the start of the MSA
+ final int first_offset = -consensus.getStartOffset();
+// int msa_length = 0;
+// for ( int i = 0 ; i < seqs.size() ; i++ ) {
+// if ( -alignment_offsets.get(i) > first_offset ) first_offset = -alignment_offsets.get(i);
+// msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length );
+// }
- msa_length += skip_first;
- int [] cov = new int[4];
- char[] bases = { 'A' , 'C', 'G', 'T' };
- char[][] consensus = new char[4][msa_length];
+// msa_length += first_offset;
+ final int msa_length = consensus.length();
+// int [] cov = new int[4];
+// char[] bases = { 'A' , 'C', 'G', 'T' };
+ char[][] consensusString = new char[4][msa_length];
- for ( int i = 0 ; i < msa_length ; i++ ) {
- cov[0] = cov[1] = cov[2] = cov[3] = 0;
- for ( int j = 0 ; j < seqs.size(); j++ ) {
- // offset of the sequence j wrt start of the msa region
- int seq_offset = skip_first + alignment_offsets.get(j);
- if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i
- int base = -1;
- switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) {
- case 'A': base = 0; break;
- case 'C': base = 1 ; break;
- case 'G': base = 2 ; break;
- case 'T': base = 3 ; break;
- }
- if ( base >= 0 ) cov[base]++;
- }
- int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
- int bmax = 0;
- int mm = 0;
- consensus[3][i] = 'N';
- for ( int z = 0; z < 4 ; z++ ) {
- if ( cov[z] > bmax ) {
- bmax = cov[z];
- consensus[3][i] = bases[z];
- mm = total_cov - bmax;
- }
- }
+ for ( int i = -first_offset ; i < msa_length - first_offset ; i++ ) {
+// cov[0] = cov[1] = cov[2] = cov[3] = 0;
+// for ( int j = 0 ; j < seqs.size(); j++ ) {
+// // offset of the sequence j wrt start of the msa region
+// int seq_offset = first_offset + alignment_offsets.get(j);
+// if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i
+// int base = -1;
+// switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) {
+// case 'A': base = 0; break;
+// case 'C': base = 1 ; break;
+// case 'G': base = 2 ; break;
+// case 'T': base = 3 ; break;
+// }
+// if ( base >= 0 ) cov[base]++;
+// }
+// int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
+// int bmax = 0;
+// int mm = 0;
+
+ Pair base = consensus.baseWithCountAt(i-first_offset);
+ consensusString[3][i] = base.first;
+// for ( int z = 0; z < 4 ; z++ ) {
+// if ( cov[z] > bmax ) {
+// bmax = cov[z];
+// consensus[3][i] = bases[z];
+// mm = total_cov - bmax;
+// }
+// }
+ int mm = consensus.coverageAt(i) - base.second;
if ( mm > 0 ) {
- consensus[2][i] = '*';
- if ( mm > 9 ) consensus[0][i] = Character.forDigit(mm/10,10);
- else consensus[0][i] = ' ';
- consensus[1][i] = Character.forDigit(mm%10,10);
+ consensusString[2][i] = '*';
+ if ( mm > 9 ) consensusString[0][i] = Character.forDigit(mm/10,10);
+ else consensusString[0][i] = ' ';
+ consensusString[1][i] = Character.forDigit(mm%10,10);
} else {
- consensus[0][i] = consensus[1][i] = consensus[2][i] = ' ';
+ consensusString[0][i] = consensusString[1][i] = consensusString[2][i] = ' ';
}
}
- b.append(" "); b.append(consensus[0]); b.append('\n');
- b.append(" "); b.append(consensus[1]); b.append('\n');
- b.append(" "); b.append(consensus[2]); b.append('\n');
- b.append(" "); b.append(consensus[3]); b.append('\n');
+ b.append(" "); b.append(consensusString[0]); b.append('\n');
+ b.append(" "); b.append(consensusString[1]); b.append('\n');
+ b.append(" "); b.append(consensusString[2]); b.append('\n');
+ b.append(" "); b.append(consensusString[3]); b.append('\n');
Integer[] perm = null;
if ( inorder ) perm = Utils.SortPermutation(alignment_offsets);
@@ -275,7 +299,7 @@ public class MultipleAlignment implements Iterable {
for ( int i = 0 ; i < seqs.size() ; i++ ) {
int index = (inorder ? perm[i] : i);
frmt.format("%3d:", ext_ids.get(index));
- skipN(alignment_offsets.get(index)+skip_first,b);
+ skipN(alignment_offsets.get(index)+ first_offset,b);
b.append(seqs.get(index));
b.append('\n');
}
diff --git a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java
index ceb6bf8d6..4b4712a37 100755
--- a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java
+++ b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java
@@ -103,6 +103,9 @@ public class PileBuilder implements RecordPileReceiver {
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
List piles = doMultipleAlignment2(seqs);
+
+
+
// System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
// System.out.print("; diameter of PILE1: "+ diameter(alignments1));
// System.out.println("; diameter of PILE2: "+ diameter(alignments2));