git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@166 348d0f76-0448-11de-a6fe-93d51630548a

This commit is contained in:
asivache 2009-03-24 14:06:24 +00:00
parent a3b8830855
commit 3d1e0bf079
4 changed files with 181 additions and 113 deletions

View File

@ -51,7 +51,9 @@ public class ConsensusSequence {
// count bases from the sequence into the coverage // count bases from the sequence into the coverage
int posOnConsensus = offset - startOffset; int posOnConsensus = offset - startOffset;
for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) { for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
coverage.get(posOnConsensus)[baseToInt(seq.charAt(i))]++; char base = Character.toUpperCase(seq.charAt(i));
if ( base == 'N') continue;
coverage.get(posOnConsensus)[baseToInt(base)]++;
} }
} }
@ -73,10 +75,24 @@ public class ConsensusSequence {
// subtract sequence bases from the coverage // subtract sequence bases from the coverage
int posOnConsensus = offset - startOffset; int posOnConsensus = offset - startOffset;
for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) { for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) {
coverage.get(posOnConsensus)[ baseToInt(seq.charAt(i)) ]--; char base = Character.toUpperCase(seq.charAt(i));
if ( base == 'N') continue;
coverage.get(posOnConsensus)[ baseToInt(base) ]--;
} }
} }
/** Returns offset of the start of consensus sequence with respect to the reference position the
* consensus is pinned to.
* @return
*/
public int getStartOffset() { return startOffset; }
/** Returns the length (number of bases) of the consensus sequence.
*
* @return
*/
public int length() { return coverage.size(); }
/** Returns the "distance" (score measuring the agreement) from the currently held consensus sequence to /** Returns the "distance" (score measuring the agreement) from the currently held consensus sequence to
* the specified sequence <code>seq</code> starting at position <code>offset</code> wrt consenus reference position. * the specified sequence <code>seq</code> starting at position <code>offset</code> wrt consenus reference position.
* @param seq * @param seq
@ -97,7 +113,7 @@ public class ConsensusSequence {
int stop = Math.min(offset+seq.length(), startOffset+coverage.size() ) - offset; int stop = Math.min(offset+seq.length(), startOffset+coverage.size() ) - offset;
for ( ; i < stop ; i++, posOnConsensus++ ) { for ( ; i < stop ; i++, posOnConsensus++ ) {
int base = baseToInt(seq.charAt(posOnConsensus)); int base = baseToInt(Character.toUpperCase(seq.charAt(posOnConsensus)));
int [] cov = coverage.get(posOnConsensus); int [] cov = coverage.get(posOnConsensus);
int totalcov = cov[0]+cov[1]+cov[2]+cov[3]; int totalcov = cov[0]+cov[1]+cov[2]+cov[3];
@ -109,7 +125,8 @@ public class ConsensusSequence {
* Specified offset must be within the span of currently held consensus sequence. Consensus base is the * Specified offset must be within the span of currently held consensus sequence. Consensus base is the
* one with the maximum count of observations. If two different nucleotides were observed exactly the * one with the maximum count of observations. If two different nucleotides were observed exactly the
* same number of times (and that number is greater than the number of observations for othe nucleotides), * same number of times (and that number is greater than the number of observations for othe nucleotides),
* the "lesser" one, (order being ACGT) will be returned. * the "lesser" one, (order being ACGT) will be returned. If coverage at specified position is zero, 'N' will
* be returned.
* @param offset * @param offset
* @return * @return
*/ */
@ -149,6 +166,29 @@ public class ConsensusSequence {
return new Pair<Character,Integer>(base,bmax); return new Pair<Character,Integer>(base,bmax);
} }
/** Returns total coverage (all observations regardless of what base what observed) at position
* specified by offset with respect to the conensus' reference position. offset does not have to be within
* the bounds of the currently kept consensus sequence, if it falls outside, a 0 will be silently returned.
* @param offset
* @return
*/
public int coverageAt(int offset) {
if ( offset < startOffset || offset >= startOffset + coverage.size() ) return 0;
int [] cov = coverage.get(offset+(int)referencePos);
return cov[0]+cov[1]+cov[2]+cov[3];
}
/** Returns consesus sequence as a astring of bases (ACGTN); N will be returned for positions with zero
* coverage.
* @return
*/
public String getSequence() {
char [] b = new char[coverage.size()];
for ( int i = 0 ; i < b.length ; i++ ) {
b[i] = baseAt(i+startOffset);
}
return new String(b);
}
private List<int[]> instantiateCoverageList(int n) { private List<int[]> instantiateCoverageList(int n) {
List< int[] > subseq = new ArrayList<int[] >(n); List< int[] > subseq = new ArrayList<int[] >(n);
@ -163,7 +203,8 @@ public class ConsensusSequence {
case 'C': base = 1; break; case 'C': base = 1; break;
case 'G': base = 2; break; case 'G': base = 2; break;
case 'T': base = 3; break; case 'T': base = 3; break;
default : throw new IllegalArgumentException("Sequence can contain only ACGT symbols"); case 'N': base = -1; break;
default : throw new IllegalArgumentException("Sequence can contain only ACGTN symbols");
} }
return base; return base;
} }

View File

@ -36,7 +36,7 @@ public class IndelInspector extends CommandLineProgram {
protected int doWork() { protected int doWork() {
System.out.println("I am at version 0.2"); System.out.println("I am at version 0.3");
GenomeLoc location = null; GenomeLoc location = null;
if ( GENOME_LOCATION != null ) { if ( GENOME_LOCATION != null ) {
location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION); location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION);

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.indels;
import java.util.*; import java.util.*;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pair;
public class MultipleAlignment implements Iterable<Integer> { public class MultipleAlignment implements Iterable<Integer> {
@ -12,12 +13,14 @@ public class MultipleAlignment implements Iterable<Integer> {
private List<Integer> alignment_offsets; // offset of seqs[i] w/respect to seqs[0] (i.e. in order the seqs were added) private List<Integer> alignment_offsets; // offset of seqs[i] w/respect to seqs[0] (i.e. in order the seqs were added)
private int best_mm; // mismatch count private int best_mm; // mismatch count
private int next_mm; // next-best mismatch count private int next_mm; // next-best mismatch count
private ConsensusSequence consensus;
public MultipleAlignment() { public MultipleAlignment() {
index = new HashMap<Integer,Integer>(); index = new HashMap<Integer,Integer>();
seqs = new ArrayList<String>(); seqs = new ArrayList<String>();
alignment_offsets = new ArrayList<Integer>(); alignment_offsets = new ArrayList<Integer>();
ext_ids = new ArrayList<Integer>(); ext_ids = new ArrayList<Integer>();
consensus = new ConsensusSequence(); // we use reference position 0, e.g. we hook onto the first read in the pile
} }
public void clear() { public void clear() {
@ -31,37 +34,46 @@ public class MultipleAlignment implements Iterable<Integer> {
* *
* @param seq sequence to add * @param seq sequence to add
* @param i id of the sequence (can be use later to query the pile) * @param i id of the sequence (can be use later to query the pile)
* @see #add(String,int,int)
*/ */
public void add( String seq, int i ) throws IllegalStateException { public void add( String seq, int i ) throws IllegalStateException {
if ( size() != 0 ) throw new IllegalStateException("Single sequence can be added to an empty pile only"); if ( size() != 0 ) throw new IllegalStateException("Single sequence can be added to an empty pile only");
index.put(i,0); add(seq,i,0);
ext_ids.add(i);
seqs.add(seq);
alignment_offsets.add(0);
} }
/** Adds single sequence with id set to i and places it at the specified offset wrt the first sequence
* in this pile (i.e. wrt reference position 0).
*
* @param seq sequence to add
* @param i id of the sequence (can be use later to query the pile)
* @see #add(String,int)
*/
public void add( String seq, int i, int offset ) throws IllegalStateException {
index.put(i,index.size());
ext_ids.add(i);
seqs.add(seq);
alignment_offsets.add(offset);
consensus.addSequence(seq,offset);
}
public void add( PairwiseAlignment a) { public void add( PairwiseAlignment a) {
if ( a.id1() == -1 || a.id2() == -1 ) throw new IllegalArgumentException("Attempt to add pairwise alignemnt with sequence ids not properly set"); if ( a.id1() == -1 || a.id2() == -1 ) throw new IllegalArgumentException("Attempt to add pairwise alignemnt with sequence ids not properly set");
add(a,a.id1(),a.id2()); add(a,a.id1(),a.id2());
} }
/** Adds pair of aligned sequences to the pile, with the external ids of the first and second sequences being i and j, /** Adds pair of aligned sequences to the pile, with the external ids of the first and second sequences being i and j,
* respectively. Pairwise alignment can be always added to an empty pile. If the pile is non-empty and either both or * respectively. Pairwise alignment can be always added to an empty pile. If the pile is non-empty, exactly
* none of the specified ids are already in the pile, an IllegalStateException will be thrown * one of the sequences held by the pair-wise alignment should be already in the pile; this sequence (and the
* pairwise alignment itself) will be used to stitch the other sequence to the pile. If either both or
* none of the specified ids are already in the pile, an IllegalStateException will be thrown.
* @param a * @param a
* @param i * @param i
* @param j * @param j
*/ */
public void add( PairwiseAlignment a, int i, int j ) throws IllegalStateException { public void add( PairwiseAlignment a, int i, int j ) throws IllegalStateException {
if ( seqs.size() == 0 ) { if ( seqs.size() == 0 ) {
index.put(i,0); add(a.getSequence1(),i,0);
ext_ids.add(i); add(a.getSequence2(),j,a.getBestOffset2wrt1());
seqs.add(a.getSequence1());
index.put(j,1);
ext_ids.add(j);
seqs.add(a.getSequence2());
alignment_offsets.add(0);
alignment_offsets.add(a.getBestOffset2wrt1());
return; return;
} }
@ -76,22 +88,49 @@ public class MultipleAlignment implements Iterable<Integer> {
throw new IllegalStateException("Attempt to add pairwise alignment for two sequences none of which is already in the pile"); throw new IllegalStateException("Attempt to add pairwise alignment for two sequences none of which is already in the pile");
} }
if ( second == null ) { if ( second == null ) add(a.getSequence2(),j, a.getBestOffset2wrt1() + alignment_offsets.get( first ) );
index.put(j,seqs.size()); else add(a.getSequence1(),i, -a.getBestOffset2wrt1() + alignment_offsets.get( second ) );
seqs.add(a.getSequence2());
ext_ids.add(j);
alignment_offsets.add( a.getBestOffset2wrt1() + alignment_offsets.get( first ));
} else {
// first = null
index.put(i,seqs.size());
seqs.add(a.getSequence1());
ext_ids.add(i);
alignment_offsets.add( -a.getBestOffset2wrt1() + alignment_offsets.get( second ));
}
} }
/** Adds another pile of aligned sequences to this pile, stitching them together using specified pairwise alignment
* p of the sequences with external ids i and j. One of the indices i, j must be in this pile, and the other in
* the pile being added, otherwise an IllegalArgumentException is thrown. Sequence id's i and j MUST be the ids
* of the first and second sequences in the pairwise alignment, in that order. Specified ids override
* ids, if any, set for the sequences in the pairwise alignment; it is not checked whether the specified and
* stored ids match. The piles can not overlap.
*/
public void add(MultipleAlignment a, PairwiseAlignment p, int i, int j) {
int off2; // offset of the first sequence in pile 'a' wrt the first sequence in this pile
if ( this.contains(i) ) {
if ( ! a.contains(j)) throw new IllegalArgumentException("Sequence is not in the pile");
off2 = getOffsetById(i)+p.getBestOffset2wrt1()-a.getOffsetById(j);
} else {
if ( this.contains(j)) {
if ( ! a.contains(i)) throw new IllegalArgumentException("Sequence is not in the pile");
off2 = getOffsetById(j)-p.getBestOffset2wrt1()-a.getOffsetById(i);
} else throw new IllegalArgumentException("Sequence is not in the pile");
}
// stitch sequences from a into this pile:
for ( Integer id : a ) {
if ( this.contains(id) ) throw new IllegalArgumentException("Attempt to add a pile that shares sequences with the current one");
add(a.getSequenceById(id),id,off2+a.getOffsetById(id));
}
}
/** Returns sequence associated with the specified external id, or null if sequence with this external id is not found in the pile
/** Adds another pile of aligned sequences (a) to this pile, stitching them together using specified
* pairwise alignment p. Sequence ids must be set in the pairwise alignment, and one of those ids
* must be in this pile, and the other in the pile 'a' being added, otherwise an IllegalArgumentException
* is thrown. If pairwise alignment does not have sequence ids set, IllegalArgumentException is thrown.
* The piles can not overlap.
*/
public void add(MultipleAlignment a, PairwiseAlignment p) {
if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set");
add(a,p,p.id1(),p.id2());
}
/** Returns sequence associated with the specified external id, or null if sequence with this external id is
* not found in the pile
* *
* @param id query id * @param id query id
* @return sequence for specified id or null * @return sequence for specified id or null
@ -101,49 +140,21 @@ public class MultipleAlignment implements Iterable<Integer> {
return seqs.get(index.get(id)); return seqs.get(index.get(id));
} }
/** Returns offset relative to the first sequence in the pile for sequence associated with the specified external id /** Returns offset relative to the first sequence in the pile for sequence associated with the specified
* external id. If sequence with specified id is not found in the pile, RuntimeException is thrown.
* *
* @param id query id * @param id query id
* @return offset for sequence with specified id * @return offset for sequence with specified id
*/ */
public int getOffsetById(int id) { public int getOffsetById(int id) {
//TODO: do something meaningful when id is not in the pile (exception?) if ( ! contains(id) ) throw new RuntimeException("Specified id is not in the pile");
if ( ! contains(id)) return 0;
return alignment_offsets.get(index.get(id)); return alignment_offsets.get(index.get(id));
} }
/** Adds pile of aligned sequences to this pile, stitching them together using specified pairwise alignment
* p of the sequences with external ids i and j. One of the indices i, j must be in this pile, and the other in the pile being added,
* otherwise an IllegalArgumentException is thrown. The piles can not overlap.
*/
public void add(MultipleAlignment a, PairwiseAlignment p, int i, int j) {
int off2; // offset of the first sequence in pile 'a' wrt the first sequence in this pile
if ( this.contains(i) ) {
if ( ! a.contains(j)) throw new IllegalArgumentException("Sequence is not in the pile");
off2 = getOffsetById(i)+p.getBestOffset2wrt1()-a.getOffsetById(j);
} else {
if ( this.contains(j)) {
if ( ! a.contains(i)) throw new IllegalArgumentException("Sequence is not in the pile");
off2 = getOffsetById(j)-p.getBestOffset2wrt1()-a.getOffsetById(i);
} else throw new IllegalArgumentException("Sequence is not in the pile");
}
// stitch sequences from a into this pile:
for ( Integer id : a ) {
if ( this.contains(id) ) throw new IllegalArgumentException("Attempt to add a pile that shares sequences with the current one");
index.put(id,seqs.size());
ext_ids.add(id);
seqs.add(a.getSequenceById(id));
alignment_offsets.add(off2+a.getOffsetById(id));
}
}
public void add(MultipleAlignment a, PairwiseAlignment p) {
if ( p.id1() == -1 || p.id2() == -1 ) throw new IllegalArgumentException("Attempt to add MSA based on pairwise alignemnt with sequence ids not properly set");
add(a,p,p.id1(),p.id2());
}
/** Returns true if the alignment already contains sequence with the specified id /** Returns true if the alignment already contains sequence with the specified id.
* *
* @param id * @param id
* @return * @return
@ -151,7 +162,15 @@ public class MultipleAlignment implements Iterable<Integer> {
public boolean contains(int id) { public boolean contains(int id) {
return index.containsKey(id); return index.containsKey(id);
} }
/** Returns number of mismatches between sequences i and j (external ids) in the currently held multiple alignment.
* Will return 0 if sequences do not overlap. Will throw RuntimeException if any of the specified ids is not
* found in the current pile.
* @param i id of the first sequence
* @param j id of the second sequence
* @return mismatch count
*
* */
public int countMismatches(int i, int j) { public int countMismatches(int i, int j) {
return PairwiseAlignment.countMismatches(getSequenceById(i), getSequenceById(j), getOffsetById(j)-getOffsetById(i)); return PairwiseAlignment.countMismatches(getSequenceById(i), getSequenceById(j), getOffsetById(j)-getOffsetById(i));
} }
@ -216,58 +235,63 @@ public class MultipleAlignment implements Iterable<Integer> {
if ( seqs.size() == 0 ) return b.toString(); if ( seqs.size() == 0 ) return b.toString();
int skip_first = 0; // int first_offset = 0; // offset of the first sequence wrt the start of the MSA
int msa_length = 0; final int first_offset = -consensus.getStartOffset();
for ( int i = 0 ; i < seqs.size() ; i++ ) { // int msa_length = 0;
if ( -alignment_offsets.get(i) > skip_first ) skip_first = -alignment_offsets.get(i); // for ( int i = 0 ; i < seqs.size() ; i++ ) {
msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length ); // if ( -alignment_offsets.get(i) > first_offset ) first_offset = -alignment_offsets.get(i);
} // msa_length = Math.max( alignment_offsets.get(i)+seqs.get(i).length() , msa_length );
// }
msa_length += skip_first; // msa_length += first_offset;
int [] cov = new int[4]; final int msa_length = consensus.length();
char[] bases = { 'A' , 'C', 'G', 'T' }; // int [] cov = new int[4];
char[][] consensus = new char[4][msa_length]; // char[] bases = { 'A' , 'C', 'G', 'T' };
char[][] consensusString = new char[4][msa_length];
for ( int i = 0 ; i < msa_length ; i++ ) { for ( int i = -first_offset ; i < msa_length - first_offset ; i++ ) {
cov[0] = cov[1] = cov[2] = cov[3] = 0; // cov[0] = cov[1] = cov[2] = cov[3] = 0;
for ( int j = 0 ; j < seqs.size(); j++ ) { // for ( int j = 0 ; j < seqs.size(); j++ ) {
// offset of the sequence j wrt start of the msa region // // offset of the sequence j wrt start of the msa region
int seq_offset = skip_first + alignment_offsets.get(j); // int seq_offset = first_offset + alignment_offsets.get(j);
if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i // if ( i < seq_offset || i >= seq_offset + seqs.get(j).length() ) continue; // sequence j has no bases at position i
int base = -1; // int base = -1;
switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) { // switch( Character.toUpperCase(seqs.get(j).charAt(i-seq_offset)) ) {
case 'A': base = 0; break; // case 'A': base = 0; break;
case 'C': base = 1 ; break; // case 'C': base = 1 ; break;
case 'G': base = 2 ; break; // case 'G': base = 2 ; break;
case 'T': base = 3 ; break; // case 'T': base = 3 ; break;
} // }
if ( base >= 0 ) cov[base]++; // if ( base >= 0 ) cov[base]++;
} // }
int total_cov = cov[0] + cov[1] + cov[2] + cov[3]; // int total_cov = cov[0] + cov[1] + cov[2] + cov[3];
int bmax = 0; // int bmax = 0;
int mm = 0; // int mm = 0;
consensus[3][i] = 'N';
for ( int z = 0; z < 4 ; z++ ) { Pair<Character,Integer> base = consensus.baseWithCountAt(i-first_offset);
if ( cov[z] > bmax ) { consensusString[3][i] = base.first;
bmax = cov[z]; // for ( int z = 0; z < 4 ; z++ ) {
consensus[3][i] = bases[z]; // if ( cov[z] > bmax ) {
mm = total_cov - bmax; // bmax = cov[z];
} // consensus[3][i] = bases[z];
} // mm = total_cov - bmax;
// }
// }
int mm = consensus.coverageAt(i) - base.second;
if ( mm > 0 ) { if ( mm > 0 ) {
consensus[2][i] = '*'; consensusString[2][i] = '*';
if ( mm > 9 ) consensus[0][i] = Character.forDigit(mm/10,10); if ( mm > 9 ) consensusString[0][i] = Character.forDigit(mm/10,10);
else consensus[0][i] = ' '; else consensusString[0][i] = ' ';
consensus[1][i] = Character.forDigit(mm%10,10); consensusString[1][i] = Character.forDigit(mm%10,10);
} else { } else {
consensus[0][i] = consensus[1][i] = consensus[2][i] = ' '; consensusString[0][i] = consensusString[1][i] = consensusString[2][i] = ' ';
} }
} }
b.append(" "); b.append(consensus[0]); b.append('\n'); b.append(" "); b.append(consensusString[0]); b.append('\n');
b.append(" "); b.append(consensus[1]); b.append('\n'); b.append(" "); b.append(consensusString[1]); b.append('\n');
b.append(" "); b.append(consensus[2]); b.append('\n'); b.append(" "); b.append(consensusString[2]); b.append('\n');
b.append(" "); b.append(consensus[3]); b.append('\n'); b.append(" "); b.append(consensusString[3]); b.append('\n');
Integer[] perm = null; Integer[] perm = null;
if ( inorder ) perm = Utils.SortPermutation(alignment_offsets); if ( inorder ) perm = Utils.SortPermutation(alignment_offsets);
@ -275,7 +299,7 @@ public class MultipleAlignment implements Iterable<Integer> {
for ( int i = 0 ; i < seqs.size() ; i++ ) { for ( int i = 0 ; i < seqs.size() ; i++ ) {
int index = (inorder ? perm[i] : i); int index = (inorder ? perm[i] : i);
frmt.format("%3d:", ext_ids.get(index)); frmt.format("%3d:", ext_ids.get(index));
skipN(alignment_offsets.get(index)+skip_first,b); skipN(alignment_offsets.get(index)+ first_offset,b);
b.append(seqs.get(index)); b.append(seqs.get(index));
b.append('\n'); b.append('\n');
} }

View File

@ -103,6 +103,9 @@ public class PileBuilder implements RecordPileReceiver {
System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ; System.out.println("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++") ;
List<MultipleAlignment> piles = doMultipleAlignment2(seqs); List<MultipleAlignment> piles = doMultipleAlignment2(seqs);
// System.out.print("Distance between final piles: "+distance(alignments1, alignments2)); // System.out.print("Distance between final piles: "+distance(alignments1, alignments2));
// System.out.print("; diameter of PILE1: "+ diameter(alignments1)); // System.out.print("; diameter of PILE1: "+ diameter(alignments1));
// System.out.println("; diameter of PILE2: "+ diameter(alignments2)); // System.out.println("; diameter of PILE2: "+ diameter(alignments2));