package org.broadinstitute.sting.playground.indels; import org.broadinstitute.sting.utils.Pair; import java.util.List; import java.util.ArrayList; /** * Created by IntelliJ IDEA. * User: asivache * Date: Mar 21, 2009 * Time: 4:25:12 PM * To change this template use File | Settings | File Templates. */ public class ConsensusSequence { private List< int[] > coverage; // counts of observations of every nucleotide at each position private long referencePos;// (arbitrary) reference position; when adding sequences, their offsets should be wrt this position private int startOffset; // offset of the leftmost base of this consensus sequence wrt referencePos; negative=left, positive=right private final static int NUMBINS = 4; private final static char[] BASES = { 'A','C','G','T' }; public ConsensusSequence(int refPos) { coverage = new ArrayList< int[] >(); referencePos = refPos; startOffset = 0; } public ConsensusSequence() { this(0); } /** Adds sequence se to the consensus, in the sense that all bases of seq are counted * into observed coverage kept by the consensus. The position of the sequence is specified by the * offset with respect to the fixed reference position of the consensus (the latter does not * have to be consensus start), and if the sequence extends beyound the consensus on either end, the * consensus will be extended appropriately to accomodate the full sequence. * @param seq nucleotide sequence ('ACGT...') * @param offset position of the start of the sequence relative to the fixed reference position of the consensus */ public void addSequence(String seq, int offset) { // if sequence starts before than the currently held consensus oes, extend consensus to the left if ( offset < startOffset ) { coverage.addAll(0,instantiateCoverageList(startOffset-offset)); startOffset = offset; } // if the sequence ends beyound the currently held consensus, extend consensus to the right if ( offset + seq.length() > startOffset + coverage.size() ) { coverage.addAll( instantiateCoverageList(offset+seq.length() - startOffset - coverage.size()) ); } // count bases from the sequence into the coverage int posOnConsensus = offset - startOffset; for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) { char base = Character.toUpperCase(seq.charAt(i)); if ( base == 'N') continue; coverage.get(posOnConsensus)[baseToInt(base)]++; } } /** Removes sequence seq from the consensus. More exactly, 1 will be subtracted from current * observation counts kept by the consensus for each observed base at every position of the sequence. The * position of the sequence is specified by the offset with respect to the reference position * of the consensus. NOTE: this method is unchecked and does not verify that the sequence being subtracted * was indeed previously added to the consensus and/or that the consenus does accomodate full length of * the sequence. If it is not the case, the results can be unpredictable or assert failure may occur. * * @param seq nucleotide sequence ('ACGT...') * @param offset position of the start of the sequence relative to the fixed reference position of the consensus */ public void removeSequence(String seq, int offset) { assert offset >= startOffset : "Attempt to remove from consensus a sequence that starts prior to consenus start"; assert (offset+seq.length() < startOffset + coverage.size()) : "Attempt to remove from consensus a sequence that extends beyond consensus end"; // subtract sequence bases from the coverage int posOnConsensus = offset - startOffset; for ( int i = 0 ; i < seq.length() ; i++, posOnConsensus++ ) { char base = Character.toUpperCase(seq.charAt(i)); if ( base == 'N') continue; coverage.get(posOnConsensus)[ baseToInt(base) ]--; } } /** Returns offset of the start of consensus sequence with respect to the reference position the * consensus is pinned to. * @return */ public int getStartOffset() { return startOffset; } /** Returns the length (number of bases) of the consensus sequence. * * @return */ public int length() { return coverage.size(); } /** Returns the "distance" (score measuring the agreement) from the currently held consensus sequence to * the specified sequence seq starting at position offset wrt consenus reference position. * @param seq * @param offset * @return */ public double distance(String seq, int offset) { int posOnConsensus; // index into the currently held consensus sequence int i ; // index into the passed sequence argument if ( offset < startOffset ) { posOnConsensus = 0; i = startOffset - offset; } else { i = 0 ; posOnConsensus = offset - startOffset; } // stop position on the passed sequence (can be less than sequence length if consensus stops prematurely) int stop = Math.min(offset+seq.length(), startOffset+coverage.size() ) - offset; for ( ; i < stop ; i++, posOnConsensus++ ) { int base = baseToInt(Character.toUpperCase(seq.charAt(posOnConsensus))); int [] cov = coverage.get(posOnConsensus); int totalcov = cov[0]+cov[1]+cov[2]+cov[3]; } return 0.0; } /** Returns consensus base at the specified offset wrt the consesus sequence's reference position. * Specified offset must be within the span of currently held consensus sequence. Consensus base is the * one with the maximum count of observations. If two different nucleotides were observed exactly the * same number of times (and that number is greater than the number of observations for othe nucleotides), * the "lesser" one, (order being ACGT) will be returned. If coverage at specified position is zero, 'N' will * be returned. * @param offset * @return */ public char baseAt(int offset) { assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds"; int [] cov = coverage.get(offset-startOffset); int total_cov = cov[0] + cov[1] + cov[2] + cov[3]; int bmax = 0; char base = 'N'; for ( int z = 0; z < 4 ; z++ ) { if ( cov[z] > bmax ) { bmax = cov[z]; base = BASES[z]; } } return base; } /** Returns consensus base at the specified offset together with its observation count. * * @param offset * @return * @see #baseAt(int) */ public Pair baseWithCountAt(int offset) { assert offset >= startOffset && offset < startOffset + coverage.size() : "Offset out of bounds"; int [] cov = coverage.get(offset-startOffset); int total_cov = cov[0] + cov[1] + cov[2] + cov[3]; int bmax = 0; char base = 'N'; for ( int z = 0; z < 4 ; z++ ) { if ( cov[z] > bmax ) { bmax = cov[z]; base = BASES[z]; } } return new Pair(base,bmax); } /** Returns total coverage (all observations regardless of what base what observed) at position * specified by offset with respect to the conensus' reference position. offset does not have to be within * the bounds of the currently kept consensus sequence, if it falls outside, a 0 will be silently returned. * @param offset * @return */ public int coverageAt(int offset) { if ( offset < startOffset || offset >= startOffset + coverage.size() ) return 0; int [] cov = coverage.get(offset-startOffset); return cov[0]+cov[1]+cov[2]+cov[3]; } /** Returns consesus sequence as a astring of bases (ACGTN); N will be returned for positions with zero * coverage. * @return */ public String getSequence() { char [] b = new char[coverage.size()]; for ( int i = 0 ; i < b.length ; i++ ) { b[i] = baseAt(i+startOffset); } return new String(b); } private List instantiateCoverageList(int n) { List< int[] > subseq = new ArrayList(n); for ( int i = 0 ; i < n ; i++ ) subseq.add(new int[NUMBINS]); return subseq; } private int baseToInt(char c) { int base; switch( Character.toUpperCase(c) ) { case 'A': base = 0; break; case 'C': base = 1; break; case 'G': base = 2; break; case 'T': base = 3; break; case 'N': base = -1; break; default : throw new IllegalArgumentException("Sequence can contain only ACGTN symbols"); } return base; } }