Walks along the sequence and emits a sequence of subsequent, encoded Kmers (uses short int, so currently it's up to K=8)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@68 348d0f76-0448-11de-a6fe-93d51630548a
2009-03-16 21:54:45 +00:00 · 2009-03-16 21:54:45 +00:00 · c68e0cc1fe
parent 34d9af4702
commit c68e0cc1fe
1 changed files with 92 additions and 0 deletions
--- a/playground/java/src/org/broadinstitute/sting/indels/KmerCodeIterator.java
+++ b/playground/java/src/org/broadinstitute/sting/indels/KmerCodeIterator.java
@ -0,0 +1,92 @@
+package org.broadinstitute.sting.indels;
+
+import java.util.Iterator;
+
+public class KmerCodeIterator implements Iterator<Short> {
+	private byte[] m_seq;
+	private int m_last_offset; // offset of the last base to be added to current code when the next() shift occurs
+	private short m_code;
+	private short m_mask; // used to mask out all bits in m_code except the lowest 2*K bits used for kmers
+	
+	public KmerCodeIterator(String s, int K) {
+		assert K <= 8 : "Currently only Kmers of length K <=8 are supported";
+		assert K > 0 : "Kmer length must be positive";
+		m_seq = s.getBytes();
+		m_last_offset = K-1;
+		m_mask = 0; 
+		for ( int i = 0 ; i < K ; i++ ) {
+			m_mask <<= 2; 
+			m_mask |= 0x03;
+		}
+		if ( K <= m_seq.length ) m_code = kmerCode(m_seq, 0, m_last_offset );
+		// m_code now contains first K-1 bases encoded (the last, K-th base will be added when next() is called
+	}
+	
+	@Override
+	public boolean hasNext()  {
+		return m_last_offset <  m_seq.length ;
+	}
+		
+	@Override
+	public Short next() {
+		m_code <<= 2;
+		m_code |= toBaseCode(m_seq[m_last_offset]);
+		m_code &= m_mask;
+		m_last_offset++;
+		return m_code;
+	}
+
+	@Override
+	public void remove() {
+		// TODO Auto-generated method stub
+		
+	}
+
+	/**
+	 * Converts base letter (character passed as byte: 'A', 'C', 'T', or 'G', case-insensitive) into 
+	 * numerical code according to A=0, C=1, G=2, T=3 (i.e. two bits per nucleotide)
+	 * @param b
+	 * @return
+	 */
+	private byte toBaseCode(byte b) {
+		// the following transformation is performed in the lines below:
+		// A,a->0, C,c->1, G,g->3, T,t->2; (we rely on the ASCII codes here!)
+		b >>= 1;
+		b &= 0x03;
+		// to get conventional base codes (A=0, C=1, G=2, T=3), we need to flip 3 (G) and 2 (T). 
+		// In order to do that we xor the lowest bit
+		// with the second one (the latter is 1 for 2 and 3, but 0 for 0 and 1, so A anC will not be affected)
+		b  ^= (b >> 1);
+		return  b;
+	}
+	
+	/** Returns compact code that uniquely represents nucleotide sequence specified
+	 *  as an array of character codes (bytes) such as {'A','C','G','G','T',...}. Case insensitive.
+	 *  Sequence can not be longer than 8 nucleotides. 
+	 * @param s Nucleotide sequence
+	 * @return unique code
+	 */
+//	private short kmerCode(byte [] s) { return kmerCode(s, 0, s.length); }
+	
+	/** Returns compact code that uniquely represents nucleotide sequence found in the interval
+	 * [start,stop) of the specified  array of character codes (bytes) such as {'A','C','G','G','T',...}. Case insensitive.
+	 *  Sequence can not be longer than 8 nucleotides. 
+	 * @param s Nucleotide sequence
+	 * @param start index of the start position of the sub-sequence to be encoded
+	 * @param stop next position after the last element of the sub-sequence to be encoded
+	 * @return unique code
+	 */
+	private short kmerCode(byte [] s, int start, int stop) {
+		assert start <= stop : "Start position of the subsequence can not be greater than the stop position";
+		assert start >= 0 : "Negative subsequence start positions are not allowed";
+		assert stop <= s.length : "Stop position of the subsequence can not extend beyond the sequence end";
+		short code = 0;
+		for ( int i = start ; i < stop ; i++ ) {
+			code  <<= 2;
+			code |= toBaseCode(s[i]);
+		}
+		return code;
+	}
+
+
+}