Walks along the sequence and emits a sequence of subsequent, encoded Kmers (uses short int, so currently it's up to K=8)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@68 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-03-16 21:54:45 +00:00
parent 34d9af4702
commit c68e0cc1fe
1 changed files with 92 additions and 0 deletions

View File

@ -0,0 +1,92 @@
package org.broadinstitute.sting.indels;
import java.util.Iterator;
public class KmerCodeIterator implements Iterator<Short> {
private byte[] m_seq;
private int m_last_offset; // offset of the last base to be added to current code when the next() shift occurs
private short m_code;
private short m_mask; // used to mask out all bits in m_code except the lowest 2*K bits used for kmers
public KmerCodeIterator(String s, int K) {
assert K <= 8 : "Currently only Kmers of length K <=8 are supported";
assert K > 0 : "Kmer length must be positive";
m_seq = s.getBytes();
m_last_offset = K-1;
m_mask = 0;
for ( int i = 0 ; i < K ; i++ ) {
m_mask <<= 2;
m_mask |= 0x03;
}
if ( K <= m_seq.length ) m_code = kmerCode(m_seq, 0, m_last_offset );
// m_code now contains first K-1 bases encoded (the last, K-th base will be added when next() is called
}
@Override
public boolean hasNext() {
return m_last_offset < m_seq.length ;
}
@Override
public Short next() {
m_code <<= 2;
m_code |= toBaseCode(m_seq[m_last_offset]);
m_code &= m_mask;
m_last_offset++;
return m_code;
}
@Override
public void remove() {
// TODO Auto-generated method stub
}
/**
* Converts base letter (character passed as byte: 'A', 'C', 'T', or 'G', case-insensitive) into
* numerical code according to A=0, C=1, G=2, T=3 (i.e. two bits per nucleotide)
* @param b
* @return
*/
private byte toBaseCode(byte b) {
// the following transformation is performed in the lines below:
// A,a->0, C,c->1, G,g->3, T,t->2; (we rely on the ASCII codes here!)
b >>= 1;
b &= 0x03;
// to get conventional base codes (A=0, C=1, G=2, T=3), we need to flip 3 (G) and 2 (T).
// In order to do that we xor the lowest bit
// with the second one (the latter is 1 for 2 and 3, but 0 for 0 and 1, so A anC will not be affected)
b ^= (b >> 1);
return b;
}
/** Returns compact code that uniquely represents nucleotide sequence specified
* as an array of character codes (bytes) such as {'A','C','G','G','T',...}. Case insensitive.
* Sequence can not be longer than 8 nucleotides.
* @param s Nucleotide sequence
* @return unique code
*/
// private short kmerCode(byte [] s) { return kmerCode(s, 0, s.length); }
/** Returns compact code that uniquely represents nucleotide sequence found in the interval
* [start,stop) of the specified array of character codes (bytes) such as {'A','C','G','G','T',...}. Case insensitive.
* Sequence can not be longer than 8 nucleotides.
* @param s Nucleotide sequence
* @param start index of the start position of the sub-sequence to be encoded
* @param stop next position after the last element of the sub-sequence to be encoded
* @return unique code
*/
private short kmerCode(byte [] s, int start, int stop) {
assert start <= stop : "Start position of the subsequence can not be greater than the stop position";
assert start >= 0 : "Negative subsequence start positions are not allowed";
assert stop <= s.length : "Stop position of the subsequence can not extend beyond the sequence end";
short code = 0;
for ( int i = start ; i < stop ; i++ ) {
code <<= 2;
code |= toBaseCode(s[i]);
}
return code;
}
}