Data structure for counts, to isolate the user from wonky 'sometimes counts are cumulative, other times base-by-base' gotchas.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1592 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-09-11 20:53:24 +00:00
parent 7c8b17b456
commit 275707f5f6
9 changed files with 241 additions and 45 deletions

View File

@ -1,7 +1,5 @@
package org.broadinstitute.sting.bwa;
import java.util.Arrays;
/**
* Represents the Burrows-Wheeler Transform of a reference sequence.
*
@ -16,29 +14,29 @@ public class BWT {
public static final int SEQUENCE_BLOCK_SIZE = 128;
public final int inverseSA0;
public final int[] count;
public final Counts counts;
public final SequenceBlock[] sequenceBlocks;
/**
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
* @param count Cumulative count of bases, in A,C,G,T order.
* @param counts Cumulative count of bases, in A,C,G,T order.
* @param sequenceBlocks The full BWT sequence, sans the '$'.
*/
public BWT( int inverseSA0, int[] count, SequenceBlock[] sequenceBlocks ) {
public BWT( int inverseSA0, Counts counts, SequenceBlock[] sequenceBlocks ) {
this.inverseSA0 = inverseSA0;
this.count = count;
this.counts = counts;
this.sequenceBlocks = sequenceBlocks;
}
/**
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
* @param count Cumulative count of bases, in A,C,G,T order.
* @param counts Count of bases, in A,C,G,T order.
* @param sequence The full BWT sequence, sans the '$'.
*/
public BWT( int inverseSA0, int[] count, byte[] sequence ) {
this(inverseSA0,count,generateSequenceBlocks(sequence));
public BWT( int inverseSA0, Counts counts, byte[] sequence ) {
this(inverseSA0,counts,generateSequenceBlocks(sequence));
}
/**
@ -46,7 +44,7 @@ public class BWT {
* @return The full BWT string as a byte array.
*/
public byte[] getSequence() {
byte[] sequence = new byte[count[count.length-1]];
byte[] sequence = new byte[counts.getTotal()];
for( SequenceBlock block: sequenceBlocks )
System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength);
return sequence;
@ -57,7 +55,7 @@ public class BWT {
* @return Number of bases.
*/
public int length() {
return count[count.length-1];
return counts.getTotal();
}
/**
@ -66,7 +64,7 @@ public class BWT {
* @return Array of sequence blocks containing data from the sequence.
*/
private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) {
int[] occurrences = {0,0,0,0};
Counts occurrences = new Counts();
int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE);
SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks];
@ -78,10 +76,10 @@ public class BWT {
System.arraycopy(sequence,blockStart,subsequence,0,blockLength);
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,Arrays.copyOf(occurrences,occurrences.length),subsequence);
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence);
for( byte base: subsequence )
occurrences[PackUtils.packBase(base)]++;
occurrences.increment(Base.fromASCII(base));
}
return sequenceBlocks;
@ -106,7 +104,7 @@ class SequenceBlock {
/**
* Occurrences of each letter up to this sequence block.
*/
public final int[] occurrences;
public final Counts occurrences;
/**
* Sequence for this segment.
@ -120,7 +118,7 @@ class SequenceBlock {
* @param occurrences How many of each base has been seen before this sequence began.
* @param sequence The actual sequence from the BWT.
*/
SequenceBlock( int sequenceStart, int sequenceLength, int[] occurrences, byte[] sequence ) {
SequenceBlock( int sequenceStart, int sequenceLength, Counts occurrences, byte[] sequence ) {
this.sequenceStart = sequenceStart;
this.sequenceLength = sequenceLength;
this.occurrences = occurrences;

View File

@ -4,7 +4,6 @@ import org.broadinstitute.sting.utils.StingException;
import java.io.*;
import java.nio.ByteOrder;
/**
* Reads a BWT from a given file.
*
@ -60,14 +59,14 @@ public class BWTReader {
intPackedInputStream.read(occurrences);
basePackedInputStream.read(bwt);
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,occurrences,bwt);
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,new Counts(occurrences,false),bwt);
}
}
catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex);
}
return new BWT(inverseSA0, count, sequenceBlocks);
return new BWT(inverseSA0, new Counts(count,true), sequenceBlocks);
}
/**

View File

@ -40,22 +40,15 @@ public class BWTWriter {
try {
intPackedOutputStream.write(bwt.inverseSA0);
intPackedOutputStream.write(bwt.count);
intPackedOutputStream.write(bwt.counts.toArray(true));
for( SequenceBlock block: bwt.sequenceBlocks ) {
intPackedOutputStream.write(block.occurrences);
intPackedOutputStream.write(block.occurrences.toArray(false));
basePackedOutputStream.write(block.sequence);
}
// The last block is the last set of counts in the structure.
int[] finalCount = new int[bwt.count.length];
for( int i = 0; i < finalCount.length; i++ ) {
if(i != 0)
finalCount[i] = bwt.count[i]-bwt.count[i-1];
else
finalCount[i] = bwt.count[i];
}
intPackedOutputStream.write(finalCount);
intPackedOutputStream.write(bwt.counts.toArray(false));
}
catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex);

View File

@ -0,0 +1,97 @@
package org.broadinstitute.sting.bwa;
import java.util.EnumSet;
import java.util.Map;
import java.util.HashMap;
/**
* Enhanced enum representation of a base.
*
* @author mhanna
* @version 0.1
*/
public enum Base
{
A((byte)'A',0),
C((byte)'C',1),
G((byte)'G',2),
T((byte)'T',3);
/**
* The ASCII representation of a given base.
*/
private final byte ascii;
/**
* The 2-bit packed value of the base.
*/
private final int pack;
/**
* Representation of the base broken down by packed value.
*/
private static final Map<Integer,Base> basesByPack = new HashMap<Integer,Base>();
/**
* Representation of the base broken down by ASCII code.
*/
private static final Map<Byte,Base> basesByASCII = new HashMap<Byte,Base>();
static {
for(Base base : EnumSet.allOf(Base.class)) {
basesByPack.put(base.pack,base);
basesByASCII.put(base.ascii,base);
}
}
/**
* Create a new base with the given ascii representation and
* pack value.
* @param ascii ASCII representation of a given base.
* @param pack Packed value of a given base.
*/
private Base( byte ascii, int pack ) {
this.ascii = ascii;
this.pack = pack;
}
/**
* Get the given base from the packed representation.
* @param pack Packed representation.
* @return base.
*/
public static Base fromPack( int pack ) { return basesByPack.get(pack); }
/**
* Convert the given base to its packed value.
* @return Packed value.
*/
public int toPack() { return pack; }
/**
* Convert the given base to its packed value.
* @param ascii ASCII representation of the base.
* @return Packed value.
*/
public static int toPack( byte ascii ) { return basesByASCII.get(ascii).pack; }
/**
* Get the given base from the ASCII representation.
* @param ascii ASCII representation.
* @return base.
*/
public static Base fromASCII( byte ascii ) { return basesByASCII.get(ascii); }
/**
* Convert the given base to its ASCII value.
* @return ASCII value.
*/
public byte toASCII() { return ascii; }
/**
* Convert the given base to its ASCII value.
* @param pack The packed representation of the base.
* @return ASCII value.
*/
public static byte toASCII( int pack ) { return basesByPack.get(pack).ascii; }
}

View File

@ -0,0 +1,113 @@
package org.broadinstitute.sting.bwa;
import org.broadinstitute.sting.utils.StingException;
import java.util.EnumSet;
/**
* Counts of how many bases of each type have been seen.
*
* @author mhanna
* @version 0.1
*/
public class Counts implements Cloneable {
/**
* Internal representation of counts, broken down by pack value.
*/
private int[] counts = new int[EnumSet.allOf(Base.class).size()];
/**
* Create an empty Counts object with values A=0,C=0,G=0,T=0.
*/
public Counts() {}
/**
* Create a counts data structure with the given initial values.
* @param data Count data, broken down by base.
* @param cumulative Whether the counts are cumulative, (count_G=numA+numC+numG,for example).
*/
Counts( int[] data, boolean cumulative ) {
for( Base base: EnumSet.allOf(Base.class))
counts[base.toPack()] = data[base.toPack()];
// De-cumulatize data as necessary.
if(cumulative) {
for( int i = EnumSet.allOf(Base.class).size()-1; i > 0; i-- )
counts[i] -= counts[i-1];
}
}
/**
* Convert to an array for persistence.
* @param cumulative Use a cumulative representation.
* @return Array of count values.
*/
public int[] toArray(boolean cumulative) {
int[] countArray = counts.clone();
if(cumulative) {
for( int i = 1; i < counts.length; i++ )
countArray[i] += countArray[i-1];
}
return countArray;
}
/**
* Create a unique copy of the current object.
* @return A duplicate of this object.
*/
public Counts clone() {
Counts other;
try {
other = (Counts)super.clone();
}
catch(CloneNotSupportedException ex) {
throw new StingException("Unable to clone counts object", ex);
}
other.counts = new int[counts.length];
System.arraycopy(counts,0,other.counts,0,counts.length);
return other;
}
/**
* Increment the number of bases seen at the given location.
* @param base Base to increment.
*/
public void increment(Base base) {
counts[base.toPack()]++;
}
/**
* Gets a count of the number of bases seen at a given location.
* Note that counts in this case are not cumulative (counts for A,C,G,T
* are independent).
* @param base Base for which to query counts.
* @return Number of bases of this type seen.
*/
public int get(Base base) {
return counts[base.toPack()];
}
/**
* Gets a count of the number of bases of each seen type.
* Note that counts in this case are cumulative (counts for A,C,G,T
* are independent).
* @param base Base for which to query counts.
* @return Number of bases of this type seen.
*/
public int getCumulative(Base base) {
int accum = 0;
for(int i = 0; i <= base.toPack(); i++)
accum += counts[i];
return accum;
}
/**
* How many total bases are represented by this count structure?
* @return Total bases represented.
*/
public int getTotal() {
int accumulator = 0;
for( int count : counts )
accumulator += count;
return accumulator;
}
}

View File

@ -43,8 +43,6 @@ import org.broadinstitute.sting.utils.StingException;
* @version 0.1
*/
public class CreateBWTFromReference {
private static final int ALPHABET_SIZE = 4;
private String loadReference( File inputFile ) {
// Read in the first sequence in the input file
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
@ -59,15 +57,10 @@ public class CreateBWTFromReference {
return StringUtil.bytesToString(sequence.getBases());
}
private int[] countOccurrences( String sequence ) {
int occurrences[] = new int[ALPHABET_SIZE];
private Counts countOccurrences( String sequence ) {
Counts occurrences = new Counts();
for( char base: sequence.toCharArray() )
occurrences[PackUtils.packBase((byte)base)]++;
// Make occurrences cumulative
for( int i = 1; i < ALPHABET_SIZE; i++ )
occurrences[i] += occurrences[i-1];
occurrences.increment(Base.fromASCII((byte)base));
return occurrences;
}
@ -151,8 +144,11 @@ public class CreateBWTFromReference {
String reverseSequence = creator.loadReverseReference(inputFile);
// Count the occurences of each given base.
int[] occurrences = creator.countOccurrences(sequence);
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]);
Counts occurrences = creator.countOccurrences(sequence);
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Base.A),
occurrences.getCumulative(Base.C),
occurrences.getCumulative(Base.G),
occurrences.getCumulative(Base.T));
// Generate the suffix array and print diagnostics.
int[] suffixArrayData = creator.createSuffixArray(sequence);

View File

@ -8,7 +8,7 @@ package org.broadinstitute.sting.bwa;
*/
public class SuffixArray {
public final int inverseSA0;
public final int[] occurrences;
public final Counts occurrences;
public final int[] sequence;
/**
@ -17,7 +17,7 @@ public class SuffixArray {
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
* @param sequence The full suffix array.
*/
public SuffixArray(int inverseSA0, int[] occurrences, int[] sequence) {
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) {
this.inverseSA0 = inverseSA0;
this.occurrences = occurrences;
this.sequence = sequence;

View File

@ -55,7 +55,7 @@ public class SuffixArrayReader {
throw new StingException("Unable to read BWT from input stream.", ex);
}
return new SuffixArray(inverseSA0, occurrences, suffixArray);
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray);
}

View File

@ -39,7 +39,7 @@ public class SuffixArrayWriter {
try {
intPackedOutputStream.write(suffixArray.inverseSA0);
intPackedOutputStream.write(suffixArray.occurrences);
intPackedOutputStream.write(suffixArray.occurrences.toArray(true));
// How frequently the suffix array entry is placed.
intPackedOutputStream.write(1);
// Length of the suffix array.