Data structure for counts, to isolate the user from wonky 'sometimes counts are cumulative, other times base-by-base' gotchas.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1592 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7c8b17b456
commit
275707f5f6
|
|
@ -1,7 +1,5 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Represents the Burrows-Wheeler Transform of a reference sequence.
|
||||
*
|
||||
|
|
@ -16,29 +14,29 @@ public class BWT {
|
|||
public static final int SEQUENCE_BLOCK_SIZE = 128;
|
||||
|
||||
public final int inverseSA0;
|
||||
public final int[] count;
|
||||
public final Counts counts;
|
||||
public final SequenceBlock[] sequenceBlocks;
|
||||
|
||||
/**
|
||||
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
|
||||
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||
* @param count Cumulative count of bases, in A,C,G,T order.
|
||||
* @param counts Cumulative count of bases, in A,C,G,T order.
|
||||
* @param sequenceBlocks The full BWT sequence, sans the '$'.
|
||||
*/
|
||||
public BWT( int inverseSA0, int[] count, SequenceBlock[] sequenceBlocks ) {
|
||||
public BWT( int inverseSA0, Counts counts, SequenceBlock[] sequenceBlocks ) {
|
||||
this.inverseSA0 = inverseSA0;
|
||||
this.count = count;
|
||||
this.counts = counts;
|
||||
this.sequenceBlocks = sequenceBlocks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
|
||||
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||
* @param count Cumulative count of bases, in A,C,G,T order.
|
||||
* @param counts Count of bases, in A,C,G,T order.
|
||||
* @param sequence The full BWT sequence, sans the '$'.
|
||||
*/
|
||||
public BWT( int inverseSA0, int[] count, byte[] sequence ) {
|
||||
this(inverseSA0,count,generateSequenceBlocks(sequence));
|
||||
public BWT( int inverseSA0, Counts counts, byte[] sequence ) {
|
||||
this(inverseSA0,counts,generateSequenceBlocks(sequence));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -46,7 +44,7 @@ public class BWT {
|
|||
* @return The full BWT string as a byte array.
|
||||
*/
|
||||
public byte[] getSequence() {
|
||||
byte[] sequence = new byte[count[count.length-1]];
|
||||
byte[] sequence = new byte[counts.getTotal()];
|
||||
for( SequenceBlock block: sequenceBlocks )
|
||||
System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength);
|
||||
return sequence;
|
||||
|
|
@ -57,7 +55,7 @@ public class BWT {
|
|||
* @return Number of bases.
|
||||
*/
|
||||
public int length() {
|
||||
return count[count.length-1];
|
||||
return counts.getTotal();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -66,7 +64,7 @@ public class BWT {
|
|||
* @return Array of sequence blocks containing data from the sequence.
|
||||
*/
|
||||
private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) {
|
||||
int[] occurrences = {0,0,0,0};
|
||||
Counts occurrences = new Counts();
|
||||
|
||||
int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE);
|
||||
SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks];
|
||||
|
|
@ -78,10 +76,10 @@ public class BWT {
|
|||
|
||||
System.arraycopy(sequence,blockStart,subsequence,0,blockLength);
|
||||
|
||||
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,Arrays.copyOf(occurrences,occurrences.length),subsequence);
|
||||
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence);
|
||||
|
||||
for( byte base: subsequence )
|
||||
occurrences[PackUtils.packBase(base)]++;
|
||||
occurrences.increment(Base.fromASCII(base));
|
||||
}
|
||||
|
||||
return sequenceBlocks;
|
||||
|
|
@ -106,7 +104,7 @@ class SequenceBlock {
|
|||
/**
|
||||
* Occurrences of each letter up to this sequence block.
|
||||
*/
|
||||
public final int[] occurrences;
|
||||
public final Counts occurrences;
|
||||
|
||||
/**
|
||||
* Sequence for this segment.
|
||||
|
|
@ -120,7 +118,7 @@ class SequenceBlock {
|
|||
* @param occurrences How many of each base has been seen before this sequence began.
|
||||
* @param sequence The actual sequence from the BWT.
|
||||
*/
|
||||
SequenceBlock( int sequenceStart, int sequenceLength, int[] occurrences, byte[] sequence ) {
|
||||
SequenceBlock( int sequenceStart, int sequenceLength, Counts occurrences, byte[] sequence ) {
|
||||
this.sequenceStart = sequenceStart;
|
||||
this.sequenceLength = sequenceLength;
|
||||
this.occurrences = occurrences;
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ import org.broadinstitute.sting.utils.StingException;
|
|||
|
||||
import java.io.*;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Reads a BWT from a given file.
|
||||
*
|
||||
|
|
@ -60,14 +59,14 @@ public class BWTReader {
|
|||
intPackedInputStream.read(occurrences);
|
||||
basePackedInputStream.read(bwt);
|
||||
|
||||
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,occurrences,bwt);
|
||||
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,new Counts(occurrences,false),bwt);
|
||||
}
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
|
||||
return new BWT(inverseSA0, count, sequenceBlocks);
|
||||
return new BWT(inverseSA0, new Counts(count,true), sequenceBlocks);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -40,22 +40,15 @@ public class BWTWriter {
|
|||
|
||||
try {
|
||||
intPackedOutputStream.write(bwt.inverseSA0);
|
||||
intPackedOutputStream.write(bwt.count);
|
||||
intPackedOutputStream.write(bwt.counts.toArray(true));
|
||||
|
||||
for( SequenceBlock block: bwt.sequenceBlocks ) {
|
||||
intPackedOutputStream.write(block.occurrences);
|
||||
intPackedOutputStream.write(block.occurrences.toArray(false));
|
||||
basePackedOutputStream.write(block.sequence);
|
||||
}
|
||||
|
||||
// The last block is the last set of counts in the structure.
|
||||
int[] finalCount = new int[bwt.count.length];
|
||||
for( int i = 0; i < finalCount.length; i++ ) {
|
||||
if(i != 0)
|
||||
finalCount[i] = bwt.count[i]-bwt.count[i-1];
|
||||
else
|
||||
finalCount[i] = bwt.count[i];
|
||||
}
|
||||
intPackedOutputStream.write(finalCount);
|
||||
intPackedOutputStream.write(bwt.counts.toArray(false));
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,97 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Enhanced enum representation of a base.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public enum Base
|
||||
{
|
||||
A((byte)'A',0),
|
||||
C((byte)'C',1),
|
||||
G((byte)'G',2),
|
||||
T((byte)'T',3);
|
||||
|
||||
/**
|
||||
* The ASCII representation of a given base.
|
||||
*/
|
||||
private final byte ascii;
|
||||
|
||||
/**
|
||||
* The 2-bit packed value of the base.
|
||||
*/
|
||||
private final int pack;
|
||||
|
||||
/**
|
||||
* Representation of the base broken down by packed value.
|
||||
*/
|
||||
private static final Map<Integer,Base> basesByPack = new HashMap<Integer,Base>();
|
||||
|
||||
/**
|
||||
* Representation of the base broken down by ASCII code.
|
||||
*/
|
||||
private static final Map<Byte,Base> basesByASCII = new HashMap<Byte,Base>();
|
||||
|
||||
static {
|
||||
for(Base base : EnumSet.allOf(Base.class)) {
|
||||
basesByPack.put(base.pack,base);
|
||||
basesByASCII.put(base.ascii,base);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new base with the given ascii representation and
|
||||
* pack value.
|
||||
* @param ascii ASCII representation of a given base.
|
||||
* @param pack Packed value of a given base.
|
||||
*/
|
||||
private Base( byte ascii, int pack ) {
|
||||
this.ascii = ascii;
|
||||
this.pack = pack;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the given base from the packed representation.
|
||||
* @param pack Packed representation.
|
||||
* @return base.
|
||||
*/
|
||||
public static Base fromPack( int pack ) { return basesByPack.get(pack); }
|
||||
|
||||
/**
|
||||
* Convert the given base to its packed value.
|
||||
* @return Packed value.
|
||||
*/
|
||||
public int toPack() { return pack; }
|
||||
|
||||
/**
|
||||
* Convert the given base to its packed value.
|
||||
* @param ascii ASCII representation of the base.
|
||||
* @return Packed value.
|
||||
*/
|
||||
public static int toPack( byte ascii ) { return basesByASCII.get(ascii).pack; }
|
||||
|
||||
/**
|
||||
* Get the given base from the ASCII representation.
|
||||
* @param ascii ASCII representation.
|
||||
* @return base.
|
||||
*/
|
||||
public static Base fromASCII( byte ascii ) { return basesByASCII.get(ascii); }
|
||||
|
||||
/**
|
||||
* Convert the given base to its ASCII value.
|
||||
* @return ASCII value.
|
||||
*/
|
||||
public byte toASCII() { return ascii; }
|
||||
|
||||
/**
|
||||
* Convert the given base to its ASCII value.
|
||||
* @param pack The packed representation of the base.
|
||||
* @return ASCII value.
|
||||
*/
|
||||
public static byte toASCII( int pack ) { return basesByPack.get(pack).ascii; }
|
||||
}
|
||||
|
|
@ -0,0 +1,113 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.EnumSet;
|
||||
/**
|
||||
* Counts of how many bases of each type have been seen.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class Counts implements Cloneable {
|
||||
/**
|
||||
* Internal representation of counts, broken down by pack value.
|
||||
*/
|
||||
private int[] counts = new int[EnumSet.allOf(Base.class).size()];
|
||||
|
||||
/**
|
||||
* Create an empty Counts object with values A=0,C=0,G=0,T=0.
|
||||
*/
|
||||
public Counts() {}
|
||||
|
||||
/**
|
||||
* Create a counts data structure with the given initial values.
|
||||
* @param data Count data, broken down by base.
|
||||
* @param cumulative Whether the counts are cumulative, (count_G=numA+numC+numG,for example).
|
||||
*/
|
||||
Counts( int[] data, boolean cumulative ) {
|
||||
for( Base base: EnumSet.allOf(Base.class))
|
||||
counts[base.toPack()] = data[base.toPack()];
|
||||
|
||||
// De-cumulatize data as necessary.
|
||||
if(cumulative) {
|
||||
for( int i = EnumSet.allOf(Base.class).size()-1; i > 0; i-- )
|
||||
counts[i] -= counts[i-1];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert to an array for persistence.
|
||||
* @param cumulative Use a cumulative representation.
|
||||
* @return Array of count values.
|
||||
*/
|
||||
public int[] toArray(boolean cumulative) {
|
||||
int[] countArray = counts.clone();
|
||||
if(cumulative) {
|
||||
for( int i = 1; i < counts.length; i++ )
|
||||
countArray[i] += countArray[i-1];
|
||||
}
|
||||
return countArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a unique copy of the current object.
|
||||
* @return A duplicate of this object.
|
||||
*/
|
||||
public Counts clone() {
|
||||
Counts other;
|
||||
try {
|
||||
other = (Counts)super.clone();
|
||||
}
|
||||
catch(CloneNotSupportedException ex) {
|
||||
throw new StingException("Unable to clone counts object", ex);
|
||||
}
|
||||
other.counts = new int[counts.length];
|
||||
System.arraycopy(counts,0,other.counts,0,counts.length);
|
||||
return other;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the number of bases seen at the given location.
|
||||
* @param base Base to increment.
|
||||
*/
|
||||
public void increment(Base base) {
|
||||
counts[base.toPack()]++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a count of the number of bases seen at a given location.
|
||||
* Note that counts in this case are not cumulative (counts for A,C,G,T
|
||||
* are independent).
|
||||
* @param base Base for which to query counts.
|
||||
* @return Number of bases of this type seen.
|
||||
*/
|
||||
public int get(Base base) {
|
||||
return counts[base.toPack()];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a count of the number of bases of each seen type.
|
||||
* Note that counts in this case are cumulative (counts for A,C,G,T
|
||||
* are independent).
|
||||
* @param base Base for which to query counts.
|
||||
* @return Number of bases of this type seen.
|
||||
*/
|
||||
public int getCumulative(Base base) {
|
||||
int accum = 0;
|
||||
for(int i = 0; i <= base.toPack(); i++)
|
||||
accum += counts[i];
|
||||
return accum;
|
||||
}
|
||||
|
||||
/**
|
||||
* How many total bases are represented by this count structure?
|
||||
* @return Total bases represented.
|
||||
*/
|
||||
public int getTotal() {
|
||||
int accumulator = 0;
|
||||
for( int count : counts )
|
||||
accumulator += count;
|
||||
return accumulator;
|
||||
}
|
||||
}
|
||||
|
|
@ -43,8 +43,6 @@ import org.broadinstitute.sting.utils.StingException;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class CreateBWTFromReference {
|
||||
private static final int ALPHABET_SIZE = 4;
|
||||
|
||||
private String loadReference( File inputFile ) {
|
||||
// Read in the first sequence in the input file
|
||||
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
|
||||
|
|
@ -59,15 +57,10 @@ public class CreateBWTFromReference {
|
|||
return StringUtil.bytesToString(sequence.getBases());
|
||||
}
|
||||
|
||||
private int[] countOccurrences( String sequence ) {
|
||||
int occurrences[] = new int[ALPHABET_SIZE];
|
||||
private Counts countOccurrences( String sequence ) {
|
||||
Counts occurrences = new Counts();
|
||||
for( char base: sequence.toCharArray() )
|
||||
occurrences[PackUtils.packBase((byte)base)]++;
|
||||
|
||||
// Make occurrences cumulative
|
||||
for( int i = 1; i < ALPHABET_SIZE; i++ )
|
||||
occurrences[i] += occurrences[i-1];
|
||||
|
||||
occurrences.increment(Base.fromASCII((byte)base));
|
||||
return occurrences;
|
||||
}
|
||||
|
||||
|
|
@ -151,8 +144,11 @@ public class CreateBWTFromReference {
|
|||
String reverseSequence = creator.loadReverseReference(inputFile);
|
||||
|
||||
// Count the occurences of each given base.
|
||||
int[] occurrences = creator.countOccurrences(sequence);
|
||||
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]);
|
||||
Counts occurrences = creator.countOccurrences(sequence);
|
||||
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Base.A),
|
||||
occurrences.getCumulative(Base.C),
|
||||
occurrences.getCumulative(Base.G),
|
||||
occurrences.getCumulative(Base.T));
|
||||
|
||||
// Generate the suffix array and print diagnostics.
|
||||
int[] suffixArrayData = creator.createSuffixArray(sequence);
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ package org.broadinstitute.sting.bwa;
|
|||
*/
|
||||
public class SuffixArray {
|
||||
public final int inverseSA0;
|
||||
public final int[] occurrences;
|
||||
public final Counts occurrences;
|
||||
public final int[] sequence;
|
||||
|
||||
/**
|
||||
|
|
@ -17,7 +17,7 @@ public class SuffixArray {
|
|||
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
|
||||
* @param sequence The full suffix array.
|
||||
*/
|
||||
public SuffixArray(int inverseSA0, int[] occurrences, int[] sequence) {
|
||||
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) {
|
||||
this.inverseSA0 = inverseSA0;
|
||||
this.occurrences = occurrences;
|
||||
this.sequence = sequence;
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ public class SuffixArrayReader {
|
|||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
|
||||
return new SuffixArray(inverseSA0, occurrences, suffixArray);
|
||||
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ public class SuffixArrayWriter {
|
|||
|
||||
try {
|
||||
intPackedOutputStream.write(suffixArray.inverseSA0);
|
||||
intPackedOutputStream.write(suffixArray.occurrences);
|
||||
intPackedOutputStream.write(suffixArray.occurrences.toArray(true));
|
||||
// How frequently the suffix array entry is placed.
|
||||
intPackedOutputStream.write(1);
|
||||
// Length of the suffix array.
|
||||
|
|
|
|||
Loading…
Reference in New Issue