diff --git a/java/src/org/broadinstitute/sting/bwa/BWT.java b/java/src/org/broadinstitute/sting/bwa/BWT.java index 0d2b7300f..be127cbfc 100644 --- a/java/src/org/broadinstitute/sting/bwa/BWT.java +++ b/java/src/org/broadinstitute/sting/bwa/BWT.java @@ -1,7 +1,5 @@ package org.broadinstitute.sting.bwa; -import java.util.Arrays; - /** * Represents the Burrows-Wheeler Transform of a reference sequence. * @@ -16,29 +14,29 @@ public class BWT { public static final int SEQUENCE_BLOCK_SIZE = 128; public final int inverseSA0; - public final int[] count; + public final Counts counts; public final SequenceBlock[] sequenceBlocks; /** * Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII). * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence. - * @param count Cumulative count of bases, in A,C,G,T order. + * @param counts Cumulative count of bases, in A,C,G,T order. * @param sequenceBlocks The full BWT sequence, sans the '$'. */ - public BWT( int inverseSA0, int[] count, SequenceBlock[] sequenceBlocks ) { + public BWT( int inverseSA0, Counts counts, SequenceBlock[] sequenceBlocks ) { this.inverseSA0 = inverseSA0; - this.count = count; + this.counts = counts; this.sequenceBlocks = sequenceBlocks; } /** * Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII). * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence. - * @param count Cumulative count of bases, in A,C,G,T order. + * @param counts Count of bases, in A,C,G,T order. * @param sequence The full BWT sequence, sans the '$'. */ - public BWT( int inverseSA0, int[] count, byte[] sequence ) { - this(inverseSA0,count,generateSequenceBlocks(sequence)); + public BWT( int inverseSA0, Counts counts, byte[] sequence ) { + this(inverseSA0,counts,generateSequenceBlocks(sequence)); } /** @@ -46,7 +44,7 @@ public class BWT { * @return The full BWT string as a byte array. */ public byte[] getSequence() { - byte[] sequence = new byte[count[count.length-1]]; + byte[] sequence = new byte[counts.getTotal()]; for( SequenceBlock block: sequenceBlocks ) System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength); return sequence; @@ -57,7 +55,7 @@ public class BWT { * @return Number of bases. */ public int length() { - return count[count.length-1]; + return counts.getTotal(); } /** @@ -66,7 +64,7 @@ public class BWT { * @return Array of sequence blocks containing data from the sequence. */ private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) { - int[] occurrences = {0,0,0,0}; + Counts occurrences = new Counts(); int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE); SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks]; @@ -78,10 +76,10 @@ public class BWT { System.arraycopy(sequence,blockStart,subsequence,0,blockLength); - sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,Arrays.copyOf(occurrences,occurrences.length),subsequence); + sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence); for( byte base: subsequence ) - occurrences[PackUtils.packBase(base)]++; + occurrences.increment(Base.fromASCII(base)); } return sequenceBlocks; @@ -106,7 +104,7 @@ class SequenceBlock { /** * Occurrences of each letter up to this sequence block. */ - public final int[] occurrences; + public final Counts occurrences; /** * Sequence for this segment. @@ -120,7 +118,7 @@ class SequenceBlock { * @param occurrences How many of each base has been seen before this sequence began. * @param sequence The actual sequence from the BWT. */ - SequenceBlock( int sequenceStart, int sequenceLength, int[] occurrences, byte[] sequence ) { + SequenceBlock( int sequenceStart, int sequenceLength, Counts occurrences, byte[] sequence ) { this.sequenceStart = sequenceStart; this.sequenceLength = sequenceLength; this.occurrences = occurrences; diff --git a/java/src/org/broadinstitute/sting/bwa/BWTReader.java b/java/src/org/broadinstitute/sting/bwa/BWTReader.java index 06437e72c..413a46fbb 100644 --- a/java/src/org/broadinstitute/sting/bwa/BWTReader.java +++ b/java/src/org/broadinstitute/sting/bwa/BWTReader.java @@ -4,7 +4,6 @@ import org.broadinstitute.sting.utils.StingException; import java.io.*; import java.nio.ByteOrder; - /** * Reads a BWT from a given file. * @@ -60,14 +59,14 @@ public class BWTReader { intPackedInputStream.read(occurrences); basePackedInputStream.read(bwt); - sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,occurrences,bwt); + sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,new Counts(occurrences,false),bwt); } } catch( IOException ex ) { throw new StingException("Unable to read BWT from input stream.", ex); } - return new BWT(inverseSA0, count, sequenceBlocks); + return new BWT(inverseSA0, new Counts(count,true), sequenceBlocks); } /** diff --git a/java/src/org/broadinstitute/sting/bwa/BWTWriter.java b/java/src/org/broadinstitute/sting/bwa/BWTWriter.java index 7cc32faf0..1913247d7 100644 --- a/java/src/org/broadinstitute/sting/bwa/BWTWriter.java +++ b/java/src/org/broadinstitute/sting/bwa/BWTWriter.java @@ -40,22 +40,15 @@ public class BWTWriter { try { intPackedOutputStream.write(bwt.inverseSA0); - intPackedOutputStream.write(bwt.count); + intPackedOutputStream.write(bwt.counts.toArray(true)); for( SequenceBlock block: bwt.sequenceBlocks ) { - intPackedOutputStream.write(block.occurrences); + intPackedOutputStream.write(block.occurrences.toArray(false)); basePackedOutputStream.write(block.sequence); } // The last block is the last set of counts in the structure. - int[] finalCount = new int[bwt.count.length]; - for( int i = 0; i < finalCount.length; i++ ) { - if(i != 0) - finalCount[i] = bwt.count[i]-bwt.count[i-1]; - else - finalCount[i] = bwt.count[i]; - } - intPackedOutputStream.write(finalCount); + intPackedOutputStream.write(bwt.counts.toArray(false)); } catch( IOException ex ) { throw new StingException("Unable to read BWT from input stream.", ex); diff --git a/java/src/org/broadinstitute/sting/bwa/Base.java b/java/src/org/broadinstitute/sting/bwa/Base.java new file mode 100644 index 000000000..47e8bff06 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/Base.java @@ -0,0 +1,97 @@ +package org.broadinstitute.sting.bwa; + +import java.util.EnumSet; +import java.util.Map; +import java.util.HashMap; + +/** + * Enhanced enum representation of a base. + * + * @author mhanna + * @version 0.1 + */ +public enum Base +{ + A((byte)'A',0), + C((byte)'C',1), + G((byte)'G',2), + T((byte)'T',3); + + /** + * The ASCII representation of a given base. + */ + private final byte ascii; + + /** + * The 2-bit packed value of the base. + */ + private final int pack; + + /** + * Representation of the base broken down by packed value. + */ + private static final Map basesByPack = new HashMap(); + + /** + * Representation of the base broken down by ASCII code. + */ + private static final Map basesByASCII = new HashMap(); + + static { + for(Base base : EnumSet.allOf(Base.class)) { + basesByPack.put(base.pack,base); + basesByASCII.put(base.ascii,base); + } + } + + /** + * Create a new base with the given ascii representation and + * pack value. + * @param ascii ASCII representation of a given base. + * @param pack Packed value of a given base. + */ + private Base( byte ascii, int pack ) { + this.ascii = ascii; + this.pack = pack; + } + + /** + * Get the given base from the packed representation. + * @param pack Packed representation. + * @return base. + */ + public static Base fromPack( int pack ) { return basesByPack.get(pack); } + + /** + * Convert the given base to its packed value. + * @return Packed value. + */ + public int toPack() { return pack; } + + /** + * Convert the given base to its packed value. + * @param ascii ASCII representation of the base. + * @return Packed value. + */ + public static int toPack( byte ascii ) { return basesByASCII.get(ascii).pack; } + + /** + * Get the given base from the ASCII representation. + * @param ascii ASCII representation. + * @return base. + */ + public static Base fromASCII( byte ascii ) { return basesByASCII.get(ascii); } + + /** + * Convert the given base to its ASCII value. + * @return ASCII value. + */ + public byte toASCII() { return ascii; } + + /** + * Convert the given base to its ASCII value. + * @param pack The packed representation of the base. + * @return ASCII value. + */ + public static byte toASCII( int pack ) { return basesByPack.get(pack).ascii; } +} diff --git a/java/src/org/broadinstitute/sting/bwa/Counts.java b/java/src/org/broadinstitute/sting/bwa/Counts.java new file mode 100644 index 000000000..8b8e41933 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/Counts.java @@ -0,0 +1,113 @@ +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.StingException; + +import java.util.EnumSet; +/** + * Counts of how many bases of each type have been seen. + * + * @author mhanna + * @version 0.1 + */ +public class Counts implements Cloneable { + /** + * Internal representation of counts, broken down by pack value. + */ + private int[] counts = new int[EnumSet.allOf(Base.class).size()]; + + /** + * Create an empty Counts object with values A=0,C=0,G=0,T=0. + */ + public Counts() {} + + /** + * Create a counts data structure with the given initial values. + * @param data Count data, broken down by base. + * @param cumulative Whether the counts are cumulative, (count_G=numA+numC+numG,for example). + */ + Counts( int[] data, boolean cumulative ) { + for( Base base: EnumSet.allOf(Base.class)) + counts[base.toPack()] = data[base.toPack()]; + + // De-cumulatize data as necessary. + if(cumulative) { + for( int i = EnumSet.allOf(Base.class).size()-1; i > 0; i-- ) + counts[i] -= counts[i-1]; + } + } + + /** + * Convert to an array for persistence. + * @param cumulative Use a cumulative representation. + * @return Array of count values. + */ + public int[] toArray(boolean cumulative) { + int[] countArray = counts.clone(); + if(cumulative) { + for( int i = 1; i < counts.length; i++ ) + countArray[i] += countArray[i-1]; + } + return countArray; + } + + /** + * Create a unique copy of the current object. + * @return A duplicate of this object. + */ + public Counts clone() { + Counts other; + try { + other = (Counts)super.clone(); + } + catch(CloneNotSupportedException ex) { + throw new StingException("Unable to clone counts object", ex); + } + other.counts = new int[counts.length]; + System.arraycopy(counts,0,other.counts,0,counts.length); + return other; + } + + /** + * Increment the number of bases seen at the given location. + * @param base Base to increment. + */ + public void increment(Base base) { + counts[base.toPack()]++; + } + + /** + * Gets a count of the number of bases seen at a given location. + * Note that counts in this case are not cumulative (counts for A,C,G,T + * are independent). + * @param base Base for which to query counts. + * @return Number of bases of this type seen. + */ + public int get(Base base) { + return counts[base.toPack()]; + } + + /** + * Gets a count of the number of bases of each seen type. + * Note that counts in this case are cumulative (counts for A,C,G,T + * are independent). + * @param base Base for which to query counts. + * @return Number of bases of this type seen. + */ + public int getCumulative(Base base) { + int accum = 0; + for(int i = 0; i <= base.toPack(); i++) + accum += counts[i]; + return accum; + } + + /** + * How many total bases are represented by this count structure? + * @return Total bases represented. + */ + public int getTotal() { + int accumulator = 0; + for( int count : counts ) + accumulator += count; + return accumulator; + } +} diff --git a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java index 9582ed626..a674f5205 100755 --- a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java +++ b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java @@ -43,8 +43,6 @@ import org.broadinstitute.sting.utils.StingException; * @version 0.1 */ public class CreateBWTFromReference { - private static final int ALPHABET_SIZE = 4; - private String loadReference( File inputFile ) { // Read in the first sequence in the input file ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); @@ -59,15 +57,10 @@ public class CreateBWTFromReference { return StringUtil.bytesToString(sequence.getBases()); } - private int[] countOccurrences( String sequence ) { - int occurrences[] = new int[ALPHABET_SIZE]; + private Counts countOccurrences( String sequence ) { + Counts occurrences = new Counts(); for( char base: sequence.toCharArray() ) - occurrences[PackUtils.packBase((byte)base)]++; - - // Make occurrences cumulative - for( int i = 1; i < ALPHABET_SIZE; i++ ) - occurrences[i] += occurrences[i-1]; - + occurrences.increment(Base.fromASCII((byte)base)); return occurrences; } @@ -151,8 +144,11 @@ public class CreateBWTFromReference { String reverseSequence = creator.loadReverseReference(inputFile); // Count the occurences of each given base. - int[] occurrences = creator.countOccurrences(sequence); - System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]); + Counts occurrences = creator.countOccurrences(sequence); + System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Base.A), + occurrences.getCumulative(Base.C), + occurrences.getCumulative(Base.G), + occurrences.getCumulative(Base.T)); // Generate the suffix array and print diagnostics. int[] suffixArrayData = creator.createSuffixArray(sequence); diff --git a/java/src/org/broadinstitute/sting/bwa/SuffixArray.java b/java/src/org/broadinstitute/sting/bwa/SuffixArray.java index 240c60e34..fffec0657 100644 --- a/java/src/org/broadinstitute/sting/bwa/SuffixArray.java +++ b/java/src/org/broadinstitute/sting/bwa/SuffixArray.java @@ -8,7 +8,7 @@ package org.broadinstitute.sting.bwa; */ public class SuffixArray { public final int inverseSA0; - public final int[] occurrences; + public final Counts occurrences; public final int[] sequence; /** @@ -17,7 +17,7 @@ public class SuffixArray { * @param occurrences Cumulative number of occurrences of A,C,G,T, in order. * @param sequence The full suffix array. */ - public SuffixArray(int inverseSA0, int[] occurrences, int[] sequence) { + public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) { this.inverseSA0 = inverseSA0; this.occurrences = occurrences; this.sequence = sequence; diff --git a/java/src/org/broadinstitute/sting/bwa/SuffixArrayReader.java b/java/src/org/broadinstitute/sting/bwa/SuffixArrayReader.java index fe07b0a03..591f4125a 100644 --- a/java/src/org/broadinstitute/sting/bwa/SuffixArrayReader.java +++ b/java/src/org/broadinstitute/sting/bwa/SuffixArrayReader.java @@ -55,7 +55,7 @@ public class SuffixArrayReader { throw new StingException("Unable to read BWT from input stream.", ex); } - return new SuffixArray(inverseSA0, occurrences, suffixArray); + return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray); } diff --git a/java/src/org/broadinstitute/sting/bwa/SuffixArrayWriter.java b/java/src/org/broadinstitute/sting/bwa/SuffixArrayWriter.java index 47d40f7ad..edc3a0821 100644 --- a/java/src/org/broadinstitute/sting/bwa/SuffixArrayWriter.java +++ b/java/src/org/broadinstitute/sting/bwa/SuffixArrayWriter.java @@ -39,7 +39,7 @@ public class SuffixArrayWriter { try { intPackedOutputStream.write(suffixArray.inverseSA0); - intPackedOutputStream.write(suffixArray.occurrences); + intPackedOutputStream.write(suffixArray.occurrences.toArray(true)); // How frequently the suffix array entry is placed. intPackedOutputStream.write(1); // Length of the suffix array.