diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java b/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java index 8d6c6d086..97e888296 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java @@ -47,13 +47,15 @@ public class AlignerTestHarness { for(SAMRecord read: reader) { count++; - //if( count > 100000 ) break; + if( count > 100000 ) break; //if( count < 366000 ) continue; //if( count != 2 ) continue; //if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") ) // continue; //if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") ) // continue; + //if( !read.getReadName().endsWith("SL-XBC:1:60:1342:1340#0") ) + // continue; SAMRecord alignmentCleaned = null; try { diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java index b05d2a2a8..115fb08f8 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java @@ -159,7 +159,7 @@ public class BWAAligner implements Aligner { lowerBounds.get(alignment.position+1).width, alignment.loBound, alignment.hiBound); - */ + */ // Temporary -- look ahead to see if the next alignment is bounded. boolean allowDifferences = mismatches > 0; @@ -240,11 +240,11 @@ public class BWAAligner implements Aligner { private List createMatchedAlignments( BWT bwt, BWAAlignment alignment, byte[] bases, boolean allowMismatch ) { List newAlignments = new ArrayList(); - List baseChoices = new ArrayList(); - Base thisBase = Base.fromASCII(bases[alignment.position+1]); + List baseChoices = new ArrayList(); + Byte thisBase = Bases.fromASCII(bases[alignment.position+1]); if( allowMismatch ) - baseChoices.addAll(EnumSet.allOf(Base.class)); + baseChoices.addAll(Bases.allOf()); else baseChoices.add(thisBase); @@ -258,7 +258,7 @@ public class BWAAligner implements Aligner { } } - for(Base base: baseChoices) { + for(byte base: baseChoices) { BWAAlignment newAlignment = alignment.clone(); newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; @@ -270,7 +270,7 @@ public class BWAAligner implements Aligner { newAlignment.position++; newAlignment.addState(AlignmentState.MATCH_MISMATCH); - if( base.toASCII() != bases[newAlignment.position] ) + if( Bases.fromASCII(bases[newAlignment.position]) == null || base != Bases.fromASCII(bases[newAlignment.position]) ) newAlignment.mismatches++; newAlignments.add(newAlignment); @@ -300,7 +300,7 @@ public class BWAAligner implements Aligner { */ private List createDeletionAlignments( BWT bwt, BWAAlignment alignment) { List newAlignments = new ArrayList(); - for(Base base: EnumSet.allOf(Base.class)) { + for(byte base: Bases.instance) { BWAAlignment newAlignment = alignment.clone(); newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; @@ -326,7 +326,7 @@ public class BWAAligner implements Aligner { */ private void exactMatch( BWAAlignment alignment, byte[] bases, BWT bwt ) { while( ++alignment.position < bases.length ) { - Base base = Base.fromASCII(bases[alignment.position]); + byte base = Bases.fromASCII(bases[alignment.position]); alignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; alignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound); if( alignment.loBound > alignment.hiBound ) diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/LowerBound.java b/java/src/org/broadinstitute/sting/alignment/bwa/LowerBound.java index cd091b69a..be0ddc1e8 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/LowerBound.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/LowerBound.java @@ -3,7 +3,7 @@ package org.broadinstitute.sting.alignment.bwa; import java.util.List; import java.util.ArrayList; -import org.broadinstitute.sting.alignment.bwa.bwt.Base; +import org.broadinstitute.sting.alignment.bwa.bwt.Bases; import org.broadinstitute.sting.alignment.bwa.bwt.BWT; /** @@ -53,7 +53,7 @@ public class LowerBound { int loIndex = 0, hiIndex = bwt.length(), mismatches = 0; for( int i = bases.length-1; i >= 0; i-- ) { - Base base = Base.fromASCII(bases[i]); + Byte base = Bases.fromASCII(bases[i]); // Ignore non-ACGT bases. if( base != null ) { diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/BWT.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/BWT.java index 31754d379..292319e3f 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/BWT.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/BWT.java @@ -68,11 +68,8 @@ public class BWT { * @param base The base. * @return Total counts for all bases lexicographically smaller than this base. */ - public int counts(Base base) { - if( base.toPack() - 1 >= 0 ) - return counts.getCumulative(Base.fromPack(base.toPack()-1)); - else - return 0; + public int counts(byte base) { + return counts.getCumulative(base); } /** @@ -81,7 +78,7 @@ public class BWT { * @param index The position to search within the BWT. * @return Total counts for all bases lexicographically smaller than this base. */ - public int occurrences(Base base,int index) { + public int occurrences(byte base,int index) { // If the index is above the SA-1[0], remap it to the appropriate coordinate space. if( index > inverseSA0 ) index--; @@ -89,7 +86,7 @@ public class BWT { int position = index % SEQUENCE_BLOCK_SIZE; int accumulator = block.occurrences.get(base); for(int i = 0; i <= position; i++) { - if(base == Base.fromASCII(block.sequence[i])) + if(base == block.sequence[i]) accumulator++; } return accumulator; @@ -124,7 +121,7 @@ public class BWT { sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence); for( byte base: subsequence ) - occurrences.increment(Base.fromASCII(base)); + occurrences.increment(base); } return sequenceBlocks; diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Base.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Base.java deleted file mode 100644 index f3669864f..000000000 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Base.java +++ /dev/null @@ -1,97 +0,0 @@ -package org.broadinstitute.sting.alignment.bwa.bwt; - -import java.util.EnumSet; -import java.util.Map; -import java.util.HashMap; - -/** - * Enhanced enum representation of a base. - * - * @author mhanna - * @version 0.1 - */ -public enum Base -{ - A((byte)'A',0), - C((byte)'C',1), - G((byte)'G',2), - T((byte)'T',3); - - /** - * The ASCII representation of a given base. - */ - private final byte ascii; - - /** - * The 2-bit packed value of the base. - */ - private final int pack; - - /** - * Representation of the base broken down by packed value. - */ - private static final Map basesByPack = new HashMap(); - - /** - * Representation of the base broken down by ASCII code. - */ - private static final Map basesByASCII = new HashMap(); - - static { - for(Base base : EnumSet.allOf(Base.class)) { - basesByPack.put(base.pack,base); - basesByASCII.put(base.ascii,base); - } - } - - /** - * Create a new base with the given ascii representation and - * pack value. - * @param ascii ASCII representation of a given base. - * @param pack Packed value of a given base. - */ - private Base( byte ascii, int pack ) { - this.ascii = ascii; - this.pack = pack; - } - - /** - * Get the given base from the packed representation. - * @param pack Packed representation. - * @return base. - */ - public static Base fromPack( int pack ) { return basesByPack.get(pack); } - - /** - * Convert the given base to its packed value. - * @return Packed value. - */ - public int toPack() { return pack; } - - /** - * Convert the given base to its packed value. - * @param ascii ASCII representation of the base. - * @return Packed value. - */ - public static int toPack( byte ascii ) { return basesByASCII.get(ascii).pack; } - - /** - * Get the given base from the ASCII representation. - * @param ascii ASCII representation. - * @return base. - */ - public static Base fromASCII( byte ascii ) { return basesByASCII.get(ascii); } - - /** - * Convert the given base to its ASCII value. - * @return ASCII value. - */ - public byte toASCII() { return ascii; } - - /** - * Convert the given base to its ASCII value. - * @param pack The packed representation of the base. - * @return ASCII value. - */ - public static byte toASCII( int pack ) { return basesByPack.get(pack).ascii; } -} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Bases.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Bases.java new file mode 100644 index 000000000..9ea69de52 --- /dev/null +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Bases.java @@ -0,0 +1,108 @@ +package org.broadinstitute.sting.alignment.bwa.bwt; + +import org.broadinstitute.sting.utils.StingException; + +import java.util.*; + +/** + * Enhanced enum representation of a base. + * + * @author mhanna + * @version 0.1 + */ +public class Bases implements Iterable +{ + public static byte A = 'A'; + public static byte C = 'C'; + public static byte G = 'G'; + public static byte T = 'T'; + + public static final Bases instance = new Bases(); + + private static final List allBases; + + /** + * Representation of the base broken down by packed value. + */ + private static final Map basesByPack = new HashMap(); + + static { + List bases = new ArrayList(); + bases.add(A); + bases.add(C); + bases.add(G); + bases.add(T); + allBases = Collections.unmodifiableList(bases); + + for(int i = 0; i < allBases.size(); i++) + basesByPack.put(i,allBases.get(i)); + } + + /** + * Create a new base with the given ascii representation and + * pack value. + */ + private Bases() { + } + + /** + * Return all possible bases. + * @return Byte representation of all bases. + */ + public static Collection allOf() { + return allBases; + } + + /** + * Gets the number of known bases. + * @return The number of known bases. + */ + public static int size() { + return allBases.size(); + } + + /** + * Gets an iterator over the total number of known base types. + * @return Iterator over all known bases. + */ + public Iterator iterator() { + return basesByPack.values().iterator(); + } + + /** + * Get the given base from the packed representation. + * @param pack Packed representation. + * @return base. + */ + public static byte fromPack( int pack ) { return basesByPack.get(pack); } + + /** + * Convert the given base to its packed value. + * @param ascii ASCII representation of the base. + * @return Packed value. + */ + public static int toPack( byte ascii ) + { + for( Map.Entry entry: basesByPack.entrySet() ) { + if( entry.getValue().equals(ascii) ) + return entry.getKey(); + } + throw new StingException(String.format("Base %c is an invalid base to pack", (char)ascii)); + } + + /** + * Convert the ASCII representation of a base to its 'normalized' representation. + * @param base The base itself. + * @return The byte, if present. Null if unknown. + */ + public static Byte fromASCII( byte base ) { + Byte found = null; + for( Byte normalized: allBases ) { + if( normalized.equals(base) ) { + found = normalized; + break; + } + } + return found; + } +} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Counts.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Counts.java index a92fb30f5..ece0cfb9e 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Counts.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Counts.java @@ -2,7 +2,9 @@ package org.broadinstitute.sting.alignment.bwa.bwt; import org.broadinstitute.sting.utils.StingException; -import java.util.EnumSet; +import java.util.HashMap; +import java.util.Map; + /** * Counts of how many bases of each type have been seen. * @@ -13,7 +15,7 @@ public class Counts implements Cloneable { /** * Internal representation of counts, broken down by pack value. */ - private int[] counts = new int[EnumSet.allOf(Base.class).size()]; + private Map counts = new HashMap(); /** * Create an empty Counts object with values A=0,C=0,G=0,T=0. @@ -26,13 +28,17 @@ public class Counts implements Cloneable { * @param cumulative Whether the counts are cumulative, (count_G=numA+numC+numG,for example). */ public Counts( int[] data, boolean cumulative ) { - for( Base base: EnumSet.allOf(Base.class)) - counts[base.toPack()] = data[base.toPack()]; + for( byte base: Bases.instance) + counts.put(base,data[Bases.toPack(base)]); // De-cumulatize data as necessary. if(cumulative) { - for( int i = EnumSet.allOf(Base.class).size()-1; i > 0; i-- ) - counts[i] -= counts[i-1]; + int previousCount = 0; + for( byte base: Bases.instance ) { + int count = counts.get(base); + counts.put(base,count-previousCount); + previousCount = count; + } } } @@ -42,9 +48,11 @@ public class Counts implements Cloneable { * @return Array of count values. */ public int[] toArray(boolean cumulative) { - int[] countArray = counts.clone(); + int[] countArray = new int[counts.size()]; + for(byte base: Bases.instance) + countArray[Bases.toPack(base)] = counts.get(base); if(cumulative) { - for( int i = 1; i < counts.length; i++ ) + for( int i = 1; i < countArray.length; i++ ) countArray[i] += countArray[i-1]; } return countArray; @@ -62,8 +70,7 @@ public class Counts implements Cloneable { catch(CloneNotSupportedException ex) { throw new StingException("Unable to clone counts object", ex); } - other.counts = new int[counts.length]; - System.arraycopy(counts,0,other.counts,0,counts.length); + other.counts = new HashMap(counts); return other; } @@ -71,8 +78,8 @@ public class Counts implements Cloneable { * Increment the number of bases seen at the given location. * @param base Base to increment. */ - public void increment(Base base) { - counts[base.toPack()]++; + public void increment(byte base) { + counts.put(base,counts.get(base)+1); } /** @@ -82,8 +89,8 @@ public class Counts implements Cloneable { * @param base Base for which to query counts. * @return Number of bases of this type seen. */ - public int get(Base base) { - return counts[base.toPack()]; + public int get(byte base) { + return counts.get(base); } /** @@ -93,10 +100,12 @@ public class Counts implements Cloneable { * @param base Base for which to query counts. * @return Number of bases of this type seen. */ - public int getCumulative(Base base) { + public int getCumulative(byte base) { int accum = 0; - for(int i = 0; i <= base.toPack(); i++) - accum += counts[i]; + for( byte current: Bases.allOf() ) { + if(base == current) break; + accum += counts.get(current); + } return accum; } @@ -106,7 +115,7 @@ public class Counts implements Cloneable { */ public int getTotal() { int accumulator = 0; - for( int count : counts ) + for( int count : counts.values() ) accumulator += count; return accumulator; } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/CreateBWTFromReference.java index 11f413b25..02e9780b8 100755 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/CreateBWTFromReference.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/CreateBWTFromReference.java @@ -61,7 +61,7 @@ public class CreateBWTFromReference { private Counts countOccurrences( String sequence ) { Counts occurrences = new Counts(); for( char base: sequence.toCharArray() ) - occurrences.increment(Base.fromASCII((byte)base)); + occurrences.increment((byte)base); return occurrences; } @@ -146,10 +146,10 @@ public class CreateBWTFromReference { // Count the occurences of each given base. Counts occurrences = creator.countOccurrences(sequence); - System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Base.A), - occurrences.getCumulative(Base.C), - occurrences.getCumulative(Base.G), - occurrences.getCumulative(Base.T)); + System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Bases.A), + occurrences.getCumulative(Bases.C), + occurrences.getCumulative(Bases.G), + occurrences.getCumulative(Bases.T)); // Generate the suffix array and print diagnostics. int[] suffixArrayData = creator.createSuffixArray(sequence);