diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java b/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java index 90272db41..a1875a389 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java @@ -49,7 +49,7 @@ public class AlignerTestHarness { count++; if( count > 200000 ) break; //if( count < 366000 ) continue; - //if( count != 2 ) continue; + //if( count > 2 ) break; //if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") ) // continue; //if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") ) diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java index 36319fda2..4d6d502c9 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java @@ -75,8 +75,8 @@ public class BWAAligner implements Aligner { public BWAAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) { forwardBWT = new BWTReader(forwardBWTFile).read(); reverseBWT = new BWTReader(reverseBWTFile).read(); - forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile).read(); - reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile).read(); + forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read(); + reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read(); } public List align( SAMRecord read ) { @@ -146,7 +146,7 @@ public class BWAAligner implements Aligner { continue; } - //System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position] : ""); + //System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : ""); /* System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(), alignment.negativeStrand?1:0, @@ -164,12 +164,6 @@ public class BWAAligner implements Aligner { // Temporary -- look ahead to see if the next alignment is bounded. boolean allowDifferences = mismatches > 0; boolean allowMismatches = mismatches > 0; - if( alignment.position+1 < read.getReadLength()-1 ) { - allowDifferences &= lowerBounds.get(alignment.position+2).value <= mismatches - 1; - allowMismatches &= - !(lowerBounds.get(alignment.position+2).value == mismatches-1 && lowerBounds.get(alignment.position+1).value == mismatches-1 && - lowerBounds.get(alignment.position+2).width == lowerBounds.get(alignment.position+1).width); - } if( allowDifferences && alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() && diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/BWT.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/BWT.java index 292319e3f..93ccf2a16 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/BWT.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/BWT.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.alignment.bwa.bwt; import org.broadinstitute.sting.alignment.bwa.packing.PackUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Pair; /** * Represents the Burrows-Wheeler Transform of a reference sequence. @@ -79,11 +81,8 @@ public class BWT { * @return Total counts for all bases lexicographically smaller than this base. */ public int occurrences(byte base,int index) { - // If the index is above the SA-1[0], remap it to the appropriate coordinate space. - if( index > inverseSA0 ) index--; - - SequenceBlock block = sequenceBlocks[index/SEQUENCE_BLOCK_SIZE]; - int position = index % SEQUENCE_BLOCK_SIZE; + SequenceBlock block = getSequenceBlock(index); + int position = getSequencePosition(index); int accumulator = block.occurrences.get(base); for(int i = 0; i <= position; i++) { if(base == block.sequence[i]) @@ -100,6 +99,32 @@ public class BWT { return counts.getTotal(); } + /** + * Gets the base at a given position in the BWT. + * @param index The index to use. + * @return The base at that location. + */ + protected byte getBase(int index) { + if(index == inverseSA0) + throw new StingException(String.format("Base at index %d does not have a text representation",index)); + + SequenceBlock block = getSequenceBlock(index); + int position = getSequencePosition(index); + return block.sequence[position]; + } + + private SequenceBlock getSequenceBlock(int index) { + // If the index is above the SA-1[0], remap it to the appropriate coordinate space. + if(index > inverseSA0) index--; + return sequenceBlocks[index/SEQUENCE_BLOCK_SIZE]; + } + + private int getSequencePosition(int index) { + // If the index is above the SA-1[0], remap it to the appropriate coordinate space. + if(index > inverseSA0) index--; + return index%SEQUENCE_BLOCK_SIZE; + } + /** * Create a set of sequence blocks from one long sequence. * @param sequence Sequence from which to derive blocks. diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Counts.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Counts.java index 48a83ffa8..950849b9b 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Counts.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/Counts.java @@ -25,7 +25,13 @@ public class Counts implements Cloneable { /** * Create an empty Counts object with values A=0,C=0,G=0,T=0. */ - public Counts() {} + public Counts() + { + for(byte base: Bases.instance) { + counts.put(base,0); + cumulativeCounts.put(base,0); + } + } /** * Create a counts data structure with the given initial values. @@ -63,8 +69,11 @@ public class Counts implements Cloneable { if(cumulative) { int index = 0; for(byte base: Bases.instance) { - if(index++ == 0) continue; - countArray[index] = getCumulative(base); + if(index == 0) { + index++; + continue; + } + countArray[index++] = getCumulative(base); } countArray[countArray.length-1] = getTotal(); } @@ -98,6 +107,11 @@ public class Counts implements Cloneable { */ public void increment(byte base) { counts.put(base,counts.get(base)+1); + int previous = 0; + for(byte cumulative: Bases.instance) { + cumulativeCounts.put(cumulative,counts.get(cumulative)+previous); + previous += counts.get(cumulative); + } } /** diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/CreateBWTFromReference.java index 02e9780b8..db3aa38b2 100755 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/CreateBWTFromReference.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/CreateBWTFromReference.java @@ -200,6 +200,7 @@ public class CreateBWTFromReference { reverseBWTWriter.write(reverseBWT); reverseBWTWriter.close(); + /* SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile); saWriter.write(suffixArray); saWriter.close(); @@ -207,6 +208,7 @@ public class CreateBWTFromReference { SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile); reverseSAWriter.write(reverseSuffixArray); reverseSAWriter.close(); + */ File existingBWTFile = new File(inputFileName+".bwt"); BWTReader existingBWTReader = new BWTReader(existingBWTFile); @@ -221,12 +223,14 @@ public class CreateBWTFromReference { } File existingSAFile = new File(inputFileName+".sa"); - SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile); + SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT); SuffixArray existingSuffixArray = existingSuffixArrayReader.read(); - for( int i = 0; i < suffixArray.sequence.length; i++ ) { - if( suffixArray.sequence[i] != existingSuffixArray.sequence[i] ) - throw new StingException("Suffix array mismatch at " + i); + for(int i = 0; i < suffixArray.length(); i++) { + if( i % 10000 == 0 ) + System.out.printf("Validating suffix array entry %d%n", i); + if( suffixArray.get(i) != existingSuffixArray.get(i) ) + throw new StingException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i))); } } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArray.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArray.java index 7af37a1d9..a0db8e920 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArray.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArray.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.alignment.bwa.bwt; +import org.broadinstitute.sting.utils.StingException; + /** * An in-memory representation of a suffix array. * @@ -9,7 +11,26 @@ package org.broadinstitute.sting.alignment.bwa.bwt; public class SuffixArray { public final int inverseSA0; public final Counts occurrences; - public final int[] sequence; + + /** + * The elements of the sequence actually stored in memory. + */ + protected final int[] sequence; + + /** + * How often are individual elements in the sequence actually stored + * in memory, as opposed to being calculated on the fly? + */ + protected final int sequenceInterval; + + /** + * The BWT used to calculate missing portions of the sequence. + */ + protected final BWT bwt; + + public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) { + this(inverseSA0,occurrences,sequence,1,null); + } /** * Creates a new sequence array with the given inverse SA, occurrences, and values. @@ -17,10 +38,15 @@ public class SuffixArray { * @param occurrences Cumulative number of occurrences of A,C,G,T, in order. * @param sequence The full suffix array. */ - public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) { + public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence, int sequenceInterval, BWT bwt) { this.inverseSA0 = inverseSA0; this.occurrences = occurrences; this.sequence = sequence; + this.sequenceInterval = sequenceInterval; + this.bwt = bwt; + + if(sequenceInterval != 1 && bwt == null) + throw new StingException("A BWT must be provided if the sequence interval is not 1"); } /** @@ -28,15 +54,29 @@ public class SuffixArray { * @return Length of the suffix array. */ public int length() { - return sequence.length; + if( bwt != null ) + return bwt.length()+1; + else + return sequence.length; } /** * Get the suffix array value at a given sequence. - * @param index - * @return + * @param index Index at which to retrieve the suffix array vaule. + * @return The suffix array value at that entry. */ - public int get( int index ) { - return sequence[index]; + public int get(int index) { + int iterations = 0; + while(index%sequenceInterval != 0) { + // The inverseSA0 ('$') doesn't have a usable ASCII representation; it must be treated as a special case. + if(index == inverseSA0) + index = 0; + else { + byte base = bwt.getBase(index); + index = bwt.counts(base) + bwt.occurrences(base,index); + } + iterations++; + } + return (sequence[index/sequenceInterval]+iterations) % length(); } } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArrayReader.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArrayReader.java index 8d7ea797c..f478a42d1 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArrayReader.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArrayReader.java @@ -19,13 +19,20 @@ public class SuffixArrayReader { */ private InputStream inputStream; + /** + * BWT to use to fill in missing data. + */ + private BWT bwt; + /** * Create a new suffix array reader. * @param inputFile File in which the suffix array is stored. + * @param bwt BWT to use when filling in missing data. */ - public SuffixArrayReader( File inputFile ) { + public SuffixArrayReader(File inputFile, BWT bwt) { try { this.inputStream = new BufferedInputStream(new FileInputStream(inputFile)); + this.bwt = bwt; } catch( FileNotFoundException ex ) { throw new StingException("Unable to open input file", ex); @@ -42,22 +49,22 @@ public class SuffixArrayReader { int inverseSA0; int[] occurrences; int[] suffixArray; + int suffixArrayInterval; try { inverseSA0 = intPackedInputStream.read(); occurrences = new int[PackUtils.ALPHABET_SIZE]; intPackedInputStream.read(occurrences); // Throw away the suffix array size in bytes and use the occurrences table directly. - intPackedInputStream.read(); - int suffixArraySize = occurrences[PackUtils.ALPHABET_SIZE-1]+1; - suffixArray = new int[suffixArraySize]; + suffixArrayInterval = intPackedInputStream.read(); + suffixArray = new int[occurrences[occurrences.length-1]+1/suffixArrayInterval]; intPackedInputStream.read(suffixArray); } catch( IOException ex ) { throw new StingException("Unable to read BWT from input stream.", ex); } - return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray); + return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray, suffixArrayInterval, bwt); }