Preparing for human -- support bwa output files directly rather than relying on a custom fixed sa interval.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1833 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-10-14 17:17:46 +00:00
parent d89bc2c796
commit 3553fc9ec0
7 changed files with 118 additions and 34 deletions

View File

@ -49,7 +49,7 @@ public class AlignerTestHarness {
count++; count++;
if( count > 200000 ) break; if( count > 200000 ) break;
//if( count < 366000 ) continue; //if( count < 366000 ) continue;
//if( count != 2 ) continue; //if( count > 2 ) break;
//if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") ) //if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") )
// continue; // continue;
//if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") ) //if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") )

View File

@ -75,8 +75,8 @@ public class BWAAligner implements Aligner {
public BWAAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) { public BWAAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) {
forwardBWT = new BWTReader(forwardBWTFile).read(); forwardBWT = new BWTReader(forwardBWTFile).read();
reverseBWT = new BWTReader(reverseBWTFile).read(); reverseBWT = new BWTReader(reverseBWTFile).read();
forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile).read(); forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read();
reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile).read(); reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read();
} }
public List<Alignment> align( SAMRecord read ) { public List<Alignment> align( SAMRecord read ) {
@ -146,7 +146,7 @@ public class BWAAligner implements Aligner {
continue; continue;
} }
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position] : ""); //System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : "");
/* /*
System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(), System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(),
alignment.negativeStrand?1:0, alignment.negativeStrand?1:0,
@ -164,12 +164,6 @@ public class BWAAligner implements Aligner {
// Temporary -- look ahead to see if the next alignment is bounded. // Temporary -- look ahead to see if the next alignment is bounded.
boolean allowDifferences = mismatches > 0; boolean allowDifferences = mismatches > 0;
boolean allowMismatches = mismatches > 0; boolean allowMismatches = mismatches > 0;
if( alignment.position+1 < read.getReadLength()-1 ) {
allowDifferences &= lowerBounds.get(alignment.position+2).value <= mismatches - 1;
allowMismatches &=
!(lowerBounds.get(alignment.position+2).value == mismatches-1 && lowerBounds.get(alignment.position+1).value == mismatches-1 &&
lowerBounds.get(alignment.position+2).width == lowerBounds.get(alignment.position+1).width);
}
if( allowDifferences && if( allowDifferences &&
alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() && alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() &&

View File

@ -1,6 +1,8 @@
package org.broadinstitute.sting.alignment.bwa.bwt; package org.broadinstitute.sting.alignment.bwa.bwt;
import org.broadinstitute.sting.alignment.bwa.packing.PackUtils; import org.broadinstitute.sting.alignment.bwa.packing.PackUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Pair;
/** /**
* Represents the Burrows-Wheeler Transform of a reference sequence. * Represents the Burrows-Wheeler Transform of a reference sequence.
@ -79,11 +81,8 @@ public class BWT {
* @return Total counts for all bases lexicographically smaller than this base. * @return Total counts for all bases lexicographically smaller than this base.
*/ */
public int occurrences(byte base,int index) { public int occurrences(byte base,int index) {
// If the index is above the SA-1[0], remap it to the appropriate coordinate space. SequenceBlock block = getSequenceBlock(index);
if( index > inverseSA0 ) index--; int position = getSequencePosition(index);
SequenceBlock block = sequenceBlocks[index/SEQUENCE_BLOCK_SIZE];
int position = index % SEQUENCE_BLOCK_SIZE;
int accumulator = block.occurrences.get(base); int accumulator = block.occurrences.get(base);
for(int i = 0; i <= position; i++) { for(int i = 0; i <= position; i++) {
if(base == block.sequence[i]) if(base == block.sequence[i])
@ -100,6 +99,32 @@ public class BWT {
return counts.getTotal(); return counts.getTotal();
} }
/**
* Gets the base at a given position in the BWT.
* @param index The index to use.
* @return The base at that location.
*/
protected byte getBase(int index) {
if(index == inverseSA0)
throw new StingException(String.format("Base at index %d does not have a text representation",index));
SequenceBlock block = getSequenceBlock(index);
int position = getSequencePosition(index);
return block.sequence[position];
}
private SequenceBlock getSequenceBlock(int index) {
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
if(index > inverseSA0) index--;
return sequenceBlocks[index/SEQUENCE_BLOCK_SIZE];
}
private int getSequencePosition(int index) {
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
if(index > inverseSA0) index--;
return index%SEQUENCE_BLOCK_SIZE;
}
/** /**
* Create a set of sequence blocks from one long sequence. * Create a set of sequence blocks from one long sequence.
* @param sequence Sequence from which to derive blocks. * @param sequence Sequence from which to derive blocks.

View File

@ -25,7 +25,13 @@ public class Counts implements Cloneable {
/** /**
* Create an empty Counts object with values A=0,C=0,G=0,T=0. * Create an empty Counts object with values A=0,C=0,G=0,T=0.
*/ */
public Counts() {} public Counts()
{
for(byte base: Bases.instance) {
counts.put(base,0);
cumulativeCounts.put(base,0);
}
}
/** /**
* Create a counts data structure with the given initial values. * Create a counts data structure with the given initial values.
@ -63,8 +69,11 @@ public class Counts implements Cloneable {
if(cumulative) { if(cumulative) {
int index = 0; int index = 0;
for(byte base: Bases.instance) { for(byte base: Bases.instance) {
if(index++ == 0) continue; if(index == 0) {
countArray[index] = getCumulative(base); index++;
continue;
}
countArray[index++] = getCumulative(base);
} }
countArray[countArray.length-1] = getTotal(); countArray[countArray.length-1] = getTotal();
} }
@ -98,6 +107,11 @@ public class Counts implements Cloneable {
*/ */
public void increment(byte base) { public void increment(byte base) {
counts.put(base,counts.get(base)+1); counts.put(base,counts.get(base)+1);
int previous = 0;
for(byte cumulative: Bases.instance) {
cumulativeCounts.put(cumulative,counts.get(cumulative)+previous);
previous += counts.get(cumulative);
}
} }
/** /**

View File

@ -200,6 +200,7 @@ public class CreateBWTFromReference {
reverseBWTWriter.write(reverseBWT); reverseBWTWriter.write(reverseBWT);
reverseBWTWriter.close(); reverseBWTWriter.close();
/*
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile); SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
saWriter.write(suffixArray); saWriter.write(suffixArray);
saWriter.close(); saWriter.close();
@ -207,6 +208,7 @@ public class CreateBWTFromReference {
SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile); SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
reverseSAWriter.write(reverseSuffixArray); reverseSAWriter.write(reverseSuffixArray);
reverseSAWriter.close(); reverseSAWriter.close();
*/
File existingBWTFile = new File(inputFileName+".bwt"); File existingBWTFile = new File(inputFileName+".bwt");
BWTReader existingBWTReader = new BWTReader(existingBWTFile); BWTReader existingBWTReader = new BWTReader(existingBWTFile);
@ -221,12 +223,14 @@ public class CreateBWTFromReference {
} }
File existingSAFile = new File(inputFileName+".sa"); File existingSAFile = new File(inputFileName+".sa");
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile); SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT);
SuffixArray existingSuffixArray = existingSuffixArrayReader.read(); SuffixArray existingSuffixArray = existingSuffixArrayReader.read();
for( int i = 0; i < suffixArray.sequence.length; i++ ) { for(int i = 0; i < suffixArray.length(); i++) {
if( suffixArray.sequence[i] != existingSuffixArray.sequence[i] ) if( i % 10000 == 0 )
throw new StingException("Suffix array mismatch at " + i); System.out.printf("Validating suffix array entry %d%n", i);
if( suffixArray.get(i) != existingSuffixArray.get(i) )
throw new StingException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i)));
} }
} }

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.alignment.bwa.bwt; package org.broadinstitute.sting.alignment.bwa.bwt;
import org.broadinstitute.sting.utils.StingException;
/** /**
* An in-memory representation of a suffix array. * An in-memory representation of a suffix array.
* *
@ -9,7 +11,26 @@ package org.broadinstitute.sting.alignment.bwa.bwt;
public class SuffixArray { public class SuffixArray {
public final int inverseSA0; public final int inverseSA0;
public final Counts occurrences; public final Counts occurrences;
public final int[] sequence;
/**
* The elements of the sequence actually stored in memory.
*/
protected final int[] sequence;
/**
* How often are individual elements in the sequence actually stored
* in memory, as opposed to being calculated on the fly?
*/
protected final int sequenceInterval;
/**
* The BWT used to calculate missing portions of the sequence.
*/
protected final BWT bwt;
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) {
this(inverseSA0,occurrences,sequence,1,null);
}
/** /**
* Creates a new sequence array with the given inverse SA, occurrences, and values. * Creates a new sequence array with the given inverse SA, occurrences, and values.
@ -17,10 +38,15 @@ public class SuffixArray {
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order. * @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
* @param sequence The full suffix array. * @param sequence The full suffix array.
*/ */
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) { public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence, int sequenceInterval, BWT bwt) {
this.inverseSA0 = inverseSA0; this.inverseSA0 = inverseSA0;
this.occurrences = occurrences; this.occurrences = occurrences;
this.sequence = sequence; this.sequence = sequence;
this.sequenceInterval = sequenceInterval;
this.bwt = bwt;
if(sequenceInterval != 1 && bwt == null)
throw new StingException("A BWT must be provided if the sequence interval is not 1");
} }
/** /**
@ -28,15 +54,29 @@ public class SuffixArray {
* @return Length of the suffix array. * @return Length of the suffix array.
*/ */
public int length() { public int length() {
if( bwt != null )
return bwt.length()+1;
else
return sequence.length; return sequence.length;
} }
/** /**
* Get the suffix array value at a given sequence. * Get the suffix array value at a given sequence.
* @param index * @param index Index at which to retrieve the suffix array vaule.
* @return * @return The suffix array value at that entry.
*/ */
public int get( int index ) { public int get(int index) {
return sequence[index]; int iterations = 0;
while(index%sequenceInterval != 0) {
// The inverseSA0 ('$') doesn't have a usable ASCII representation; it must be treated as a special case.
if(index == inverseSA0)
index = 0;
else {
byte base = bwt.getBase(index);
index = bwt.counts(base) + bwt.occurrences(base,index);
}
iterations++;
}
return (sequence[index/sequenceInterval]+iterations) % length();
} }
} }

View File

@ -19,13 +19,20 @@ public class SuffixArrayReader {
*/ */
private InputStream inputStream; private InputStream inputStream;
/**
* BWT to use to fill in missing data.
*/
private BWT bwt;
/** /**
* Create a new suffix array reader. * Create a new suffix array reader.
* @param inputFile File in which the suffix array is stored. * @param inputFile File in which the suffix array is stored.
* @param bwt BWT to use when filling in missing data.
*/ */
public SuffixArrayReader( File inputFile ) { public SuffixArrayReader(File inputFile, BWT bwt) {
try { try {
this.inputStream = new BufferedInputStream(new FileInputStream(inputFile)); this.inputStream = new BufferedInputStream(new FileInputStream(inputFile));
this.bwt = bwt;
} }
catch( FileNotFoundException ex ) { catch( FileNotFoundException ex ) {
throw new StingException("Unable to open input file", ex); throw new StingException("Unable to open input file", ex);
@ -42,22 +49,22 @@ public class SuffixArrayReader {
int inverseSA0; int inverseSA0;
int[] occurrences; int[] occurrences;
int[] suffixArray; int[] suffixArray;
int suffixArrayInterval;
try { try {
inverseSA0 = intPackedInputStream.read(); inverseSA0 = intPackedInputStream.read();
occurrences = new int[PackUtils.ALPHABET_SIZE]; occurrences = new int[PackUtils.ALPHABET_SIZE];
intPackedInputStream.read(occurrences); intPackedInputStream.read(occurrences);
// Throw away the suffix array size in bytes and use the occurrences table directly. // Throw away the suffix array size in bytes and use the occurrences table directly.
intPackedInputStream.read(); suffixArrayInterval = intPackedInputStream.read();
int suffixArraySize = occurrences[PackUtils.ALPHABET_SIZE-1]+1; suffixArray = new int[occurrences[occurrences.length-1]+1/suffixArrayInterval];
suffixArray = new int[suffixArraySize];
intPackedInputStream.read(suffixArray); intPackedInputStream.read(suffixArray);
} }
catch( IOException ex ) { catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex); throw new StingException("Unable to read BWT from input stream.", ex);
} }
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray); return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray, suffixArrayInterval, bwt);
} }