Preparing for human -- support bwa output files directly rather than relying on a custom fixed sa interval.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1833 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-10-14 17:17:46 +00:00
parent d89bc2c796
commit 3553fc9ec0
7 changed files with 118 additions and 34 deletions

View File

@ -49,7 +49,7 @@ public class AlignerTestHarness {
count++;
if( count > 200000 ) break;
//if( count < 366000 ) continue;
//if( count != 2 ) continue;
//if( count > 2 ) break;
//if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") )
// continue;
//if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") )

View File

@ -75,8 +75,8 @@ public class BWAAligner implements Aligner {
public BWAAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) {
forwardBWT = new BWTReader(forwardBWTFile).read();
reverseBWT = new BWTReader(reverseBWTFile).read();
forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile).read();
reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile).read();
forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read();
reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read();
}
public List<Alignment> align( SAMRecord read ) {
@ -146,7 +146,7 @@ public class BWAAligner implements Aligner {
continue;
}
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position] : "");
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : "");
/*
System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(),
alignment.negativeStrand?1:0,
@ -164,12 +164,6 @@ public class BWAAligner implements Aligner {
// Temporary -- look ahead to see if the next alignment is bounded.
boolean allowDifferences = mismatches > 0;
boolean allowMismatches = mismatches > 0;
if( alignment.position+1 < read.getReadLength()-1 ) {
allowDifferences &= lowerBounds.get(alignment.position+2).value <= mismatches - 1;
allowMismatches &=
!(lowerBounds.get(alignment.position+2).value == mismatches-1 && lowerBounds.get(alignment.position+1).value == mismatches-1 &&
lowerBounds.get(alignment.position+2).width == lowerBounds.get(alignment.position+1).width);
}
if( allowDifferences &&
alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() &&

View File

@ -1,6 +1,8 @@
package org.broadinstitute.sting.alignment.bwa.bwt;
import org.broadinstitute.sting.alignment.bwa.packing.PackUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Pair;
/**
* Represents the Burrows-Wheeler Transform of a reference sequence.
@ -79,11 +81,8 @@ public class BWT {
* @return Total counts for all bases lexicographically smaller than this base.
*/
public int occurrences(byte base,int index) {
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
if( index > inverseSA0 ) index--;
SequenceBlock block = sequenceBlocks[index/SEQUENCE_BLOCK_SIZE];
int position = index % SEQUENCE_BLOCK_SIZE;
SequenceBlock block = getSequenceBlock(index);
int position = getSequencePosition(index);
int accumulator = block.occurrences.get(base);
for(int i = 0; i <= position; i++) {
if(base == block.sequence[i])
@ -100,6 +99,32 @@ public class BWT {
return counts.getTotal();
}
/**
* Gets the base at a given position in the BWT.
* @param index The index to use.
* @return The base at that location.
*/
protected byte getBase(int index) {
if(index == inverseSA0)
throw new StingException(String.format("Base at index %d does not have a text representation",index));
SequenceBlock block = getSequenceBlock(index);
int position = getSequencePosition(index);
return block.sequence[position];
}
private SequenceBlock getSequenceBlock(int index) {
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
if(index > inverseSA0) index--;
return sequenceBlocks[index/SEQUENCE_BLOCK_SIZE];
}
private int getSequencePosition(int index) {
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
if(index > inverseSA0) index--;
return index%SEQUENCE_BLOCK_SIZE;
}
/**
* Create a set of sequence blocks from one long sequence.
* @param sequence Sequence from which to derive blocks.

View File

@ -25,7 +25,13 @@ public class Counts implements Cloneable {
/**
* Create an empty Counts object with values A=0,C=0,G=0,T=0.
*/
public Counts() {}
public Counts()
{
for(byte base: Bases.instance) {
counts.put(base,0);
cumulativeCounts.put(base,0);
}
}
/**
* Create a counts data structure with the given initial values.
@ -63,8 +69,11 @@ public class Counts implements Cloneable {
if(cumulative) {
int index = 0;
for(byte base: Bases.instance) {
if(index++ == 0) continue;
countArray[index] = getCumulative(base);
if(index == 0) {
index++;
continue;
}
countArray[index++] = getCumulative(base);
}
countArray[countArray.length-1] = getTotal();
}
@ -98,6 +107,11 @@ public class Counts implements Cloneable {
*/
public void increment(byte base) {
counts.put(base,counts.get(base)+1);
int previous = 0;
for(byte cumulative: Bases.instance) {
cumulativeCounts.put(cumulative,counts.get(cumulative)+previous);
previous += counts.get(cumulative);
}
}
/**

View File

@ -200,6 +200,7 @@ public class CreateBWTFromReference {
reverseBWTWriter.write(reverseBWT);
reverseBWTWriter.close();
/*
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
saWriter.write(suffixArray);
saWriter.close();
@ -207,6 +208,7 @@ public class CreateBWTFromReference {
SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
reverseSAWriter.write(reverseSuffixArray);
reverseSAWriter.close();
*/
File existingBWTFile = new File(inputFileName+".bwt");
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
@ -221,12 +223,14 @@ public class CreateBWTFromReference {
}
File existingSAFile = new File(inputFileName+".sa");
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile);
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT);
SuffixArray existingSuffixArray = existingSuffixArrayReader.read();
for( int i = 0; i < suffixArray.sequence.length; i++ ) {
if( suffixArray.sequence[i] != existingSuffixArray.sequence[i] )
throw new StingException("Suffix array mismatch at " + i);
for(int i = 0; i < suffixArray.length(); i++) {
if( i % 10000 == 0 )
System.out.printf("Validating suffix array entry %d%n", i);
if( suffixArray.get(i) != existingSuffixArray.get(i) )
throw new StingException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i)));
}
}

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.alignment.bwa.bwt;
import org.broadinstitute.sting.utils.StingException;
/**
* An in-memory representation of a suffix array.
*
@ -9,7 +11,26 @@ package org.broadinstitute.sting.alignment.bwa.bwt;
public class SuffixArray {
public final int inverseSA0;
public final Counts occurrences;
public final int[] sequence;
/**
* The elements of the sequence actually stored in memory.
*/
protected final int[] sequence;
/**
* How often are individual elements in the sequence actually stored
* in memory, as opposed to being calculated on the fly?
*/
protected final int sequenceInterval;
/**
* The BWT used to calculate missing portions of the sequence.
*/
protected final BWT bwt;
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) {
this(inverseSA0,occurrences,sequence,1,null);
}
/**
* Creates a new sequence array with the given inverse SA, occurrences, and values.
@ -17,10 +38,15 @@ public class SuffixArray {
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
* @param sequence The full suffix array.
*/
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) {
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence, int sequenceInterval, BWT bwt) {
this.inverseSA0 = inverseSA0;
this.occurrences = occurrences;
this.sequence = sequence;
this.sequenceInterval = sequenceInterval;
this.bwt = bwt;
if(sequenceInterval != 1 && bwt == null)
throw new StingException("A BWT must be provided if the sequence interval is not 1");
}
/**
@ -28,15 +54,29 @@ public class SuffixArray {
* @return Length of the suffix array.
*/
public int length() {
return sequence.length;
if( bwt != null )
return bwt.length()+1;
else
return sequence.length;
}
/**
* Get the suffix array value at a given sequence.
* @param index
* @return
* @param index Index at which to retrieve the suffix array vaule.
* @return The suffix array value at that entry.
*/
public int get( int index ) {
return sequence[index];
public int get(int index) {
int iterations = 0;
while(index%sequenceInterval != 0) {
// The inverseSA0 ('$') doesn't have a usable ASCII representation; it must be treated as a special case.
if(index == inverseSA0)
index = 0;
else {
byte base = bwt.getBase(index);
index = bwt.counts(base) + bwt.occurrences(base,index);
}
iterations++;
}
return (sequence[index/sequenceInterval]+iterations) % length();
}
}

View File

@ -19,13 +19,20 @@ public class SuffixArrayReader {
*/
private InputStream inputStream;
/**
* BWT to use to fill in missing data.
*/
private BWT bwt;
/**
* Create a new suffix array reader.
* @param inputFile File in which the suffix array is stored.
* @param bwt BWT to use when filling in missing data.
*/
public SuffixArrayReader( File inputFile ) {
public SuffixArrayReader(File inputFile, BWT bwt) {
try {
this.inputStream = new BufferedInputStream(new FileInputStream(inputFile));
this.bwt = bwt;
}
catch( FileNotFoundException ex ) {
throw new StingException("Unable to open input file", ex);
@ -42,22 +49,22 @@ public class SuffixArrayReader {
int inverseSA0;
int[] occurrences;
int[] suffixArray;
int suffixArrayInterval;
try {
inverseSA0 = intPackedInputStream.read();
occurrences = new int[PackUtils.ALPHABET_SIZE];
intPackedInputStream.read(occurrences);
// Throw away the suffix array size in bytes and use the occurrences table directly.
intPackedInputStream.read();
int suffixArraySize = occurrences[PackUtils.ALPHABET_SIZE-1]+1;
suffixArray = new int[suffixArraySize];
suffixArrayInterval = intPackedInputStream.read();
suffixArray = new int[occurrences[occurrences.length-1]+1/suffixArrayInterval];
intPackedInputStream.read(suffixArray);
}
catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex);
}
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray);
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray, suffixArrayInterval, bwt);
}