Preparing for human -- support bwa output files directly rather than relying on a custom fixed sa interval.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1833 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d89bc2c796
commit
3553fc9ec0
|
|
@ -49,7 +49,7 @@ public class AlignerTestHarness {
|
|||
count++;
|
||||
if( count > 200000 ) break;
|
||||
//if( count < 366000 ) continue;
|
||||
//if( count != 2 ) continue;
|
||||
//if( count > 2 ) break;
|
||||
//if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") )
|
||||
// continue;
|
||||
//if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") )
|
||||
|
|
|
|||
|
|
@ -75,8 +75,8 @@ public class BWAAligner implements Aligner {
|
|||
public BWAAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) {
|
||||
forwardBWT = new BWTReader(forwardBWTFile).read();
|
||||
reverseBWT = new BWTReader(reverseBWTFile).read();
|
||||
forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile).read();
|
||||
reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile).read();
|
||||
forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read();
|
||||
reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read();
|
||||
}
|
||||
|
||||
public List<Alignment> align( SAMRecord read ) {
|
||||
|
|
@ -146,7 +146,7 @@ public class BWAAligner implements Aligner {
|
|||
continue;
|
||||
}
|
||||
|
||||
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position] : "");
|
||||
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : "");
|
||||
/*
|
||||
System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(),
|
||||
alignment.negativeStrand?1:0,
|
||||
|
|
@ -164,12 +164,6 @@ public class BWAAligner implements Aligner {
|
|||
// Temporary -- look ahead to see if the next alignment is bounded.
|
||||
boolean allowDifferences = mismatches > 0;
|
||||
boolean allowMismatches = mismatches > 0;
|
||||
if( alignment.position+1 < read.getReadLength()-1 ) {
|
||||
allowDifferences &= lowerBounds.get(alignment.position+2).value <= mismatches - 1;
|
||||
allowMismatches &=
|
||||
!(lowerBounds.get(alignment.position+2).value == mismatches-1 && lowerBounds.get(alignment.position+1).value == mismatches-1 &&
|
||||
lowerBounds.get(alignment.position+2).width == lowerBounds.get(alignment.position+1).width);
|
||||
}
|
||||
|
||||
if( allowDifferences &&
|
||||
alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() &&
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
package org.broadinstitute.sting.alignment.bwa.bwt;
|
||||
|
||||
import org.broadinstitute.sting.alignment.bwa.packing.PackUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
/**
|
||||
* Represents the Burrows-Wheeler Transform of a reference sequence.
|
||||
|
|
@ -79,11 +81,8 @@ public class BWT {
|
|||
* @return Total counts for all bases lexicographically smaller than this base.
|
||||
*/
|
||||
public int occurrences(byte base,int index) {
|
||||
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
|
||||
if( index > inverseSA0 ) index--;
|
||||
|
||||
SequenceBlock block = sequenceBlocks[index/SEQUENCE_BLOCK_SIZE];
|
||||
int position = index % SEQUENCE_BLOCK_SIZE;
|
||||
SequenceBlock block = getSequenceBlock(index);
|
||||
int position = getSequencePosition(index);
|
||||
int accumulator = block.occurrences.get(base);
|
||||
for(int i = 0; i <= position; i++) {
|
||||
if(base == block.sequence[i])
|
||||
|
|
@ -100,6 +99,32 @@ public class BWT {
|
|||
return counts.getTotal();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the base at a given position in the BWT.
|
||||
* @param index The index to use.
|
||||
* @return The base at that location.
|
||||
*/
|
||||
protected byte getBase(int index) {
|
||||
if(index == inverseSA0)
|
||||
throw new StingException(String.format("Base at index %d does not have a text representation",index));
|
||||
|
||||
SequenceBlock block = getSequenceBlock(index);
|
||||
int position = getSequencePosition(index);
|
||||
return block.sequence[position];
|
||||
}
|
||||
|
||||
private SequenceBlock getSequenceBlock(int index) {
|
||||
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
|
||||
if(index > inverseSA0) index--;
|
||||
return sequenceBlocks[index/SEQUENCE_BLOCK_SIZE];
|
||||
}
|
||||
|
||||
private int getSequencePosition(int index) {
|
||||
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
|
||||
if(index > inverseSA0) index--;
|
||||
return index%SEQUENCE_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a set of sequence blocks from one long sequence.
|
||||
* @param sequence Sequence from which to derive blocks.
|
||||
|
|
|
|||
|
|
@ -25,7 +25,13 @@ public class Counts implements Cloneable {
|
|||
/**
|
||||
* Create an empty Counts object with values A=0,C=0,G=0,T=0.
|
||||
*/
|
||||
public Counts() {}
|
||||
public Counts()
|
||||
{
|
||||
for(byte base: Bases.instance) {
|
||||
counts.put(base,0);
|
||||
cumulativeCounts.put(base,0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a counts data structure with the given initial values.
|
||||
|
|
@ -63,8 +69,11 @@ public class Counts implements Cloneable {
|
|||
if(cumulative) {
|
||||
int index = 0;
|
||||
for(byte base: Bases.instance) {
|
||||
if(index++ == 0) continue;
|
||||
countArray[index] = getCumulative(base);
|
||||
if(index == 0) {
|
||||
index++;
|
||||
continue;
|
||||
}
|
||||
countArray[index++] = getCumulative(base);
|
||||
}
|
||||
countArray[countArray.length-1] = getTotal();
|
||||
}
|
||||
|
|
@ -98,6 +107,11 @@ public class Counts implements Cloneable {
|
|||
*/
|
||||
public void increment(byte base) {
|
||||
counts.put(base,counts.get(base)+1);
|
||||
int previous = 0;
|
||||
for(byte cumulative: Bases.instance) {
|
||||
cumulativeCounts.put(cumulative,counts.get(cumulative)+previous);
|
||||
previous += counts.get(cumulative);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -200,6 +200,7 @@ public class CreateBWTFromReference {
|
|||
reverseBWTWriter.write(reverseBWT);
|
||||
reverseBWTWriter.close();
|
||||
|
||||
/*
|
||||
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
|
||||
saWriter.write(suffixArray);
|
||||
saWriter.close();
|
||||
|
|
@ -207,6 +208,7 @@ public class CreateBWTFromReference {
|
|||
SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
|
||||
reverseSAWriter.write(reverseSuffixArray);
|
||||
reverseSAWriter.close();
|
||||
*/
|
||||
|
||||
File existingBWTFile = new File(inputFileName+".bwt");
|
||||
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
|
||||
|
|
@ -221,12 +223,14 @@ public class CreateBWTFromReference {
|
|||
}
|
||||
|
||||
File existingSAFile = new File(inputFileName+".sa");
|
||||
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile);
|
||||
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT);
|
||||
SuffixArray existingSuffixArray = existingSuffixArrayReader.read();
|
||||
|
||||
for( int i = 0; i < suffixArray.sequence.length; i++ ) {
|
||||
if( suffixArray.sequence[i] != existingSuffixArray.sequence[i] )
|
||||
throw new StingException("Suffix array mismatch at " + i);
|
||||
for(int i = 0; i < suffixArray.length(); i++) {
|
||||
if( i % 10000 == 0 )
|
||||
System.out.printf("Validating suffix array entry %d%n", i);
|
||||
if( suffixArray.get(i) != existingSuffixArray.get(i) )
|
||||
throw new StingException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i)));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.alignment.bwa.bwt;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/**
|
||||
* An in-memory representation of a suffix array.
|
||||
*
|
||||
|
|
@ -9,7 +11,26 @@ package org.broadinstitute.sting.alignment.bwa.bwt;
|
|||
public class SuffixArray {
|
||||
public final int inverseSA0;
|
||||
public final Counts occurrences;
|
||||
public final int[] sequence;
|
||||
|
||||
/**
|
||||
* The elements of the sequence actually stored in memory.
|
||||
*/
|
||||
protected final int[] sequence;
|
||||
|
||||
/**
|
||||
* How often are individual elements in the sequence actually stored
|
||||
* in memory, as opposed to being calculated on the fly?
|
||||
*/
|
||||
protected final int sequenceInterval;
|
||||
|
||||
/**
|
||||
* The BWT used to calculate missing portions of the sequence.
|
||||
*/
|
||||
protected final BWT bwt;
|
||||
|
||||
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) {
|
||||
this(inverseSA0,occurrences,sequence,1,null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new sequence array with the given inverse SA, occurrences, and values.
|
||||
|
|
@ -17,10 +38,15 @@ public class SuffixArray {
|
|||
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
|
||||
* @param sequence The full suffix array.
|
||||
*/
|
||||
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence) {
|
||||
public SuffixArray(int inverseSA0, Counts occurrences, int[] sequence, int sequenceInterval, BWT bwt) {
|
||||
this.inverseSA0 = inverseSA0;
|
||||
this.occurrences = occurrences;
|
||||
this.sequence = sequence;
|
||||
this.sequenceInterval = sequenceInterval;
|
||||
this.bwt = bwt;
|
||||
|
||||
if(sequenceInterval != 1 && bwt == null)
|
||||
throw new StingException("A BWT must be provided if the sequence interval is not 1");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -28,15 +54,29 @@ public class SuffixArray {
|
|||
* @return Length of the suffix array.
|
||||
*/
|
||||
public int length() {
|
||||
return sequence.length;
|
||||
if( bwt != null )
|
||||
return bwt.length()+1;
|
||||
else
|
||||
return sequence.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the suffix array value at a given sequence.
|
||||
* @param index
|
||||
* @return
|
||||
* @param index Index at which to retrieve the suffix array vaule.
|
||||
* @return The suffix array value at that entry.
|
||||
*/
|
||||
public int get( int index ) {
|
||||
return sequence[index];
|
||||
public int get(int index) {
|
||||
int iterations = 0;
|
||||
while(index%sequenceInterval != 0) {
|
||||
// The inverseSA0 ('$') doesn't have a usable ASCII representation; it must be treated as a special case.
|
||||
if(index == inverseSA0)
|
||||
index = 0;
|
||||
else {
|
||||
byte base = bwt.getBase(index);
|
||||
index = bwt.counts(base) + bwt.occurrences(base,index);
|
||||
}
|
||||
iterations++;
|
||||
}
|
||||
return (sequence[index/sequenceInterval]+iterations) % length();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,13 +19,20 @@ public class SuffixArrayReader {
|
|||
*/
|
||||
private InputStream inputStream;
|
||||
|
||||
/**
|
||||
* BWT to use to fill in missing data.
|
||||
*/
|
||||
private BWT bwt;
|
||||
|
||||
/**
|
||||
* Create a new suffix array reader.
|
||||
* @param inputFile File in which the suffix array is stored.
|
||||
* @param bwt BWT to use when filling in missing data.
|
||||
*/
|
||||
public SuffixArrayReader( File inputFile ) {
|
||||
public SuffixArrayReader(File inputFile, BWT bwt) {
|
||||
try {
|
||||
this.inputStream = new BufferedInputStream(new FileInputStream(inputFile));
|
||||
this.bwt = bwt;
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new StingException("Unable to open input file", ex);
|
||||
|
|
@ -42,22 +49,22 @@ public class SuffixArrayReader {
|
|||
int inverseSA0;
|
||||
int[] occurrences;
|
||||
int[] suffixArray;
|
||||
int suffixArrayInterval;
|
||||
|
||||
try {
|
||||
inverseSA0 = intPackedInputStream.read();
|
||||
occurrences = new int[PackUtils.ALPHABET_SIZE];
|
||||
intPackedInputStream.read(occurrences);
|
||||
// Throw away the suffix array size in bytes and use the occurrences table directly.
|
||||
intPackedInputStream.read();
|
||||
int suffixArraySize = occurrences[PackUtils.ALPHABET_SIZE-1]+1;
|
||||
suffixArray = new int[suffixArraySize];
|
||||
suffixArrayInterval = intPackedInputStream.read();
|
||||
suffixArray = new int[occurrences[occurrences.length-1]+1/suffixArrayInterval];
|
||||
intPackedInputStream.read(suffixArray);
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
|
||||
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray);
|
||||
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray, suffixArrayInterval, bwt);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue