diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java index d156c015b..735d94e33 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.alignment.bwa; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.alignment.reference.packing.PackUtils; import org.broadinstitute.sting.alignment.reference.bwt.BWT; import org.broadinstitute.sting.alignment.reference.bwt.BWTWriter; @@ -14,6 +15,7 @@ import java.io.IOException; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.StringUtil; /** * Support files for BWT. @@ -132,14 +134,18 @@ public class BWTFiles { * @return A new object representing encoded representations of each sequence. */ public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) { + byte[] normalizedReferenceSequence = new byte[referenceSequence.length]; + System.arraycopy(referenceSequence,0,normalizedReferenceSequence,0,referenceSequence.length); + normalizeReferenceSequence(normalizedReferenceSequence); + File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile; try { // Write the ann and amb for this reference sequence. - annFile = File.createTempFile("bwt","ann"); - ambFile = File.createTempFile("bwt","amb"); + annFile = File.createTempFile("bwt",".ann"); + ambFile = File.createTempFile("bwt",".amb"); SAMSequenceDictionary dictionary = new SAMSequenceDictionary(); - dictionary.addSequence(new SAMSequenceRecord("autogenerated",referenceSequence.length)); + dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length)); ANNWriter annWriter = new ANNWriter(annFile); annWriter.write(dictionary); @@ -150,19 +156,18 @@ public class BWTFiles { ambWriter.close(); // Write the encoded files for the forward version of this reference sequence. - pacFile = File.createTempFile("bwt","pac"); - bwtFile = File.createTempFile("bwt","bwt"); - saFile = File.createTempFile("bwt","sa"); + pacFile = File.createTempFile("bwt",".pac"); + bwtFile = File.createTempFile("bwt",".bwt"); + saFile = File.createTempFile("bwt",".sa"); - writeEncodedReferenceSequence(referenceSequence,pacFile,bwtFile,saFile); + writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile); // Write the encoded files for the reverse version of this reference sequence. - byte[] reverseReferenceSequence = new byte[referenceSequence.length]; - System.arraycopy(referenceSequence,0,reverseReferenceSequence,0,referenceSequence.length); + byte[] reverseReferenceSequence = BaseUtils.reverse(normalizedReferenceSequence); - rpacFile = File.createTempFile("bwt","rpac"); - rbwtFile = File.createTempFile("bwt","rbwt"); - rsaFile = File.createTempFile("bwt","rsa"); + rpacFile = File.createTempFile("bwt",".rpac"); + rbwtFile = File.createTempFile("bwt",".rbwt"); + rsaFile = File.createTempFile("bwt",".rsa"); writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile); } @@ -207,5 +212,19 @@ public class BWTFiles { SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile); suffixArrayWriter.write(suffixArray); suffixArrayWriter.close(); - } + } + + /** + * Convert the given reference sequence into a form suitable for building into + * on-the-fly sequences. + * @param referenceSequence The reference sequence to normalize. + * @throws StingException if normalized sequence cannot be generated. + */ + private static void normalizeReferenceSequence(byte[] referenceSequence) { + StringUtil.toUpperCase(referenceSequence); + for(byte base: referenceSequence) { + if(base != 'A' && base != 'C' && base != 'G' && base != 'T') + throw new StingException(String.format("Base type %c is not supported when building references on-the-fly",(char)base)); + } + } } diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java index 781d54965..3cb45c738 100644 --- a/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java +++ b/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java @@ -45,7 +45,7 @@ public class SuffixArrayWriter { uintPackedOutputStream.write(1); // Length of the suffix array. uintPackedOutputStream.write(suffixArray.length()-1); - uintPackedOutputStream.write(suffixArray.sequence,1,suffixArray.sequence.length); + uintPackedOutputStream.write(suffixArray.sequence,1,suffixArray.sequence.length-1); } catch( IOException ex ) { throw new StingException("Unable to read BWT from input stream.", ex); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java new file mode 100644 index 000000000..14bc47d0d --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java @@ -0,0 +1,78 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.alignment.bwa.BWAAligner; +import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; +import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; +import org.broadinstitute.sting.alignment.Alignment; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.StingException; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.picard.reference.ReferenceSequence; + +import java.io.IOException; + +/** + * A walker to experiment with fishing for reads in the GATK. Has very limited utility in its current state. + * + * @author mhanna + * @version 0.1 + */ +public class TestReadFishingWalker extends ReadWalker { + /** + * An aligner for the small custom reference. + */ + private BWAAligner aligner; + + @Override + public void initialize() { + ReferenceSequence initialBases; + + try { + IndexedFastaSequenceFile reader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + SAMSequenceRecord firstSequence = reader.getSequenceDictionary().getSequences().get(0); + initialBases = reader.getSubsequenceAt(firstSequence.getSequenceName(),1,100); + } + catch(IOException ex) { + throw new StingException("Unable to load initial bases from reference sequence"); + } + + aligner = new BWACAligner(initialBases.getBases(),new BWAConfiguration()); + } + + @Override + public Integer map(char[] ref, SAMRecord read) { + Alignment bestAlignment = aligner.getBestAlignment(read.getReadBases()); + System.out.println("bestAlignment = " + bestAlignment); + return 1; + } + + + /** + * Provide an initial value for reduce computations. + * @return Initial value of reduce. + */ + @Override + public Long reduceInit() { + return 0L; + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * @param value result of the map. + * @param accum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + @Override + public Long reduce(Integer value, Long accum) { + return value + accum; + } + + @Override + public void onTraversalDone(Long result) { + aligner.close(); + super.onTraversalDone(result); + } + +}