diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java new file mode 100644 index 000000000..1d97fec79 --- /dev/null +++ b/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java @@ -0,0 +1,68 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; + +import java.io.PrintStream; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; + +/** + * Writes .amb files - a file indicating where 'holes' (indeterminant bases) + * exist in the contig. Currently, only empty, placeholder AMBs are supported. + * + * @author mhanna + * @version 0.1 + */ +public class AMBWriter { + /** + * Number of holes is fixed at zero. + */ + private static final int NUM_HOLES = 0; + + /** + * Input stream from which to read BWT data. + */ + private final PrintStream out; + + /** + * Create a new ANNWriter targeting the given file. + * @param file file into which ANN data should be written. + * @throws java.io.IOException if there is a problem opening the output file. + */ + public AMBWriter(File file) throws IOException { + out = new PrintStream(file); + } + + /** + * Create a new ANNWriter targeting the given OutputStream. + * @param stream Stream into which ANN data should be written. + */ + public AMBWriter(OutputStream stream) { + out = new PrintStream(stream); + } + + /** + * Write the contents of the given dictionary into the AMB file. + * Assumes that there are no holes in the dictionary. + * @param dictionary Dictionary to write. + */ + public void writeEmpty(SAMSequenceDictionary dictionary) { + long genomeLength = 0L; + for(SAMSequenceRecord sequence: dictionary.getSequences()) + genomeLength += sequence.getSequenceLength(); + + int sequences = dictionary.getSequences().size(); + + // Write the header + out.printf("%d %d %d%n",genomeLength,sequences,NUM_HOLES); + } + + /** + * Close the given output stream. + */ + public void close() { + out.close(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java new file mode 100644 index 000000000..17296c31c --- /dev/null +++ b/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java @@ -0,0 +1,95 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; + +import java.io.PrintStream; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; + +/** + * Writes .ann files - an alternate sequence dictionary format + * used by BWA/C. For best results, the input sequence dictionary + * should be created with Picard's CreateSequenceDictionary.jar, + * TRUNCATE_NAMES_AT_WHITESPACE=false. + * + * @author mhanna + * @version 0.1 + */ +public class ANNWriter { + /** + * BWA uses a fixed seed of 11, written into every file. + */ + private static final int BNS_SEED = 11; + + /** + * A seemingly unused value that appears in every contig in the ANN. + */ + private static final int GI = 0; + + /** + * Input stream from which to read BWT data. + */ + private final PrintStream out; + + /** + * Create a new ANNWriter targeting the given file. + * @param file file into which ANN data should be written. + * @throws IOException if there is a problem opening the output file. + */ + public ANNWriter(File file) throws IOException { + out = new PrintStream(file); + } + + /** + * Create a new ANNWriter targeting the given OutputStream. + * @param stream Stream into which ANN data should be written. + */ + public ANNWriter(OutputStream stream) { + out = new PrintStream(stream); + } + + /** + * Write the contents of the given dictionary into the ANN file. + * Assumes that no ambs (blocks of indeterminate base) are present in the dictionary. + * @param dictionary Dictionary to write. + */ + public void write(SAMSequenceDictionary dictionary) { + long genomeLength = 0L; + for(SAMSequenceRecord sequence: dictionary.getSequences()) + genomeLength += sequence.getSequenceLength(); + + int sequences = dictionary.getSequences().size(); + + // Write the header + out.printf("%d %d %d%n",genomeLength,sequences,BNS_SEED); + + for(SAMSequenceRecord sequence: dictionary.getSequences()) { + String fullSequenceName = sequence.getSequenceName(); + String trimmedSequenceName = fullSequenceName; + String sequenceComment = "(null)"; + + long offset = 0; + + // Separate the sequence name from the sequence comment, based on BWA's definition. + // BWA's definition appears to accept a zero-length contig name, so mimic that behavior. + if(fullSequenceName.indexOf(' ') >= 0) { + trimmedSequenceName = fullSequenceName.substring(0,fullSequenceName.indexOf(' ')); + sequenceComment = fullSequenceName.substring(fullSequenceName.indexOf(' ')+1); + } + + // Write the sequence GI (?), name, and comment. + out.printf("%d %s %s%n",GI,trimmedSequenceName,sequenceComment); + // Write the sequence offset, length, and ambs (currently fixed at 0). + out.printf("%d %d %d%n",offset,sequence.getSequenceLength(),0); + } + } + + /** + * Close the given output stream. + */ + public void close() { + out.close(); + } +} diff --git a/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java new file mode 100644 index 000000000..f24baf766 --- /dev/null +++ b/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.SAMSequenceDictionary; + +import java.io.File; +import java.io.IOException; + +/** + * Generate BWA supplementary files (.ann, .amb) from the command line. + * + * @author mhanna + * @version 0.1 + */ +public class BWTSupplementaryFileGenerator { + enum SupplementaryFileType { ANN, AMB } + + public static void main(String[] args) throws IOException { + if(args.length < 3) + usage("Incorrect number of arguments supplied"); + + File fastaFile = new File(args[0]); + File outputFile = new File(args[1]); + SupplementaryFileType outputType = null; + try { + outputType = Enum.valueOf(SupplementaryFileType.class,args[2]); + } + catch(IllegalArgumentException ex) { + usage("Invalid output type: " + args[2]); + } + + ReferenceSequenceFile sequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fastaFile); + SAMSequenceDictionary dictionary = sequenceFile.getSequenceDictionary(); + + switch(outputType) { + case ANN: + ANNWriter annWriter = new ANNWriter(outputFile); + annWriter.write(dictionary); + annWriter.close(); + break; + case AMB: + AMBWriter ambWriter = new AMBWriter(outputFile); + ambWriter.writeEmpty(dictionary); + ambWriter.close(); + break; + default: + usage("Unsupported output type: " + outputType); + } + } + + /** + * Print usage information and exit. + */ + private static void usage(String message) { + System.err.println(message); + System.err.println("Usage: BWTSupplementaryFileGenerator "); + System.exit(1); + } +}