Creates files supplemental to the reference sequence, consumed by BWA.

ANN - Alternate form of the sequence dictionary.  Should be created from a sequence dictionary with full contig names.
AMB - A map of 'holes' in the genome, aka runs of non-ACGTacgt bases.  This skeletal implementation always reports no
      holes.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2455 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-12-28 21:40:44 +00:00
parent fcc52fbcd1
commit d4ee999ef9
3 changed files with 223 additions and 0 deletions

View File

@ -0,0 +1,68 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import java.io.PrintStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
/**
* Writes .amb files - a file indicating where 'holes' (indeterminant bases)
* exist in the contig. Currently, only empty, placeholder AMBs are supported.
*
* @author mhanna
* @version 0.1
*/
public class AMBWriter {
/**
* Number of holes is fixed at zero.
*/
private static final int NUM_HOLES = 0;
/**
* Input stream from which to read BWT data.
*/
private final PrintStream out;
/**
* Create a new ANNWriter targeting the given file.
* @param file file into which ANN data should be written.
* @throws java.io.IOException if there is a problem opening the output file.
*/
public AMBWriter(File file) throws IOException {
out = new PrintStream(file);
}
/**
* Create a new ANNWriter targeting the given OutputStream.
* @param stream Stream into which ANN data should be written.
*/
public AMBWriter(OutputStream stream) {
out = new PrintStream(stream);
}
/**
* Write the contents of the given dictionary into the AMB file.
* Assumes that there are no holes in the dictionary.
* @param dictionary Dictionary to write.
*/
public void writeEmpty(SAMSequenceDictionary dictionary) {
long genomeLength = 0L;
for(SAMSequenceRecord sequence: dictionary.getSequences())
genomeLength += sequence.getSequenceLength();
int sequences = dictionary.getSequences().size();
// Write the header
out.printf("%d %d %d%n",genomeLength,sequences,NUM_HOLES);
}
/**
* Close the given output stream.
*/
public void close() {
out.close();
}
}

View File

@ -0,0 +1,95 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import java.io.PrintStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
/**
* Writes .ann files - an alternate sequence dictionary format
* used by BWA/C. For best results, the input sequence dictionary
* should be created with Picard's CreateSequenceDictionary.jar,
* TRUNCATE_NAMES_AT_WHITESPACE=false.
*
* @author mhanna
* @version 0.1
*/
public class ANNWriter {
/**
* BWA uses a fixed seed of 11, written into every file.
*/
private static final int BNS_SEED = 11;
/**
* A seemingly unused value that appears in every contig in the ANN.
*/
private static final int GI = 0;
/**
* Input stream from which to read BWT data.
*/
private final PrintStream out;
/**
* Create a new ANNWriter targeting the given file.
* @param file file into which ANN data should be written.
* @throws IOException if there is a problem opening the output file.
*/
public ANNWriter(File file) throws IOException {
out = new PrintStream(file);
}
/**
* Create a new ANNWriter targeting the given OutputStream.
* @param stream Stream into which ANN data should be written.
*/
public ANNWriter(OutputStream stream) {
out = new PrintStream(stream);
}
/**
* Write the contents of the given dictionary into the ANN file.
* Assumes that no ambs (blocks of indeterminate base) are present in the dictionary.
* @param dictionary Dictionary to write.
*/
public void write(SAMSequenceDictionary dictionary) {
long genomeLength = 0L;
for(SAMSequenceRecord sequence: dictionary.getSequences())
genomeLength += sequence.getSequenceLength();
int sequences = dictionary.getSequences().size();
// Write the header
out.printf("%d %d %d%n",genomeLength,sequences,BNS_SEED);
for(SAMSequenceRecord sequence: dictionary.getSequences()) {
String fullSequenceName = sequence.getSequenceName();
String trimmedSequenceName = fullSequenceName;
String sequenceComment = "(null)";
long offset = 0;
// Separate the sequence name from the sequence comment, based on BWA's definition.
// BWA's definition appears to accept a zero-length contig name, so mimic that behavior.
if(fullSequenceName.indexOf(' ') >= 0) {
trimmedSequenceName = fullSequenceName.substring(0,fullSequenceName.indexOf(' '));
sequenceComment = fullSequenceName.substring(fullSequenceName.indexOf(' ')+1);
}
// Write the sequence GI (?), name, and comment.
out.printf("%d %s %s%n",GI,trimmedSequenceName,sequenceComment);
// Write the sequence offset, length, and ambs (currently fixed at 0).
out.printf("%d %d %d%n",offset,sequence.getSequenceLength(),0);
}
}
/**
* Close the given output stream.
*/
public void close() {
out.close();
}
}

View File

@ -0,0 +1,60 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.SAMSequenceDictionary;
import java.io.File;
import java.io.IOException;
/**
* Generate BWA supplementary files (.ann, .amb) from the command line.
*
* @author mhanna
* @version 0.1
*/
public class BWTSupplementaryFileGenerator {
enum SupplementaryFileType { ANN, AMB }
public static void main(String[] args) throws IOException {
if(args.length < 3)
usage("Incorrect number of arguments supplied");
File fastaFile = new File(args[0]);
File outputFile = new File(args[1]);
SupplementaryFileType outputType = null;
try {
outputType = Enum.valueOf(SupplementaryFileType.class,args[2]);
}
catch(IllegalArgumentException ex) {
usage("Invalid output type: " + args[2]);
}
ReferenceSequenceFile sequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fastaFile);
SAMSequenceDictionary dictionary = sequenceFile.getSequenceDictionary();
switch(outputType) {
case ANN:
ANNWriter annWriter = new ANNWriter(outputFile);
annWriter.write(dictionary);
annWriter.close();
break;
case AMB:
AMBWriter ambWriter = new AMBWriter(outputFile);
ambWriter.writeEmpty(dictionary);
ambWriter.close();
break;
default:
usage("Unsupported output type: " + outputType);
}
}
/**
* Print usage information and exit.
*/
private static void usage(String message) {
System.err.println(message);
System.err.println("Usage: BWTSupplementaryFileGenerator <fasta> <output file> <output type>");
System.exit(1);
}
}