Creates files supplemental to the reference sequence, consumed by BWA.
ANN - Alternate form of the sequence dictionary. Should be created from a sequence dictionary with full contig names.
AMB - A map of 'holes' in the genome, aka runs of non-ACGTacgt bases. This skeletal implementation always reports no
holes.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2455 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fcc52fbcd1
commit
d4ee999ef9
|
|
@ -0,0 +1,68 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
|
||||
/**
|
||||
* Writes .amb files - a file indicating where 'holes' (indeterminant bases)
|
||||
* exist in the contig. Currently, only empty, placeholder AMBs are supported.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class AMBWriter {
|
||||
/**
|
||||
* Number of holes is fixed at zero.
|
||||
*/
|
||||
private static final int NUM_HOLES = 0;
|
||||
|
||||
/**
|
||||
* Input stream from which to read BWT data.
|
||||
*/
|
||||
private final PrintStream out;
|
||||
|
||||
/**
|
||||
* Create a new ANNWriter targeting the given file.
|
||||
* @param file file into which ANN data should be written.
|
||||
* @throws java.io.IOException if there is a problem opening the output file.
|
||||
*/
|
||||
public AMBWriter(File file) throws IOException {
|
||||
out = new PrintStream(file);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ANNWriter targeting the given OutputStream.
|
||||
* @param stream Stream into which ANN data should be written.
|
||||
*/
|
||||
public AMBWriter(OutputStream stream) {
|
||||
out = new PrintStream(stream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the contents of the given dictionary into the AMB file.
|
||||
* Assumes that there are no holes in the dictionary.
|
||||
* @param dictionary Dictionary to write.
|
||||
*/
|
||||
public void writeEmpty(SAMSequenceDictionary dictionary) {
|
||||
long genomeLength = 0L;
|
||||
for(SAMSequenceRecord sequence: dictionary.getSequences())
|
||||
genomeLength += sequence.getSequenceLength();
|
||||
|
||||
int sequences = dictionary.getSequences().size();
|
||||
|
||||
// Write the header
|
||||
out.printf("%d %d %d%n",genomeLength,sequences,NUM_HOLES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the given output stream.
|
||||
*/
|
||||
public void close() {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
|
||||
/**
|
||||
* Writes .ann files - an alternate sequence dictionary format
|
||||
* used by BWA/C. For best results, the input sequence dictionary
|
||||
* should be created with Picard's CreateSequenceDictionary.jar,
|
||||
* TRUNCATE_NAMES_AT_WHITESPACE=false.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class ANNWriter {
|
||||
/**
|
||||
* BWA uses a fixed seed of 11, written into every file.
|
||||
*/
|
||||
private static final int BNS_SEED = 11;
|
||||
|
||||
/**
|
||||
* A seemingly unused value that appears in every contig in the ANN.
|
||||
*/
|
||||
private static final int GI = 0;
|
||||
|
||||
/**
|
||||
* Input stream from which to read BWT data.
|
||||
*/
|
||||
private final PrintStream out;
|
||||
|
||||
/**
|
||||
* Create a new ANNWriter targeting the given file.
|
||||
* @param file file into which ANN data should be written.
|
||||
* @throws IOException if there is a problem opening the output file.
|
||||
*/
|
||||
public ANNWriter(File file) throws IOException {
|
||||
out = new PrintStream(file);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ANNWriter targeting the given OutputStream.
|
||||
* @param stream Stream into which ANN data should be written.
|
||||
*/
|
||||
public ANNWriter(OutputStream stream) {
|
||||
out = new PrintStream(stream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the contents of the given dictionary into the ANN file.
|
||||
* Assumes that no ambs (blocks of indeterminate base) are present in the dictionary.
|
||||
* @param dictionary Dictionary to write.
|
||||
*/
|
||||
public void write(SAMSequenceDictionary dictionary) {
|
||||
long genomeLength = 0L;
|
||||
for(SAMSequenceRecord sequence: dictionary.getSequences())
|
||||
genomeLength += sequence.getSequenceLength();
|
||||
|
||||
int sequences = dictionary.getSequences().size();
|
||||
|
||||
// Write the header
|
||||
out.printf("%d %d %d%n",genomeLength,sequences,BNS_SEED);
|
||||
|
||||
for(SAMSequenceRecord sequence: dictionary.getSequences()) {
|
||||
String fullSequenceName = sequence.getSequenceName();
|
||||
String trimmedSequenceName = fullSequenceName;
|
||||
String sequenceComment = "(null)";
|
||||
|
||||
long offset = 0;
|
||||
|
||||
// Separate the sequence name from the sequence comment, based on BWA's definition.
|
||||
// BWA's definition appears to accept a zero-length contig name, so mimic that behavior.
|
||||
if(fullSequenceName.indexOf(' ') >= 0) {
|
||||
trimmedSequenceName = fullSequenceName.substring(0,fullSequenceName.indexOf(' '));
|
||||
sequenceComment = fullSequenceName.substring(fullSequenceName.indexOf(' ')+1);
|
||||
}
|
||||
|
||||
// Write the sequence GI (?), name, and comment.
|
||||
out.printf("%d %s %s%n",GI,trimmedSequenceName,sequenceComment);
|
||||
// Write the sequence offset, length, and ambs (currently fixed at 0).
|
||||
out.printf("%d %d %d%n",offset,sequence.getSequenceLength(),0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the given output stream.
|
||||
*/
|
||||
public void close() {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Generate BWA supplementary files (.ann, .amb) from the command line.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWTSupplementaryFileGenerator {
|
||||
enum SupplementaryFileType { ANN, AMB }
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
if(args.length < 3)
|
||||
usage("Incorrect number of arguments supplied");
|
||||
|
||||
File fastaFile = new File(args[0]);
|
||||
File outputFile = new File(args[1]);
|
||||
SupplementaryFileType outputType = null;
|
||||
try {
|
||||
outputType = Enum.valueOf(SupplementaryFileType.class,args[2]);
|
||||
}
|
||||
catch(IllegalArgumentException ex) {
|
||||
usage("Invalid output type: " + args[2]);
|
||||
}
|
||||
|
||||
ReferenceSequenceFile sequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fastaFile);
|
||||
SAMSequenceDictionary dictionary = sequenceFile.getSequenceDictionary();
|
||||
|
||||
switch(outputType) {
|
||||
case ANN:
|
||||
ANNWriter annWriter = new ANNWriter(outputFile);
|
||||
annWriter.write(dictionary);
|
||||
annWriter.close();
|
||||
break;
|
||||
case AMB:
|
||||
AMBWriter ambWriter = new AMBWriter(outputFile);
|
||||
ambWriter.writeEmpty(dictionary);
|
||||
ambWriter.close();
|
||||
break;
|
||||
default:
|
||||
usage("Unsupported output type: " + outputType);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Print usage information and exit.
|
||||
*/
|
||||
private static void usage(String message) {
|
||||
System.err.println(message);
|
||||
System.err.println("Usage: BWTSupplementaryFileGenerator <fasta> <output file> <output type>");
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue