Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
b9c9e0e952
|
|
@ -0,0 +1,70 @@
|
|||
#include "MainTools.h"
|
||||
#include "Basevector.h"
|
||||
#include "lookup/LookAlign.h"
|
||||
#include "lookup/SerialQltout.h"
|
||||
|
||||
unsigned int MatchingEnd(look_align &la, vecbasevector &candidates, vecbasevector &ref) {
|
||||
//la.PrintParseable(cout);
|
||||
|
||||
for (int i = 0; i < candidates.size(); i++) {
|
||||
look_align newla = la;
|
||||
|
||||
if (newla.rc1) { candidates[i].ReverseComplement(); }
|
||||
newla.ResetFromAlign(newla.a, candidates[i], ref[la.target_id]);
|
||||
|
||||
//newla.PrintParseable(cout, &candidates[i], &ref[newla.target_id]);
|
||||
//cout << newla.Errors() << " " << la.Errors() << endl;
|
||||
|
||||
if (newla.Errors() == la.Errors()) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
//FatalErr("Query id " + ToString(la.query_id) + " had no matches.");
|
||||
|
||||
return candidates.size() + 1;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
RunTime();
|
||||
|
||||
BeginCommandArguments;
|
||||
CommandArgument_String(ALIGNS);
|
||||
CommandArgument_String(FASTB_END_1);
|
||||
CommandArgument_String(FASTB_END_2);
|
||||
CommandArgument_String(REFERENCE);
|
||||
|
||||
CommandArgument_String(ALIGNS_END_1_OUT);
|
||||
CommandArgument_String(ALIGNS_END_2_OUT);
|
||||
EndCommandArguments;
|
||||
|
||||
vecbasevector ref(REFERENCE);
|
||||
vecbasevector reads1(FASTB_END_1);
|
||||
vecbasevector reads2(FASTB_END_2);
|
||||
|
||||
ofstream aligns1stream(ALIGNS_END_1_OUT.c_str());
|
||||
ofstream aligns2stream(ALIGNS_END_2_OUT.c_str());
|
||||
|
||||
basevector bv;
|
||||
|
||||
SerialQltout sqltout(ALIGNS);
|
||||
look_align la;
|
||||
while (sqltout.Next(la)) {
|
||||
vecbasevector candidates(2);
|
||||
candidates[0] = reads1[la.query_id];
|
||||
candidates[1] = reads2[la.query_id];
|
||||
|
||||
unsigned int matchingend = MatchingEnd(la, candidates, ref);
|
||||
if (matchingend < 2) {
|
||||
bv = (matchingend == 0) ? reads1[la.query_id] : reads2[la.query_id];
|
||||
|
||||
//la.PrintParseable(cout, &bv, &ref[la.target_id]);
|
||||
la.PrintParseable(((matchingend == 0) ? aligns1stream : aligns2stream), &bv, &ref[la.target_id]);
|
||||
}
|
||||
}
|
||||
|
||||
aligns1stream.close();
|
||||
aligns2stream.close();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
CXX=g++
|
||||
CXXFLAGS=-g -Wall -O2 -m64 -fPIC
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -I$(BWA_HOME) -I$(JAVA_INCLUDE) $< -o $@
|
||||
|
||||
all: init lib
|
||||
|
||||
init:
|
||||
@echo Please make sure the following platforms are set correctly on your machine.
|
||||
@echo BWA_HOME=$(BWA_HOME)
|
||||
@echo JAVA_INCLUDE=$(JAVA_INCLUDE)
|
||||
@echo TARGET_LIB=$(TARGET_LIB)
|
||||
@echo EXTRA_LIBS=$(EXTRA_LIBS)
|
||||
@echo LIBTOOL_COMMAND=$(LIBTOOL_COMMAND)
|
||||
|
||||
lib: org_broadinstitute_sting_alignment_bwa_c_BWACAligner.o bwa_gateway.o
|
||||
$(LIBTOOL_COMMAND) $? -o $(TARGET_LIB) -L$(BWA_HOME) -lbwacore $(EXTRA_LIBS)
|
||||
|
||||
clean:
|
||||
rm *.o libbwa.*
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
#!/bin/sh
|
||||
export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa"
|
||||
export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux"
|
||||
export TARGET_LIB="libbwa.so"
|
||||
export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread"
|
||||
export LIBTOOL_COMMAND="g++ -shared -Wl,-soname,libbwa.so"
|
||||
make
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
#!/bin/sh
|
||||
export BWA_HOME="/Users/mhanna/src/bwa"
|
||||
export JAVA_INCLUDE="/System/Library/Frameworks/JavaVM.framework/Headers"
|
||||
export TARGET_LIB="libbwa.dylib"
|
||||
export EXTRA_LIBS="-lc -lz -lsupc++"
|
||||
export LIBTOOL_COMMAND="libtool -dynamic"
|
||||
make
|
||||
|
|
@ -0,0 +1,268 @@
|
|||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
#include "bwase.h"
|
||||
#include "bwa_gateway.h"
|
||||
|
||||
BWA::BWA(const char* ann_filename,
|
||||
const char* amb_filename,
|
||||
const char* pac_filename,
|
||||
const char* forward_bwt_filename,
|
||||
const char* forward_sa_filename,
|
||||
const char* reverse_bwt_filename,
|
||||
const char* reverse_sa_filename)
|
||||
{
|
||||
// Load the bns (?) and reference
|
||||
bns = bns_restore_core(ann_filename,amb_filename,pac_filename);
|
||||
reference = new ubyte_t[bns->l_pac/4+1];
|
||||
rewind(bns->fp_pac);
|
||||
fread(reference, 1, bns->l_pac/4+1, bns->fp_pac);
|
||||
fclose(bns->fp_pac);
|
||||
bns->fp_pac = NULL;
|
||||
|
||||
// Load the BWTs (both directions) and suffix arrays (both directions)
|
||||
bwts[0] = bwt_restore_bwt(forward_bwt_filename);
|
||||
bwt_restore_sa(forward_sa_filename, bwts[0]);
|
||||
bwts[1] = bwt_restore_bwt(reverse_bwt_filename);
|
||||
bwt_restore_sa(reverse_sa_filename, bwts[1]);
|
||||
load_default_options();
|
||||
|
||||
// initialize the bwase subsystem
|
||||
bwase_initialize();
|
||||
}
|
||||
|
||||
BWA::~BWA() {
|
||||
delete[] reference;
|
||||
bns_destroy(bns);
|
||||
bwt_destroy(bwts[0]);
|
||||
bwt_destroy(bwts[1]);
|
||||
}
|
||||
|
||||
void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count)
|
||||
{
|
||||
bwa_seq_t* sequence = create_sequence(bases, read_length);
|
||||
|
||||
// Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln).
|
||||
// This method will destroy the contents of seq and rseq.
|
||||
bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options);
|
||||
|
||||
paths = new bwt_aln1_t[sequence->n_aln];
|
||||
memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t));
|
||||
num_paths = sequence->n_aln;
|
||||
|
||||
// Call aln2seq to initialize the type of match present.
|
||||
bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
|
||||
best_path_count = sequence->c1;
|
||||
second_best_path_count = sequence->c2;
|
||||
|
||||
bwa_free_read_seq(1,sequence);
|
||||
}
|
||||
|
||||
Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) {
|
||||
bwa_seq_t* sequence = create_sequence(bases,read_length);
|
||||
|
||||
// Calculate paths.
|
||||
bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options);
|
||||
|
||||
// Check for no alignments found and return null.
|
||||
if(sequence->n_aln == 0) {
|
||||
bwa_free_read_seq(1,sequence);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// bwa_cal_sa_reg_gap destroys the bases / read length. Copy them back in.
|
||||
copy_bases_into_sequence(sequence,bases,read_length);
|
||||
|
||||
// Pick best alignment and propagate its information into the sequence.
|
||||
bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
|
||||
|
||||
// Generate the best alignment from the sequence.
|
||||
Alignment* alignment = new Alignment;
|
||||
*alignment = generate_final_alignment_from_sequence(sequence);
|
||||
|
||||
bwa_free_read_seq(1,sequence);
|
||||
|
||||
return alignment;
|
||||
}
|
||||
|
||||
void BWA::generate_alignments_from_paths(const char* bases,
|
||||
const unsigned read_length,
|
||||
bwt_aln1_t* paths,
|
||||
const unsigned num_paths,
|
||||
const unsigned best_count,
|
||||
const unsigned second_best_count,
|
||||
Alignment*& alignments,
|
||||
unsigned& num_alignments)
|
||||
{
|
||||
bwa_seq_t* sequence = create_sequence(bases,read_length);
|
||||
|
||||
sequence->aln = paths;
|
||||
sequence->n_aln = num_paths;
|
||||
|
||||
// (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself.
|
||||
bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
|
||||
|
||||
// But overwrite key parts of the sequence in case the user passed back only a smaller subset
|
||||
// of the paths.
|
||||
sequence->c1 = best_count;
|
||||
sequence->c2 = second_best_count;
|
||||
sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
|
||||
|
||||
num_alignments = 0;
|
||||
for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++)
|
||||
num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1;
|
||||
|
||||
alignments = new Alignment[num_alignments];
|
||||
unsigned alignment_idx = 0;
|
||||
|
||||
for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
|
||||
// Stub in a 'working' path, so that only the desired alignment is local-aligned.
|
||||
const bwt_aln1_t* path = paths + path_idx;
|
||||
bwt_aln1_t working_path = *path;
|
||||
|
||||
// Loop through all alignments, aligning each one individually.
|
||||
for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) {
|
||||
working_path.k = working_path.l = sa_idx;
|
||||
sequence->aln = &working_path;
|
||||
sequence->n_aln = 1;
|
||||
|
||||
sequence->sa = sa_idx;
|
||||
sequence->strand = path->a;
|
||||
sequence->score = path->score;
|
||||
|
||||
// Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse.
|
||||
// TODO: Fix the interface to bwa_refine_gapped so its easier to work with.
|
||||
if(alignment_idx > 0)
|
||||
seq_reverse(sequence->len, sequence->seq, 0);
|
||||
|
||||
// Copy the local alignment data into the alignment object.
|
||||
*(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence);
|
||||
|
||||
alignment_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
sequence->aln = NULL;
|
||||
sequence->n_aln = 0;
|
||||
|
||||
bwa_free_read_seq(1,sequence);
|
||||
}
|
||||
|
||||
Alignment BWA::generate_final_alignment_from_sequence(bwa_seq_t* sequence) {
|
||||
// Calculate the local coordinate and local alignment.
|
||||
bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr);
|
||||
bwa_refine_gapped(bns, 1, sequence, reference, NULL);
|
||||
|
||||
// Copy the local alignment data into the alignment object.
|
||||
Alignment alignment;
|
||||
|
||||
// Populate basic path info
|
||||
alignment.edit_distance = sequence->nm;
|
||||
alignment.num_mismatches = sequence->n_mm;
|
||||
alignment.num_gap_opens = sequence->n_gapo;
|
||||
alignment.num_gap_extensions = sequence->n_gape;
|
||||
alignment.num_best = sequence->c1;
|
||||
alignment.num_second_best = sequence->c2;
|
||||
|
||||
// Final alignment position.
|
||||
alignment.type = sequence->type;
|
||||
bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig);
|
||||
alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1;
|
||||
alignment.negative_strand = sequence->strand;
|
||||
alignment.mapping_quality = sequence->mapQ;
|
||||
|
||||
// Cigar step.
|
||||
alignment.cigar = NULL;
|
||||
if(sequence->cigar) {
|
||||
alignment.cigar = new uint16_t[sequence->n_cigar];
|
||||
memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t));
|
||||
}
|
||||
alignment.n_cigar = sequence->n_cigar;
|
||||
|
||||
// MD tag with a better breakdown of differences in the cigar
|
||||
alignment.md = strdup(sequence->md);
|
||||
delete[] sequence->md;
|
||||
sequence->md = NULL;
|
||||
|
||||
return alignment;
|
||||
}
|
||||
|
||||
void BWA::load_default_options()
|
||||
{
|
||||
options.s_mm = 3;
|
||||
options.s_gapo = 11;
|
||||
options.s_gape = 4;
|
||||
options.mode = 3;
|
||||
options.indel_end_skip = 5;
|
||||
options.max_del_occ = 10;
|
||||
options.max_entries = 2000000;
|
||||
options.fnr = 0.04;
|
||||
options.max_diff = -1;
|
||||
options.max_gapo = 1;
|
||||
options.max_gape = 6;
|
||||
options.max_seed_diff = 2;
|
||||
options.seed_len = 2147483647;
|
||||
options.n_threads = 1;
|
||||
options.max_top2 = 30;
|
||||
options.trim_qual = 0;
|
||||
}
|
||||
|
||||
void BWA::set_max_edit_distance(float edit_distance) {
|
||||
if(edit_distance > 0 && edit_distance < 1) {
|
||||
options.fnr = edit_distance;
|
||||
options.max_diff = -1;
|
||||
}
|
||||
else {
|
||||
options.fnr = -1.0;
|
||||
options.max_diff = (int)edit_distance;
|
||||
}
|
||||
}
|
||||
|
||||
void BWA::set_max_gap_opens(int max_gap_opens) { options.max_gapo = max_gap_opens; }
|
||||
void BWA::set_max_gap_extensions(int max_gap_extensions) { options.max_gape = max_gap_extensions; }
|
||||
void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_skip = indel_range; }
|
||||
void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; }
|
||||
void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; }
|
||||
void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; }
|
||||
|
||||
/**
|
||||
* Create a sequence with a set of reasonable initial defaults.
|
||||
* Will leave seq and rseq empty.
|
||||
*/
|
||||
bwa_seq_t* BWA::create_sequence(const char* bases, const unsigned read_length)
|
||||
{
|
||||
bwa_seq_t* sequence = new bwa_seq_t;
|
||||
|
||||
sequence->tid = -1;
|
||||
|
||||
sequence->name = 0;
|
||||
|
||||
copy_bases_into_sequence(sequence, bases, read_length);
|
||||
|
||||
sequence->qual = 0;
|
||||
sequence->aln = 0;
|
||||
sequence->md = 0;
|
||||
|
||||
sequence->cigar = NULL;
|
||||
sequence->n_cigar = 0;
|
||||
|
||||
sequence->multi = NULL;
|
||||
sequence->n_multi = 0;
|
||||
|
||||
return sequence;
|
||||
}
|
||||
|
||||
void BWA::copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length)
|
||||
{
|
||||
// seq, rseq will ultimately be freed by bwa_cal_sa_reg_gap
|
||||
sequence->seq = new ubyte_t[read_length];
|
||||
sequence->rseq = new ubyte_t[read_length];
|
||||
for(unsigned i = 0; i < read_length; i++) sequence->seq[i] = nst_nt4_table[(unsigned)bases[i]];
|
||||
memcpy(sequence->rseq,sequence->seq,read_length);
|
||||
|
||||
// BWA expects the read bases to arrive reversed.
|
||||
seq_reverse(read_length,sequence->seq,0);
|
||||
seq_reverse(read_length,sequence->rseq,1);
|
||||
|
||||
sequence->full_len = sequence->len = read_length;
|
||||
}
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
#ifndef BWA_GATEWAY
|
||||
#define BWA_GATEWAY
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include "bntseq.h"
|
||||
#include "bwt.h"
|
||||
#include "bwtaln.h"
|
||||
|
||||
class Alignment {
|
||||
public:
|
||||
uint32_t type;
|
||||
int contig;
|
||||
bwtint_t pos;
|
||||
bool negative_strand;
|
||||
uint32_t mapping_quality;
|
||||
|
||||
uint16_t *cigar;
|
||||
int n_cigar;
|
||||
|
||||
uint8_t num_mismatches;
|
||||
uint8_t num_gap_opens;
|
||||
uint8_t num_gap_extensions;
|
||||
uint16_t edit_distance;
|
||||
|
||||
uint32_t num_best;
|
||||
uint32_t num_second_best;
|
||||
|
||||
char* md;
|
||||
};
|
||||
|
||||
class BWA {
|
||||
private:
|
||||
bntseq_t *bns;
|
||||
ubyte_t* reference;
|
||||
bwt_t* bwts[2];
|
||||
gap_opt_t options;
|
||||
|
||||
void load_default_options();
|
||||
bwa_seq_t* create_sequence(const char* bases, const unsigned read_length);
|
||||
void copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length);
|
||||
Alignment generate_final_alignment_from_sequence(bwa_seq_t* sequence);
|
||||
|
||||
public:
|
||||
BWA(const char* ann_filename,
|
||||
const char* amb_filename,
|
||||
const char* pac_filename,
|
||||
const char* forward_bwt_filename,
|
||||
const char* forward_sa_filename,
|
||||
const char* reverse_bwt_filename,
|
||||
const char* reverse_sa_filename);
|
||||
~BWA();
|
||||
|
||||
// Parameterize the aligner.
|
||||
void set_max_edit_distance(float edit_distance);
|
||||
void set_max_gap_opens(int max_gap_opens);
|
||||
void set_max_gap_extensions(int max_gap_extensions);
|
||||
void set_disallow_indel_within_range(int indel_range);
|
||||
void set_mismatch_penalty(int penalty);
|
||||
void set_gap_open_penalty(int penalty);
|
||||
void set_gap_extension_penalty(int penalty);
|
||||
|
||||
// Perform the alignment
|
||||
Alignment* generate_single_alignment(const char* bases,
|
||||
const unsigned read_length);
|
||||
void find_paths(const char* bases,
|
||||
const unsigned read_length,
|
||||
bwt_aln1_t*& paths,
|
||||
unsigned& num_paths,
|
||||
unsigned& best_path_count,
|
||||
unsigned& second_best_path_count);
|
||||
void generate_alignments_from_paths(const char* bases,
|
||||
const unsigned read_length,
|
||||
bwt_aln1_t* paths,
|
||||
const unsigned num_paths,
|
||||
const unsigned best_count,
|
||||
const unsigned second_best_count,
|
||||
Alignment*& alignments,
|
||||
unsigned& num_alignments);
|
||||
};
|
||||
|
||||
#endif // BWA_GATEWAY
|
||||
Binary file not shown.
|
|
@ -0,0 +1,437 @@
|
|||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include "bntseq.h"
|
||||
#include "bwt.h"
|
||||
#include "bwtaln.h"
|
||||
#include "bwa_gateway.h"
|
||||
#include "org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h"
|
||||
|
||||
typedef void (BWA::*int_setter)(int value);
|
||||
typedef void (BWA::*float_setter)(float value);
|
||||
|
||||
static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment);
|
||||
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name);
|
||||
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter);
|
||||
static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter);
|
||||
static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message);
|
||||
|
||||
JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create(JNIEnv* env, jobject instance, jobject bwtFiles, jobject configuration)
|
||||
{
|
||||
jstring java_ann = get_configuration_file(env,bwtFiles,"annFile");
|
||||
if(java_ann == NULL) return 0L;
|
||||
jstring java_amb = get_configuration_file(env,bwtFiles,"ambFile");
|
||||
if(java_amb == NULL) return 0L;
|
||||
jstring java_pac = get_configuration_file(env,bwtFiles,"pacFile");
|
||||
if(java_pac == NULL) return 0L;
|
||||
jstring java_forward_bwt = get_configuration_file(env,bwtFiles,"forwardBWTFile");
|
||||
if(java_forward_bwt == NULL) return 0L;
|
||||
jstring java_forward_sa = get_configuration_file(env,bwtFiles,"forwardSAFile");
|
||||
if(java_forward_sa == NULL) return 0L;
|
||||
jstring java_reverse_bwt = get_configuration_file(env,bwtFiles,"reverseBWTFile");
|
||||
if(java_reverse_bwt == NULL) return 0L;
|
||||
jstring java_reverse_sa = get_configuration_file(env,bwtFiles,"reverseSAFile");
|
||||
if(java_reverse_sa == NULL) return 0L;
|
||||
|
||||
const char* ann_filename = env->GetStringUTFChars(java_ann,JNI_FALSE);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
const char* amb_filename = env->GetStringUTFChars(java_amb,JNI_FALSE);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
const char* pac_filename = env->GetStringUTFChars(java_pac,JNI_FALSE);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
const char* forward_bwt_filename = env->GetStringUTFChars(java_forward_bwt,JNI_FALSE);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
const char* forward_sa_filename = env->GetStringUTFChars(java_forward_sa,JNI_FALSE);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
const char* reverse_bwt_filename = env->GetStringUTFChars(java_reverse_bwt,JNI_FALSE);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
const char* reverse_sa_filename = env->GetStringUTFChars(java_reverse_sa,JNI_FALSE);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
|
||||
BWA* bwa = new BWA(ann_filename,
|
||||
amb_filename,
|
||||
pac_filename,
|
||||
forward_bwt_filename,
|
||||
forward_sa_filename,
|
||||
reverse_bwt_filename,
|
||||
reverse_sa_filename);
|
||||
|
||||
Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(env,instance,(jlong)bwa,configuration);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
|
||||
env->ReleaseStringUTFChars(java_ann,ann_filename);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
env->ReleaseStringUTFChars(java_amb,amb_filename);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
env->ReleaseStringUTFChars(java_pac,pac_filename);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
env->ReleaseStringUTFChars(java_forward_bwt,forward_bwt_filename);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
env->ReleaseStringUTFChars(java_forward_sa,forward_sa_filename);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
env->ReleaseStringUTFChars(java_reverse_bwt,reverse_bwt_filename);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
env->ReleaseStringUTFChars(java_reverse_sa,reverse_sa_filename);
|
||||
if(env->ExceptionCheck()) return 0L;
|
||||
|
||||
return (jlong)bwa;
|
||||
}
|
||||
|
||||
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy(JNIEnv* env, jobject instance, jlong java_bwa)
|
||||
{
|
||||
BWA* bwa = (BWA*)java_bwa;
|
||||
delete bwa;
|
||||
}
|
||||
|
||||
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(JNIEnv *env, jobject instance, jlong java_bwa, jobject configuration) {
|
||||
BWA* bwa = (BWA*)java_bwa;
|
||||
set_float_configuration_param(env, configuration, "maximumEditDistance", bwa, &BWA::set_max_edit_distance);
|
||||
if(env->ExceptionCheck()) return;
|
||||
set_int_configuration_param(env, configuration, "maximumGapOpens", bwa, &BWA::set_max_gap_opens);
|
||||
if(env->ExceptionCheck()) return;
|
||||
set_int_configuration_param(env, configuration, "maximumGapExtensions", bwa, &BWA::set_max_gap_extensions);
|
||||
if(env->ExceptionCheck()) return;
|
||||
set_int_configuration_param(env, configuration, "disallowIndelWithinRange", bwa, &BWA::set_disallow_indel_within_range);
|
||||
if(env->ExceptionCheck()) return;
|
||||
set_int_configuration_param(env, configuration, "mismatchPenalty", bwa, &BWA::set_mismatch_penalty);
|
||||
if(env->ExceptionCheck()) return;
|
||||
set_int_configuration_param(env, configuration, "gapOpenPenalty", bwa, &BWA::set_gap_open_penalty);
|
||||
if(env->ExceptionCheck()) return;
|
||||
set_int_configuration_param(env, configuration, "gapExtensionPenalty", bwa, &BWA::set_gap_extension_penalty);
|
||||
if(env->ExceptionCheck()) return;
|
||||
}
|
||||
|
||||
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases)
|
||||
{
|
||||
BWA* bwa = (BWA*)java_bwa;
|
||||
|
||||
const jsize read_length = env->GetArrayLength(java_bases);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
|
||||
if(read_bases == NULL) return NULL;
|
||||
|
||||
bwt_aln1_t* paths = NULL;
|
||||
unsigned num_paths = 0;
|
||||
|
||||
unsigned best_path_count, second_best_path_count;
|
||||
bwa->find_paths((const char*)read_bases,read_length,paths,num_paths,best_path_count,second_best_path_count);
|
||||
|
||||
jobjectArray java_paths = env->NewObjectArray(num_paths, env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"), NULL);
|
||||
if(java_paths == NULL) return NULL;
|
||||
|
||||
for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
|
||||
bwt_aln1_t& path = *(paths + path_idx);
|
||||
|
||||
jclass java_path_class = env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath");
|
||||
if(java_path_class == NULL) return NULL;
|
||||
|
||||
jmethodID java_path_constructor = env->GetMethodID(java_path_class, "<init>", "(IIIZJJIII)V");
|
||||
if(java_path_constructor == NULL) return NULL;
|
||||
|
||||
// Note that k/l are being cast to long. Bad things will happen if JNI assumes that they're ints.
|
||||
jobject java_path = env->NewObject(java_path_class,
|
||||
java_path_constructor,
|
||||
path.n_mm,
|
||||
path.n_gapo,
|
||||
path.n_gape,
|
||||
path.a,
|
||||
(jlong)path.k,
|
||||
(jlong)path.l,
|
||||
path.score,
|
||||
best_path_count,
|
||||
second_best_path_count);
|
||||
if(java_path == NULL) return NULL;
|
||||
|
||||
env->SetObjectArrayElement(java_paths,path_idx,java_path);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
env->DeleteLocalRef(java_path_class);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
}
|
||||
|
||||
delete[] paths;
|
||||
|
||||
env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
|
||||
|
||||
return env->ExceptionCheck() ? NULL : java_paths;
|
||||
}
|
||||
|
||||
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases, jobjectArray java_paths)
|
||||
{
|
||||
BWA* bwa = (BWA*)java_bwa;
|
||||
|
||||
const jsize read_length = env->GetArrayLength(java_bases);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
|
||||
if(read_bases == NULL) return NULL;
|
||||
|
||||
const jsize num_paths = env->GetArrayLength(java_paths);
|
||||
bwt_aln1_t* paths = new bwt_aln1_t[num_paths];
|
||||
unsigned best_count = 0, second_best_count = 0;
|
||||
|
||||
for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
|
||||
jobject java_path = env->GetObjectArrayElement(java_paths,path_idx);
|
||||
jclass java_path_class = env->GetObjectClass(java_path);
|
||||
if(java_path_class == NULL) return NULL;
|
||||
|
||||
bwt_aln1_t& path = *(paths + path_idx);
|
||||
|
||||
jfieldID mismatches_field = env->GetFieldID(java_path_class, "numMismatches", "I");
|
||||
if(mismatches_field == NULL) return NULL;
|
||||
path.n_mm = env->GetIntField(java_path,mismatches_field);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jfieldID gap_opens_field = env->GetFieldID(java_path_class, "numGapOpens", "I");
|
||||
if(gap_opens_field == NULL) return NULL;
|
||||
path.n_gapo = env->GetIntField(java_path,gap_opens_field);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jfieldID gap_extensions_field = env->GetFieldID(java_path_class, "numGapExtensions", "I");
|
||||
if(gap_extensions_field == NULL) return NULL;
|
||||
path.n_gape = env->GetIntField(java_path,gap_extensions_field);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jfieldID negative_strand_field = env->GetFieldID(java_path_class, "negativeStrand", "Z");
|
||||
if(negative_strand_field == NULL) return NULL;
|
||||
path.a = env->GetBooleanField(java_path,negative_strand_field);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jfieldID k_field = env->GetFieldID(java_path_class, "k", "J");
|
||||
if(k_field == NULL) return NULL;
|
||||
path.k = env->GetLongField(java_path,k_field);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jfieldID l_field = env->GetFieldID(java_path_class, "l", "J");
|
||||
if(l_field == NULL) return NULL;
|
||||
path.l = env->GetLongField(java_path,l_field);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jfieldID score_field = env->GetFieldID(java_path_class, "score", "I");
|
||||
if(score_field == NULL) return NULL;
|
||||
path.score = env->GetIntField(java_path,score_field);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jfieldID best_count_field = env->GetFieldID(java_path_class, "bestCount", "I");
|
||||
if(best_count_field == NULL) return NULL;
|
||||
best_count = env->GetIntField(java_path,best_count_field);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jfieldID second_best_count_field = env->GetFieldID(java_path_class, "secondBestCount", "I");
|
||||
if(second_best_count_field == NULL) return NULL;
|
||||
second_best_count = env->GetIntField(java_path,second_best_count_field);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
}
|
||||
|
||||
Alignment* alignments = NULL;
|
||||
unsigned num_alignments = 0;
|
||||
bwa->generate_alignments_from_paths((const char*)read_bases,read_length,paths,num_paths,best_count,second_best_count,alignments,num_alignments);
|
||||
|
||||
jobjectArray java_alignments = env->NewObjectArray(num_alignments, env->FindClass("org/broadinstitute/sting/alignment/Alignment"), NULL);
|
||||
if(java_alignments == NULL) return NULL;
|
||||
|
||||
for(unsigned alignment_idx = 0; alignment_idx < (unsigned)num_alignments; alignment_idx++) {
|
||||
Alignment& alignment = *(alignments + alignment_idx);
|
||||
jobject java_alignment = convert_to_java_alignment(env,read_bases,read_length,alignment);
|
||||
if(java_alignment == NULL) return NULL;
|
||||
env->SetObjectArrayElement(java_alignments,alignment_idx,java_alignment);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
}
|
||||
|
||||
delete[] alignments;
|
||||
delete[] paths;
|
||||
|
||||
env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
|
||||
|
||||
return env->ExceptionCheck() ? NULL : java_alignments;
|
||||
}
|
||||
|
||||
JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases) {
|
||||
BWA* bwa = (BWA*)java_bwa;
|
||||
|
||||
const jsize read_length = env->GetArrayLength(java_bases);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
|
||||
if(read_bases == NULL) return NULL;
|
||||
|
||||
Alignment* best_alignment = bwa->generate_single_alignment((const char*)read_bases,read_length);
|
||||
jobject java_best_alignment = (best_alignment != NULL) ? convert_to_java_alignment(env,read_bases,read_length,*best_alignment) : NULL;
|
||||
delete best_alignment;
|
||||
|
||||
env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
|
||||
|
||||
return java_best_alignment;
|
||||
}
|
||||
|
||||
static jobject convert_to_java_alignment(JNIEnv *env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment) {
|
||||
unsigned cigar_length;
|
||||
if(alignment.type == BWA_TYPE_NO_MATCH) cigar_length = 0;
|
||||
else if(!alignment.cigar) cigar_length = 1;
|
||||
else cigar_length = alignment.n_cigar;
|
||||
|
||||
jcharArray java_cigar_operators = env->NewCharArray(cigar_length);
|
||||
if(java_cigar_operators == NULL) return NULL;
|
||||
jintArray java_cigar_lengths = env->NewIntArray(cigar_length);
|
||||
if(java_cigar_lengths == NULL) return NULL;
|
||||
|
||||
if(alignment.cigar) {
|
||||
for(unsigned cigar_idx = 0; cigar_idx < (unsigned)alignment.n_cigar; ++cigar_idx) {
|
||||
jchar cigar_operator = "MIDS"[alignment.cigar[cigar_idx]>>14];
|
||||
jint cigar_length = alignment.cigar[cigar_idx]&0x3fff;
|
||||
|
||||
env->SetCharArrayRegion(java_cigar_operators,cigar_idx,1,&cigar_operator);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
env->SetIntArrayRegion(java_cigar_lengths,cigar_idx,1,&cigar_length);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if(alignment.type != BWA_TYPE_NO_MATCH) {
|
||||
jchar cigar_operator = 'M';
|
||||
env->SetCharArrayRegion(java_cigar_operators,0,1,&cigar_operator);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
env->SetIntArrayRegion(java_cigar_lengths,0,1,&read_length);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
}
|
||||
}
|
||||
delete[] alignment.cigar;
|
||||
|
||||
jclass java_alignment_class = env->FindClass("org/broadinstitute/sting/alignment/Alignment");
|
||||
if(java_alignment_class == NULL) return NULL;
|
||||
|
||||
jmethodID java_alignment_constructor = env->GetMethodID(java_alignment_class, "<init>", "(IIZI[C[IILjava/lang/String;IIIII)V");
|
||||
if(java_alignment_constructor == NULL) return NULL;
|
||||
|
||||
jstring java_md = env->NewStringUTF(alignment.md);
|
||||
if(java_md == NULL) return NULL;
|
||||
delete[] alignment.md;
|
||||
|
||||
jobject java_alignment = env->NewObject(java_alignment_class,
|
||||
java_alignment_constructor,
|
||||
alignment.contig,
|
||||
alignment.pos,
|
||||
alignment.negative_strand,
|
||||
alignment.mapping_quality,
|
||||
java_cigar_operators,
|
||||
java_cigar_lengths,
|
||||
alignment.edit_distance,
|
||||
java_md,
|
||||
alignment.num_mismatches,
|
||||
alignment.num_gap_opens,
|
||||
alignment.num_gap_extensions,
|
||||
alignment.num_best,
|
||||
alignment.num_second_best);
|
||||
if(java_alignment == NULL) return NULL;
|
||||
|
||||
env->DeleteLocalRef(java_alignment_class);
|
||||
if(env->ExceptionCheck()) return NULL;
|
||||
|
||||
return java_alignment;
|
||||
}
|
||||
|
||||
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name) {
|
||||
jclass configuration_class = env->GetObjectClass(configuration);
|
||||
if(configuration_class == NULL) return NULL;
|
||||
|
||||
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/io/File;");
|
||||
if(configuration_field == NULL) return NULL;
|
||||
|
||||
jobject configuration_file = (jobject)env->GetObjectField(configuration,configuration_field);
|
||||
|
||||
jclass file_class = env->FindClass("java/io/File");
|
||||
if(file_class == NULL) return NULL;
|
||||
|
||||
jmethodID path_extractor = env->GetMethodID(file_class,"getAbsolutePath", "()Ljava/lang/String;");
|
||||
if(path_extractor == NULL) return NULL;
|
||||
|
||||
jstring path = (jstring)env->CallObjectMethod(configuration_file,path_extractor);
|
||||
if(path == NULL) return NULL;
|
||||
|
||||
env->DeleteLocalRef(configuration_class);
|
||||
env->DeleteLocalRef(file_class);
|
||||
env->DeleteLocalRef(configuration_file);
|
||||
|
||||
return path;
|
||||
}
|
||||
|
||||
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) {
|
||||
jclass configuration_class = env->GetObjectClass(configuration);
|
||||
if(configuration_class == NULL) return;
|
||||
|
||||
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Integer;");
|
||||
if(configuration_field == NULL) return;
|
||||
|
||||
jobject boxed_value = env->GetObjectField(configuration,configuration_field);
|
||||
if(env->ExceptionCheck()) return;
|
||||
|
||||
if(boxed_value != NULL) {
|
||||
jclass int_box_class = env->FindClass("java/lang/Integer");
|
||||
if(int_box_class == NULL) return;
|
||||
|
||||
jmethodID int_extractor = env->GetMethodID(int_box_class,"intValue", "()I");
|
||||
if(int_extractor == NULL) return;
|
||||
|
||||
jint value = env->CallIntMethod(boxed_value,int_extractor);
|
||||
if(env->ExceptionCheck()) return;
|
||||
|
||||
if(value < 0)
|
||||
{
|
||||
throw_config_value_exception(env,field_name,"cannot be set to a negative value");
|
||||
return;
|
||||
}
|
||||
|
||||
(bwa->*setter)(value);
|
||||
|
||||
env->DeleteLocalRef(int_box_class);
|
||||
}
|
||||
|
||||
env->DeleteLocalRef(boxed_value);
|
||||
env->DeleteLocalRef(configuration_class);
|
||||
}
|
||||
|
||||
static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter)
|
||||
{
|
||||
jclass configuration_class = env->GetObjectClass(configuration);
|
||||
if(configuration_class == NULL) return;
|
||||
|
||||
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Float;");
|
||||
if(configuration_field == NULL) return;
|
||||
|
||||
jobject boxed_value = env->GetObjectField(configuration,configuration_field);
|
||||
if(boxed_value != NULL) {
|
||||
jclass float_box_class = env->FindClass("java/lang/Float");
|
||||
if(float_box_class == NULL) return;
|
||||
|
||||
jmethodID float_extractor = env->GetMethodID(float_box_class,"floatValue", "()F");
|
||||
if(float_extractor == NULL) return;
|
||||
|
||||
jfloat value = env->CallFloatMethod(boxed_value,float_extractor);
|
||||
if(env->ExceptionCheck()) return;
|
||||
|
||||
if(value < 0)
|
||||
{
|
||||
throw_config_value_exception(env,field_name,"cannot be set to a negative value");
|
||||
return;
|
||||
}
|
||||
|
||||
(bwa->*setter)(value);
|
||||
|
||||
env->DeleteLocalRef(float_box_class);
|
||||
}
|
||||
|
||||
env->DeleteLocalRef(boxed_value);
|
||||
env->DeleteLocalRef(configuration_class);
|
||||
}
|
||||
|
||||
static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message)
|
||||
{
|
||||
char* buffer = new char[strlen(field_name)+1+strlen(message)+1];
|
||||
sprintf(buffer,"%s %s",field_name,message);
|
||||
jclass sting_exception_class = env->FindClass("org/broadinstitute/sting/utils/StingException");
|
||||
if(sting_exception_class == NULL) return;
|
||||
env->ThrowNew(sting_exception_class, buffer);
|
||||
delete[] buffer;
|
||||
}
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
/* DO NOT EDIT THIS FILE - it is machine generated */
|
||||
#include <jni.h>
|
||||
/* Header for class org_broadinstitute_sting_alignment_bwa_c_BWACAligner */
|
||||
|
||||
#ifndef _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||
#define _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
/*
|
||||
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||
* Method: create
|
||||
* Signature: (Lorg/broadinstitute/sting/alignment/bwa/BWTFiles;Lorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)J
|
||||
*/
|
||||
JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create
|
||||
(JNIEnv *, jobject, jobject, jobject);
|
||||
|
||||
/*
|
||||
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||
* Method: updateConfiguration
|
||||
* Signature: (JLorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)V
|
||||
*/
|
||||
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration
|
||||
(JNIEnv *, jobject, jlong, jobject);
|
||||
|
||||
/*
|
||||
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||
* Method: destroy
|
||||
* Signature: (J)V
|
||||
*/
|
||||
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy
|
||||
(JNIEnv *, jobject, jlong);
|
||||
|
||||
/*
|
||||
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||
* Method: getPaths
|
||||
* Signature: (J[B)[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;
|
||||
*/
|
||||
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths
|
||||
(JNIEnv *, jobject, jlong, jbyteArray);
|
||||
|
||||
/*
|
||||
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||
* Method: convertPathsToAlignments
|
||||
* Signature: (J[B[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;)[Lorg/broadinstitute/sting/alignment/Alignment;
|
||||
*/
|
||||
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments
|
||||
(JNIEnv *, jobject, jlong, jbyteArray, jobjectArray);
|
||||
|
||||
/*
|
||||
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||
* Method: getBestAlignment
|
||||
* Signature: (J[B)Lorg/broadinstitute/sting/alignment/Alignment;
|
||||
*/
|
||||
JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment
|
||||
(JNIEnv *, jobject, jlong, jbyteArray);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
CC=gcc
|
||||
CCFLAGS=-Wall -dynamiclib -arch i386 -arch x86_64
|
||||
|
||||
libenvironhack.dylib: libenvironhack.c
|
||||
$(CC) $(CCFLAGS) -init _init_environ $< -o $@
|
||||
|
||||
all: libenvironhack.dylib
|
||||
|
||||
clean:
|
||||
rm -f libenvironhack.dylib
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
LSF 7.0.6 on the mac is missing the unsatisfied exported symbol for environ which was removed on MacOS X 10.5+.
|
||||
nm $LSF_LIBDIR/liblsf.dylib | grep environ
|
||||
See "man environ" for more info, along with http://lists.apple.com/archives/java-dev/2007/Dec/msg00096.html
|
||||
*/
|
||||
|
||||
#include <crt_externs.h>
|
||||
|
||||
char **environ = (char **)0;
|
||||
|
||||
void init_environ(void) {
|
||||
environ = (*_NSGetEnviron());
|
||||
}
|
||||
Binary file not shown.
|
|
@ -0,0 +1,49 @@
|
|||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Create perfect alignments from the read to the genome represented by the given BWT / suffix array.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public interface Aligner {
|
||||
/**
|
||||
* Close this instance of the BWA pointer and delete its resources.
|
||||
*/
|
||||
public void close();
|
||||
|
||||
/**
|
||||
* Allow the aligner to choose one alignment randomly from the pile of best alignments.
|
||||
* @param bases Bases to align.
|
||||
* @return An align
|
||||
*/
|
||||
public Alignment getBestAlignment(final byte[] bases);
|
||||
|
||||
/**
|
||||
* Align the read to the reference.
|
||||
* @param read Read to align.
|
||||
* @param header Optional header to drop in place.
|
||||
* @return A list of the alignments.
|
||||
*/
|
||||
public SAMRecord align(final SAMRecord read, final SAMFileHeader header);
|
||||
|
||||
/**
|
||||
* Get a iterator of alignments, batched by mapping quality.
|
||||
* @param bases List of bases.
|
||||
* @return Iterator to alignments.
|
||||
*/
|
||||
public Iterable<Alignment[]> getAllAlignments(final byte[] bases);
|
||||
|
||||
/**
|
||||
* Get a iterator of aligned reads, batched by mapping quality.
|
||||
* @param read Read to align.
|
||||
* @param newHeader Optional new header to use when aligning the read. If present, it must be null.
|
||||
* @return Iterator to alignments.
|
||||
*/
|
||||
public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,221 @@
|
|||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
* Represents an alignment of a read to a site in the reference genome.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class Alignment {
|
||||
protected int contigIndex;
|
||||
protected long alignmentStart;
|
||||
protected boolean negativeStrand;
|
||||
protected int mappingQuality;
|
||||
|
||||
protected char[] cigarOperators;
|
||||
protected int[] cigarLengths;
|
||||
|
||||
protected int editDistance;
|
||||
protected String mismatchingPositions;
|
||||
|
||||
protected int numMismatches;
|
||||
protected int numGapOpens;
|
||||
protected int numGapExtensions;
|
||||
protected int bestCount;
|
||||
protected int secondBestCount;
|
||||
|
||||
/**
|
||||
* Gets the index of the given contig.
|
||||
* @return the inde
|
||||
*/
|
||||
public int getContigIndex() { return contigIndex; }
|
||||
|
||||
/**
|
||||
* Gets the starting position for the given alignment.
|
||||
* @return Starting position.
|
||||
*/
|
||||
public long getAlignmentStart() { return alignmentStart; }
|
||||
|
||||
/**
|
||||
* Is the given alignment on the reverse strand?
|
||||
* @return True if the alignment is on the reverse strand.
|
||||
*/
|
||||
public boolean isNegativeStrand() { return negativeStrand; }
|
||||
|
||||
/**
|
||||
* Gets the score of this alignment.
|
||||
* @return The score.
|
||||
*/
|
||||
public int getMappingQuality() { return mappingQuality; }
|
||||
|
||||
/**
|
||||
* Gets the edit distance; will eventually end up in the NM SAM tag
|
||||
* if this alignment makes it that far.
|
||||
* @return The edit distance.
|
||||
*/
|
||||
public int getEditDistance() { return editDistance; }
|
||||
|
||||
/**
|
||||
* A string representation of which positions mismatch; contents of MD tag.
|
||||
* @return String representation of mismatching positions.
|
||||
*/
|
||||
public String getMismatchingPositions() { return mismatchingPositions; }
|
||||
|
||||
/**
|
||||
* Gets the number of mismatches in the read.
|
||||
* @return Number of mismatches.
|
||||
*/
|
||||
public int getNumMismatches() { return numMismatches; }
|
||||
|
||||
/**
|
||||
* Get the number of gap opens.
|
||||
* @return Number of gap opens.
|
||||
*/
|
||||
public int getNumGapOpens() { return numGapOpens; }
|
||||
|
||||
/**
|
||||
* Get the number of gap extensions.
|
||||
* @return Number of gap extensions.
|
||||
*/
|
||||
public int getNumGapExtensions() { return numGapExtensions; }
|
||||
|
||||
/**
|
||||
* Get the number of best alignments.
|
||||
* @return Number of top scoring alignments.
|
||||
*/
|
||||
public int getBestCount() { return bestCount; }
|
||||
|
||||
/**
|
||||
* Get the number of second best alignments.
|
||||
* @return Number of second best scoring alignments.
|
||||
*/
|
||||
public int getSecondBestCount() { return secondBestCount; }
|
||||
|
||||
/**
|
||||
* Gets the cigar for this alignment.
|
||||
* @return sam-jdk formatted alignment.
|
||||
*/
|
||||
public Cigar getCigar() {
|
||||
Cigar cigar = new Cigar();
|
||||
for(int i = 0; i < cigarOperators.length; i++) {
|
||||
CigarOperator operator = CigarOperator.characterToEnum(cigarOperators[i]);
|
||||
cigar.add(new CigarElement(cigarLengths[i],operator));
|
||||
}
|
||||
return cigar;
|
||||
}
|
||||
|
||||
/**
|
||||
* Temporarily implement getCigarString() for debugging; the TextCigarCodec is unfortunately
|
||||
* package-protected.
|
||||
* @return
|
||||
*/
|
||||
public String getCigarString() {
|
||||
Cigar cigar = getCigar();
|
||||
if(cigar.isEmpty()) return "*";
|
||||
|
||||
StringBuilder cigarString = new StringBuilder();
|
||||
for(CigarElement element: cigar.getCigarElements()) {
|
||||
cigarString.append(element.getLength());
|
||||
cigarString.append(element.getOperator());
|
||||
}
|
||||
return cigarString.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Stub for inheritance.
|
||||
*/
|
||||
public Alignment() {}
|
||||
|
||||
/**
|
||||
* Create a new alignment object.
|
||||
* @param contigIndex The contig to which this read aligned.
|
||||
* @param alignmentStart The point within the contig to which this read aligned.
|
||||
* @param negativeStrand Forward or reverse alignment of the given read.
|
||||
* @param mappingQuality How good does BWA think this mapping is?
|
||||
* @param cigarOperators The ordered operators in the cigar string.
|
||||
* @param cigarLengths The lengths to which each operator applies.
|
||||
* @param editDistance The edit distance (cumulative) of the read.
|
||||
* @param mismatchingPositions String representation of which bases in the read mismatch.
|
||||
* @param numMismatches Number of total mismatches in the read.
|
||||
* @param numGapOpens Number of gap opens in the read.
|
||||
* @param numGapExtensions Number of gap extensions in the read.
|
||||
* @param bestCount Number of best alignments in the read.
|
||||
* @param secondBestCount Number of second best alignments in the read.
|
||||
*/
|
||||
public Alignment(int contigIndex,
|
||||
int alignmentStart,
|
||||
boolean negativeStrand,
|
||||
int mappingQuality,
|
||||
char[] cigarOperators,
|
||||
int[] cigarLengths,
|
||||
int editDistance,
|
||||
String mismatchingPositions,
|
||||
int numMismatches,
|
||||
int numGapOpens,
|
||||
int numGapExtensions,
|
||||
int bestCount,
|
||||
int secondBestCount) {
|
||||
this.contigIndex = contigIndex;
|
||||
this.alignmentStart = alignmentStart;
|
||||
this.negativeStrand = negativeStrand;
|
||||
this.mappingQuality = mappingQuality;
|
||||
this.cigarOperators = cigarOperators;
|
||||
this.cigarLengths = cigarLengths;
|
||||
this.editDistance = editDistance;
|
||||
this.mismatchingPositions = mismatchingPositions;
|
||||
this.numMismatches = numMismatches;
|
||||
this.numGapOpens = numGapOpens;
|
||||
this.numGapExtensions = numGapExtensions;
|
||||
this.bestCount = bestCount;
|
||||
this.secondBestCount = secondBestCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a read directly from an alignment.
|
||||
* @param alignment The alignment to convert to a read.
|
||||
* @param unmappedRead Source of the unmapped read. Should have bases, quality scores, and flags.
|
||||
* @param newSAMHeader The new SAM header to use in creating this read. Can be null, but if so, the sequence
|
||||
* dictionary in the
|
||||
* @return A mapped alignment.
|
||||
*/
|
||||
public static SAMRecord convertToRead(Alignment alignment, SAMRecord unmappedRead, SAMFileHeader newSAMHeader) {
|
||||
SAMRecord read;
|
||||
try {
|
||||
read = (SAMRecord)unmappedRead.clone();
|
||||
}
|
||||
catch(CloneNotSupportedException ex) {
|
||||
throw new ReviewedStingException("Unable to create aligned read from template.");
|
||||
}
|
||||
|
||||
if(newSAMHeader != null)
|
||||
read.setHeader(newSAMHeader);
|
||||
|
||||
// If we're realigning a previously aligned record, strip out the placement of the alignment.
|
||||
read.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
|
||||
read.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
|
||||
read.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
|
||||
read.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
|
||||
|
||||
if(alignment != null) {
|
||||
read.setReadUnmappedFlag(false);
|
||||
read.setReferenceIndex(alignment.getContigIndex());
|
||||
read.setAlignmentStart((int)alignment.getAlignmentStart());
|
||||
read.setReadNegativeStrandFlag(alignment.isNegativeStrand());
|
||||
read.setMappingQuality(alignment.getMappingQuality());
|
||||
read.setCigar(alignment.getCigar());
|
||||
if(alignment.isNegativeStrand()) {
|
||||
read.setReadBases(BaseUtils.simpleReverseComplement(read.getReadBases()));
|
||||
read.setBaseQualities(Utils.reverse(read.getBaseQualities()));
|
||||
}
|
||||
read.setAttribute("NM",alignment.getEditDistance());
|
||||
read.setAttribute("MD",alignment.getMismatchingPositions());
|
||||
}
|
||||
|
||||
return read;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them
|
||||
* of their alignment data, realigning them, and making sure one of the best resulting realignments matches the original
|
||||
* alignment from the input file.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class AlignmentValidationWalker extends ReadWalker<Integer,Integer> {
|
||||
/**
|
||||
* The supporting BWT index generated using BWT.
|
||||
*/
|
||||
@Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
|
||||
private String prefix = null;
|
||||
|
||||
/**
|
||||
* The instance used to generate alignments.
|
||||
*/
|
||||
private BWACAligner aligner = null;
|
||||
|
||||
/**
|
||||
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||
*/
|
||||
@Override
|
||||
public void initialize() {
|
||||
if(prefix == null)
|
||||
prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
|
||||
BWTFiles bwtFiles = new BWTFiles(prefix);
|
||||
BWAConfiguration configuration = new BWAConfiguration();
|
||||
aligner = new BWACAligner(bwtFiles,configuration);
|
||||
}
|
||||
|
||||
/**
|
||||
* Aligns a read to the given reference.
|
||||
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||
* @param read Read to align.
|
||||
* @return Number of reads aligned by this map (aka 1).
|
||||
*/
|
||||
@Override
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
//logger.info(String.format("examining read %s", read.getReadName()));
|
||||
|
||||
byte[] bases = read.getReadBases();
|
||||
if(read.getReadNegativeStrandFlag()) bases = BaseUtils.simpleReverseComplement(bases);
|
||||
|
||||
boolean matches = true;
|
||||
Iterable<Alignment[]> alignments = aligner.getAllAlignments(bases);
|
||||
Iterator<Alignment[]> alignmentIterator = alignments.iterator();
|
||||
|
||||
if(!alignmentIterator.hasNext()) {
|
||||
matches = read.getReadUnmappedFlag();
|
||||
}
|
||||
else {
|
||||
Alignment[] alignmentsOfBestQuality = alignmentIterator.next();
|
||||
for(Alignment alignment: alignmentsOfBestQuality) {
|
||||
matches = (alignment.getContigIndex() == read.getReferenceIndex());
|
||||
matches &= (alignment.getAlignmentStart() == read.getAlignmentStart());
|
||||
matches &= (alignment.isNegativeStrand() == read.getReadNegativeStrandFlag());
|
||||
matches &= (alignment.getCigar().equals(read.getCigar()));
|
||||
matches &= (alignment.getMappingQuality() == read.getMappingQuality());
|
||||
if(matches) break;
|
||||
}
|
||||
}
|
||||
|
||||
if(!matches) {
|
||||
logger.error("Found mismatch!");
|
||||
logger.error(String.format("Read %s:",read.getReadName()));
|
||||
logger.error(String.format(" Contig index: %d",read.getReferenceIndex()));
|
||||
logger.error(String.format(" Alignment start: %d", read.getAlignmentStart()));
|
||||
logger.error(String.format(" Negative strand: %b", read.getReadNegativeStrandFlag()));
|
||||
logger.error(String.format(" Cigar: %s%n", read.getCigarString()));
|
||||
logger.error(String.format(" Mapping quality: %s%n", read.getMappingQuality()));
|
||||
for(Alignment[] alignmentsByScore: alignments) {
|
||||
for(int i = 0; i < alignmentsByScore.length; i++) {
|
||||
logger.error(String.format("Alignment %d:",i));
|
||||
logger.error(String.format(" Contig index: %d",alignmentsByScore[i].getContigIndex()));
|
||||
logger.error(String.format(" Alignment start: %d", alignmentsByScore[i].getAlignmentStart()));
|
||||
logger.error(String.format(" Negative strand: %b", alignmentsByScore[i].isNegativeStrand()));
|
||||
logger.error(String.format(" Cigar: %s", alignmentsByScore[i].getCigarString()));
|
||||
logger.error(String.format(" Mapping quality: %s%n", alignmentsByScore[i].getMappingQuality()));
|
||||
}
|
||||
}
|
||||
throw new ReviewedStingException(String.format("Read %s mismatches!", read.getReadName()));
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initial value for reduce. In this case, validated reads will be counted.
|
||||
* @return 0, indicating no reads yet validated.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
/**
|
||||
* Calculates the number of reads processed.
|
||||
* @param value Number of reads processed by this map.
|
||||
* @param sum Number of reads processed before this map.
|
||||
* @return Number of reads processed up to and including this map.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup.
|
||||
* @param result Number of reads processed.
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
aligner.close();
|
||||
super.onTraversalDone(result);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,134 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format.
|
||||
* Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
@WalkerName("Align")
|
||||
public class AlignmentWalker extends ReadWalker<Integer,Integer> {
|
||||
@Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
|
||||
"generated by bwa index -d bwtsw. If unspecified, will default " +
|
||||
"to the reference specified via the -R argument.",required=false)
|
||||
private File targetReferenceFile = null;
|
||||
|
||||
@Output
|
||||
private StingSAMFileWriter out = null;
|
||||
|
||||
/**
|
||||
* The actual aligner.
|
||||
*/
|
||||
private BWACAligner aligner = null;
|
||||
|
||||
/**
|
||||
* New header to use, if desired.
|
||||
*/
|
||||
private SAMFileHeader header;
|
||||
|
||||
/**
|
||||
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||
*/
|
||||
@Override
|
||||
public void initialize() {
|
||||
if(targetReferenceFile == null)
|
||||
targetReferenceFile = getToolkit().getArguments().referenceFile;
|
||||
BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
|
||||
BWAConfiguration configuration = new BWAConfiguration();
|
||||
aligner = new BWACAligner(bwtFiles,configuration);
|
||||
|
||||
// Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted.
|
||||
header = getToolkit().getSAMFileHeader().clone();
|
||||
SAMSequenceDictionary referenceDictionary =
|
||||
ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
|
||||
header.setSequenceDictionary(referenceDictionary);
|
||||
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
|
||||
out.writeHeader(header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Aligns a read to the given reference.
|
||||
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||
* @param read Read to align.
|
||||
* @return Number of alignments found for this read.
|
||||
*/
|
||||
@Override
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
SAMRecord alignedRead = aligner.align(read,header);
|
||||
out.addAlignment(alignedRead);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initial value for reduce. In this case, alignments will be counted.
|
||||
* @return 0, indicating no alignments yet found.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
/**
|
||||
* Calculates the number of alignments found.
|
||||
* @param value Number of alignments found by this map.
|
||||
* @param sum Number of alignments found before this map.
|
||||
* @return Number of alignments found up to and including this map.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup.
|
||||
* @param result Number of reads processed.
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
aligner.close();
|
||||
super.onTraversalDone(result);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the
|
||||
* frequency of that number of placements.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class CountBestAlignmentsWalker extends ReadWalker<Integer,Integer> {
|
||||
/**
|
||||
* The supporting BWT index generated using BWT.
|
||||
*/
|
||||
@Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
|
||||
private String prefix = null;
|
||||
|
||||
@Output
|
||||
private PrintStream out = null;
|
||||
|
||||
/**
|
||||
* The actual aligner.
|
||||
*/
|
||||
private Aligner aligner = null;
|
||||
|
||||
private SortedMap<Integer,Integer> alignmentFrequencies = new TreeMap<Integer,Integer>();
|
||||
|
||||
/**
|
||||
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||
*/
|
||||
@Override
|
||||
public void initialize() {
|
||||
if(prefix == null)
|
||||
prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
|
||||
BWTFiles bwtFiles = new BWTFiles(prefix);
|
||||
BWAConfiguration configuration = new BWAConfiguration();
|
||||
aligner = new BWACAligner(bwtFiles,configuration);
|
||||
}
|
||||
|
||||
/**
|
||||
* Aligns a read to the given reference.
|
||||
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||
* @param read Read to align.
|
||||
* @return Number of alignments found for this read.
|
||||
*/
|
||||
@Override
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
|
||||
if(alignmentIterator.hasNext()) {
|
||||
int numAlignments = alignmentIterator.next().length;
|
||||
if(alignmentFrequencies.containsKey(numAlignments))
|
||||
alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
|
||||
else
|
||||
alignmentFrequencies.put(numAlignments,1);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initial value for reduce. In this case, validated reads will be counted.
|
||||
* @return 0, indicating no reads yet validated.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
/**
|
||||
* Calculates the number of reads processed.
|
||||
* @param value Number of reads processed by this map.
|
||||
* @param sum Number of reads processed before this map.
|
||||
* @return Number of reads processed up to and including this map.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup.
|
||||
* @param result Number of reads processed.
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
aligner.close();
|
||||
for(Map.Entry<Integer,Integer> alignmentFrequency: alignmentFrequencies.entrySet())
|
||||
out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
|
||||
super.onTraversalDone(result);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
package org.broadinstitute.sting.alignment.bwa;
|
||||
|
||||
import org.broadinstitute.sting.alignment.Aligner;
|
||||
|
||||
/**
|
||||
* Align reads using BWA.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public abstract class BWAAligner implements Aligner {
|
||||
/**
|
||||
* The supporting files used by BWA.
|
||||
*/
|
||||
protected BWTFiles bwtFiles;
|
||||
|
||||
/**
|
||||
* The current configuration for the BWA aligner.
|
||||
*/
|
||||
protected BWAConfiguration configuration;
|
||||
|
||||
/**
|
||||
* Create a new BWAAligner. Purpose of this call is to ensure that all BWA constructors accept the correct
|
||||
* parameters.
|
||||
* @param bwtFiles The many files representing BWTs persisted to disk.
|
||||
* @param configuration Configuration parameters for the alignment.
|
||||
*/
|
||||
public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
|
||||
this.bwtFiles = bwtFiles;
|
||||
this.configuration = configuration;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the configuration passed to the BWA aligner.
|
||||
* @param configuration New configuration to set.
|
||||
*/
|
||||
public abstract void updateConfiguration(BWAConfiguration configuration);
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
package org.broadinstitute.sting.alignment.bwa;
|
||||
|
||||
/**
|
||||
* Configuration for the BWA/C aligner.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWAConfiguration {
|
||||
/**
|
||||
* The maximum edit distance used by BWA.
|
||||
*/
|
||||
public Float maximumEditDistance = null;
|
||||
|
||||
/**
|
||||
* How many gap opens are acceptable within this alignment?
|
||||
*/
|
||||
public Integer maximumGapOpens = null;
|
||||
|
||||
/**
|
||||
* How many gap extensions are acceptable within this alignment?
|
||||
*/
|
||||
public Integer maximumGapExtensions = null;
|
||||
|
||||
/**
|
||||
* Do we disallow indels within a certain range from the start / end?
|
||||
*/
|
||||
public Integer disallowIndelWithinRange = null;
|
||||
|
||||
/**
|
||||
* What is the scoring penalty for a mismatch?
|
||||
*/
|
||||
public Integer mismatchPenalty = null;
|
||||
|
||||
/**
|
||||
* What is the scoring penalty for a gap open?
|
||||
*/
|
||||
public Integer gapOpenPenalty = null;
|
||||
|
||||
/**
|
||||
* What is the scoring penalty for a gap extension?
|
||||
*/
|
||||
public Integer gapExtensionPenalty = null;
|
||||
}
|
||||
|
|
@ -0,0 +1,234 @@
|
|||
package org.broadinstitute.sting.alignment.bwa;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.alignment.reference.bwt.*;
|
||||
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Support files for BWT.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWTFiles {
|
||||
/**
|
||||
* ANN (?) file name.
|
||||
*/
|
||||
public final File annFile;
|
||||
|
||||
/**
|
||||
* AMB (?) file name.
|
||||
*/
|
||||
public final File ambFile;
|
||||
|
||||
/**
|
||||
* Packed reference sequence file.
|
||||
*/
|
||||
public final File pacFile;
|
||||
|
||||
/**
|
||||
* Reverse of packed reference sequence file.
|
||||
*/
|
||||
public final File rpacFile;
|
||||
|
||||
/**
|
||||
* Forward BWT file.
|
||||
*/
|
||||
public final File forwardBWTFile;
|
||||
|
||||
/**
|
||||
* Forward suffix array file.
|
||||
*/
|
||||
public final File forwardSAFile;
|
||||
|
||||
/**
|
||||
* Reverse BWT file.
|
||||
*/
|
||||
public final File reverseBWTFile;
|
||||
|
||||
/**
|
||||
* Reverse suffix array file.
|
||||
*/
|
||||
public final File reverseSAFile;
|
||||
|
||||
/**
|
||||
* Where these files autogenerated on the fly?
|
||||
*/
|
||||
public final boolean autogenerated;
|
||||
|
||||
/**
|
||||
* Create a new BWA configuration file using the given prefix.
|
||||
* @param prefix Prefix to use when creating the configuration. Must not be null.
|
||||
*/
|
||||
public BWTFiles(String prefix) {
|
||||
if(prefix == null)
|
||||
throw new ReviewedStingException("Prefix must not be null.");
|
||||
annFile = new File(prefix + ".ann");
|
||||
ambFile = new File(prefix + ".amb");
|
||||
pacFile = new File(prefix + ".pac");
|
||||
rpacFile = new File(prefix + ".rpac");
|
||||
forwardBWTFile = new File(prefix + ".bwt");
|
||||
forwardSAFile = new File(prefix + ".sa");
|
||||
reverseBWTFile = new File(prefix + ".rbwt");
|
||||
reverseSAFile = new File(prefix + ".rsa");
|
||||
autogenerated = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Hand-create a new BWTFiles object, specifying a unique file object for each type.
|
||||
* @param annFile ANN (alternate dictionary) file.
|
||||
* @param ambFile AMB (holes) files.
|
||||
* @param pacFile Packed representation of the forward reference sequence.
|
||||
* @param forwardBWTFile BWT representation of the forward reference sequence.
|
||||
* @param forwardSAFile SA representation of the forward reference sequence.
|
||||
* @param rpacFile Packed representation of the reversed reference sequence.
|
||||
* @param reverseBWTFile BWT representation of the reversed reference sequence.
|
||||
* @param reverseSAFile SA representation of the reversed reference sequence.
|
||||
*/
|
||||
private BWTFiles(File annFile,
|
||||
File ambFile,
|
||||
File pacFile,
|
||||
File forwardBWTFile,
|
||||
File forwardSAFile,
|
||||
File rpacFile,
|
||||
File reverseBWTFile,
|
||||
File reverseSAFile) {
|
||||
this.annFile = annFile;
|
||||
this.ambFile = ambFile;
|
||||
this.pacFile = pacFile;
|
||||
this.forwardBWTFile = forwardBWTFile;
|
||||
this.forwardSAFile = forwardSAFile;
|
||||
this.rpacFile = rpacFile;
|
||||
this.reverseBWTFile = reverseBWTFile;
|
||||
this.reverseSAFile = reverseSAFile;
|
||||
autogenerated = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close out this files object, in the process deleting any temporary filse
|
||||
* that were created.
|
||||
*/
|
||||
public void close() {
|
||||
if(autogenerated) {
|
||||
boolean success = true;
|
||||
success = annFile.delete();
|
||||
success &= ambFile.delete();
|
||||
success &= pacFile.delete();
|
||||
success &= forwardBWTFile.delete();
|
||||
success &= forwardSAFile.delete();
|
||||
success &= rpacFile.delete();
|
||||
success &= reverseBWTFile.delete();
|
||||
success &= reverseSAFile.delete();
|
||||
|
||||
if(!success)
|
||||
throw new ReviewedStingException("Unable to clean up autogenerated representation");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new set of BWT files from the given reference sequence.
|
||||
* @param referenceSequence Sequence from which to build metadata.
|
||||
* @return A new object representing encoded representations of each sequence.
|
||||
*/
|
||||
public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) {
|
||||
byte[] normalizedReferenceSequence = new byte[referenceSequence.length];
|
||||
System.arraycopy(referenceSequence,0,normalizedReferenceSequence,0,referenceSequence.length);
|
||||
normalizeReferenceSequence(normalizedReferenceSequence);
|
||||
|
||||
File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile;
|
||||
try {
|
||||
// Write the ann and amb for this reference sequence.
|
||||
annFile = File.createTempFile("bwt",".ann");
|
||||
ambFile = File.createTempFile("bwt",".amb");
|
||||
|
||||
SAMSequenceDictionary dictionary = new SAMSequenceDictionary();
|
||||
dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length));
|
||||
|
||||
ANNWriter annWriter = new ANNWriter(annFile);
|
||||
annWriter.write(dictionary);
|
||||
annWriter.close();
|
||||
|
||||
AMBWriter ambWriter = new AMBWriter(ambFile);
|
||||
ambWriter.writeEmpty(dictionary);
|
||||
ambWriter.close();
|
||||
|
||||
// Write the encoded files for the forward version of this reference sequence.
|
||||
pacFile = File.createTempFile("bwt",".pac");
|
||||
bwtFile = File.createTempFile("bwt",".bwt");
|
||||
saFile = File.createTempFile("bwt",".sa");
|
||||
|
||||
writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile);
|
||||
|
||||
// Write the encoded files for the reverse version of this reference sequence.
|
||||
byte[] reverseReferenceSequence = Utils.reverse(normalizedReferenceSequence);
|
||||
|
||||
rpacFile = File.createTempFile("bwt",".rpac");
|
||||
rbwtFile = File.createTempFile("bwt",".rbwt");
|
||||
rsaFile = File.createTempFile("bwt",".rsa");
|
||||
|
||||
writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to write autogenerated reference sequence to temporary files");
|
||||
}
|
||||
|
||||
// Make sure that, at the very least, all temporary files are deleted on exit.
|
||||
annFile.deleteOnExit();
|
||||
ambFile.deleteOnExit();
|
||||
pacFile.deleteOnExit();
|
||||
bwtFile.deleteOnExit();
|
||||
saFile.deleteOnExit();
|
||||
rpacFile.deleteOnExit();
|
||||
rbwtFile.deleteOnExit();
|
||||
rsaFile.deleteOnExit();
|
||||
|
||||
return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the encoded form of the reference sequence. In the case of BWA, the encoded reference
|
||||
* sequence is the reference itself in PAC format, the BWT, and the suffix array.
|
||||
* @param referenceSequence The reference sequence to encode.
|
||||
* @param pacFile Target for the PAC-encoded reference.
|
||||
* @param bwtFile Target for the BWT representation of the reference.
|
||||
* @param suffixArrayFile Target for the suffix array encoding of the reference.
|
||||
* @throws java.io.IOException In case of issues writing to the file.
|
||||
*/
|
||||
private static void writeEncodedReferenceSequence(byte[] referenceSequence,
|
||||
File pacFile,
|
||||
File bwtFile,
|
||||
File suffixArrayFile) throws IOException {
|
||||
PackUtils.writeReferenceSequence(pacFile,referenceSequence);
|
||||
|
||||
BWT bwt = BWT.createFromReferenceSequence(referenceSequence);
|
||||
BWTWriter bwtWriter = new BWTWriter(bwtFile);
|
||||
bwtWriter.write(bwt);
|
||||
bwtWriter.close();
|
||||
|
||||
SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
|
||||
SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile);
|
||||
suffixArrayWriter.write(suffixArray);
|
||||
suffixArrayWriter.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the given reference sequence into a form suitable for building into
|
||||
* on-the-fly sequences.
|
||||
* @param referenceSequence The reference sequence to normalize.
|
||||
* @throws org.broadinstitute.sting.utils.exceptions.ReviewedStingException if normalized sequence cannot be generated.
|
||||
*/
|
||||
private static void normalizeReferenceSequence(byte[] referenceSequence) {
|
||||
StringUtil.toUpperCase(referenceSequence);
|
||||
for(byte base: referenceSequence) {
|
||||
if(base != 'A' && base != 'C' && base != 'G' && base != 'T')
|
||||
throw new ReviewedStingException(String.format("Base type %c is not supported when building references on-the-fly",(char)base));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,259 @@
|
|||
package org.broadinstitute.sting.alignment.bwa.c;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.alignment.Alignment;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAAligner;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* An aligner using the BWA/C implementation.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWACAligner extends BWAAligner {
|
||||
static {
|
||||
System.loadLibrary("bwa");
|
||||
}
|
||||
|
||||
/**
|
||||
* A pointer to the C++ object representing the BWA engine.
|
||||
*/
|
||||
private long thunkPointer = 0;
|
||||
|
||||
public BWACAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
|
||||
super(bwtFiles,configuration);
|
||||
if(thunkPointer != 0)
|
||||
throw new ReviewedStingException("BWA/C attempting to reinitialize.");
|
||||
|
||||
if(!bwtFiles.annFile.exists()) throw new ReviewedStingException("ANN file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||
if(!bwtFiles.ambFile.exists()) throw new ReviewedStingException("AMB file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||
if(!bwtFiles.pacFile.exists()) throw new ReviewedStingException("PAC file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||
if(!bwtFiles.forwardBWTFile.exists()) throw new ReviewedStingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||
if(!bwtFiles.forwardSAFile.exists()) throw new ReviewedStingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||
if(!bwtFiles.reverseBWTFile.exists()) throw new ReviewedStingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||
if(!bwtFiles.reverseSAFile.exists()) throw new ReviewedStingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||
|
||||
thunkPointer = create(bwtFiles,configuration);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an aligner object using an array of bytes as a reference.
|
||||
* @param referenceSequence Reference sequence to encode ad-hoc.
|
||||
* @param configuration Configuration for the given aligner.
|
||||
*/
|
||||
public BWACAligner(byte[] referenceSequence, BWAConfiguration configuration) {
|
||||
this(BWTFiles.createFromReferenceSequence(referenceSequence),configuration);
|
||||
// Now that the temporary files are created, the temporary files can be destroyed.
|
||||
bwtFiles.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the configuration passed to the BWA aligner.
|
||||
* @param configuration New configuration to set.
|
||||
*/
|
||||
@Override
|
||||
public void updateConfiguration(BWAConfiguration configuration) {
|
||||
if(thunkPointer == 0)
|
||||
throw new ReviewedStingException("BWA/C: attempting to update configuration of uninitialized aligner.");
|
||||
updateConfiguration(thunkPointer,configuration);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close this instance of the BWA pointer and delete its resources.
|
||||
*/
|
||||
@Override
|
||||
public void close() {
|
||||
if(thunkPointer == 0)
|
||||
throw new ReviewedStingException("BWA/C close attempted, but BWA/C is not properly initialized.");
|
||||
destroy(thunkPointer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow the aligner to choose one alignment randomly from the pile of best alignments.
|
||||
* @param bases Bases to align.
|
||||
* @return An align
|
||||
*/
|
||||
@Override
|
||||
public Alignment getBestAlignment(final byte[] bases) {
|
||||
if(thunkPointer == 0)
|
||||
throw new ReviewedStingException("BWA/C getBestAlignment attempted, but BWA/C is not properly initialized.");
|
||||
return getBestAlignment(thunkPointer,bases);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the best aligned read, chosen randomly from the pile of best alignments.
|
||||
* @param read Read to align.
|
||||
* @param newHeader New header to apply to this SAM file. Can be null, but if so, read header must be valid.
|
||||
* @return Read with injected alignment data.
|
||||
*/
|
||||
@Override
|
||||
public SAMRecord align(final SAMRecord read, final SAMFileHeader newHeader) {
|
||||
if(bwtFiles.autogenerated)
|
||||
throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable");
|
||||
return Alignment.convertToRead(getBestAlignment(read.getReadBases()),read,newHeader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a iterator of alignments, batched by mapping quality.
|
||||
* @param bases List of bases.
|
||||
* @return Iterator to alignments.
|
||||
*/
|
||||
@Override
|
||||
public Iterable<Alignment[]> getAllAlignments(final byte[] bases) {
|
||||
final BWAPath[] paths = getPaths(bases);
|
||||
return new Iterable<Alignment[]>() {
|
||||
public Iterator<Alignment[]> iterator() {
|
||||
return new Iterator<Alignment[]>() {
|
||||
/**
|
||||
* The last position accessed.
|
||||
*/
|
||||
private int position = 0;
|
||||
|
||||
/**
|
||||
* Whether all alignments have been seen based on the current position.
|
||||
* @return True if any more alignments are pending. False otherwise.
|
||||
*/
|
||||
public boolean hasNext() { return position < paths.length; }
|
||||
|
||||
/**
|
||||
* Return the next cross-section of alignments, based on mapping quality.
|
||||
* @return Array of the next set of alignments of a given mapping quality.
|
||||
*/
|
||||
public Alignment[] next() {
|
||||
if(position >= paths.length)
|
||||
throw new UnsupportedOperationException("Out of alignments to return.");
|
||||
int score = paths[position].score;
|
||||
int startingPosition = position;
|
||||
while(position < paths.length && paths[position].score == score) position++;
|
||||
return convertPathsToAlignments(bases,Arrays.copyOfRange(paths,startingPosition,position));
|
||||
}
|
||||
|
||||
/**
|
||||
* Unsupported.
|
||||
*/
|
||||
public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); }
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a iterator of aligned reads, batched by mapping quality.
|
||||
* @param read Read to align.
|
||||
* @param newHeader Optional new header to use when aligning the read. If present, it must be null.
|
||||
* @return Iterator to alignments.
|
||||
*/
|
||||
@Override
|
||||
public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader) {
|
||||
if(bwtFiles.autogenerated)
|
||||
throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable");
|
||||
final Iterable<Alignment[]> alignments = getAllAlignments(read.getReadBases());
|
||||
return new Iterable<SAMRecord[]>() {
|
||||
public Iterator<SAMRecord[]> iterator() {
|
||||
final Iterator<Alignment[]> alignmentIterator = alignments.iterator();
|
||||
return new Iterator<SAMRecord[]>() {
|
||||
/**
|
||||
* Whether all alignments have been seen based on the current position.
|
||||
* @return True if any more alignments are pending. False otherwise.
|
||||
*/
|
||||
public boolean hasNext() { return alignmentIterator.hasNext(); }
|
||||
|
||||
/**
|
||||
* Return the next cross-section of alignments, based on mapping quality.
|
||||
* @return Array of the next set of alignments of a given mapping quality.
|
||||
*/
|
||||
public SAMRecord[] next() {
|
||||
Alignment[] alignmentsOfQuality = alignmentIterator.next();
|
||||
SAMRecord[] reads = new SAMRecord[alignmentsOfQuality.length];
|
||||
for(int i = 0; i < alignmentsOfQuality.length; i++) {
|
||||
reads[i] = Alignment.convertToRead(alignmentsOfQuality[i],read,newHeader);
|
||||
}
|
||||
return reads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Unsupported.
|
||||
*/
|
||||
public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); }
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the paths associated with the given base string.
|
||||
* @param bases List of bases.
|
||||
* @return A set of paths through the BWA.
|
||||
*/
|
||||
public BWAPath[] getPaths(byte[] bases) {
|
||||
if(thunkPointer == 0)
|
||||
throw new ReviewedStingException("BWA/C getPaths attempted, but BWA/C is not properly initialized.");
|
||||
return getPaths(thunkPointer,bases);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a pointer to the BWA/C thunk.
|
||||
* @param files BWT source files.
|
||||
* @param configuration Configuration of the aligner.
|
||||
* @return Pointer to the BWA/C thunk.
|
||||
*/
|
||||
protected native long create(BWTFiles files, BWAConfiguration configuration);
|
||||
|
||||
/**
|
||||
* Update the configuration passed to the BWA aligner. For internal use only.
|
||||
* @param thunkPointer pointer to BWA object.
|
||||
* @param configuration New configuration to set.
|
||||
*/
|
||||
protected native void updateConfiguration(long thunkPointer, BWAConfiguration configuration);
|
||||
|
||||
/**
|
||||
* Destroy the BWA/C thunk.
|
||||
* @param thunkPointer Pointer to the allocated thunk.
|
||||
*/
|
||||
protected native void destroy(long thunkPointer);
|
||||
|
||||
/**
|
||||
* Do the extra steps involved in converting a local alignment to a global alignment.
|
||||
* @param bases ASCII representation of byte array.
|
||||
* @param paths Paths through the current BWT.
|
||||
* @return A list of alignments.
|
||||
*/
|
||||
protected Alignment[] convertPathsToAlignments(byte[] bases, BWAPath[] paths) {
|
||||
if(thunkPointer == 0)
|
||||
throw new ReviewedStingException("BWA/C convertPathsToAlignments attempted, but BWA/C is not properly initialized.");
|
||||
return convertPathsToAlignments(thunkPointer,bases,paths);
|
||||
}
|
||||
|
||||
/**
|
||||
* Caller to the path generation functionality within BWA/C. Call this method's getPaths() wrapper (above) instead.
|
||||
* @param thunkPointer pointer to the C++ object managing BWA/C.
|
||||
* @param bases ASCII representation of byte array.
|
||||
* @return A list of paths through the specified BWT.
|
||||
*/
|
||||
protected native BWAPath[] getPaths(long thunkPointer, byte[] bases);
|
||||
|
||||
/**
|
||||
* Do the extra steps involved in converting a local alignment to a global alignment.
|
||||
* Call this method's convertPathsToAlignments() wrapper (above) instead.
|
||||
* @param thunkPointer pointer to the C++ object managing BWA/C.
|
||||
* @param bases ASCII representation of byte array.
|
||||
* @param paths Paths through the current BWT.
|
||||
* @return A list of alignments.
|
||||
*/
|
||||
protected native Alignment[] convertPathsToAlignments(long thunkPointer, byte[] bases, BWAPath[] paths);
|
||||
|
||||
/**
|
||||
* Gets the best alignment from BWA/C, randomly selected from all best-aligned reads.
|
||||
* @param thunkPointer Pointer to BWA thunk.
|
||||
* @param bases bases to align.
|
||||
* @return The best alignment from BWA/C.
|
||||
*/
|
||||
protected native Alignment getBestAlignment(long thunkPointer, byte[] bases);
|
||||
}
|
||||
|
|
@ -0,0 +1,101 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment.bwa.c;
|
||||
|
||||
/**
|
||||
* Models a BWA path.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWAPath {
|
||||
/**
|
||||
* Number of mismatches encountered along this path.
|
||||
*/
|
||||
public final int numMismatches;
|
||||
|
||||
/**
|
||||
* Number of gap opens encountered along this path.
|
||||
*/
|
||||
public final int numGapOpens;
|
||||
|
||||
/**
|
||||
* Number of gap extensions along this path.
|
||||
*/
|
||||
public final int numGapExtensions;
|
||||
|
||||
/**
|
||||
* Whether this alignment was found on the positive or negative strand.
|
||||
*/
|
||||
public final boolean negativeStrand;
|
||||
|
||||
/**
|
||||
* Starting coordinate in the BWT.
|
||||
*/
|
||||
public final long k;
|
||||
|
||||
/**
|
||||
* Ending coordinate in the BWT.
|
||||
*/
|
||||
public final long l;
|
||||
|
||||
/**
|
||||
* The score of this path.
|
||||
*/
|
||||
public final int score;
|
||||
|
||||
/**
|
||||
* The number of best alignments seen along this path.
|
||||
*/
|
||||
public final int bestCount;
|
||||
|
||||
/**
|
||||
* The number of second best alignments seen along this path.
|
||||
*/
|
||||
public final int secondBestCount;
|
||||
|
||||
/**
|
||||
* Create a new path with the given attributes.
|
||||
* @param numMismatches Number of mismatches along path.
|
||||
* @param numGapOpens Number of gap opens along path.
|
||||
* @param numGapExtensions Number of gap extensions along path.
|
||||
* @param k Index to first coordinate within BWT.
|
||||
* @param l Index to last coordinate within BWT.
|
||||
* @param score Score of this alignment. Not the mapping quality.
|
||||
*/
|
||||
public BWAPath(int numMismatches, int numGapOpens, int numGapExtensions, boolean negativeStrand, long k, long l, int score, int bestCount, int secondBestCount) {
|
||||
this.numMismatches = numMismatches;
|
||||
this.numGapOpens = numGapOpens;
|
||||
this.numGapExtensions = numGapExtensions;
|
||||
this.negativeStrand = negativeStrand;
|
||||
this.k = k;
|
||||
this.l = l;
|
||||
this.score = score;
|
||||
this.bestCount = bestCount;
|
||||
this.secondBestCount = secondBestCount;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,164 @@
|
|||
package org.broadinstitute.sting.alignment.bwa.java;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.alignment.Aligner;
|
||||
import org.broadinstitute.sting.alignment.Alignment;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* A test harness to ensure that the perfect aligner works.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class AlignerTestHarness {
|
||||
public static void main( String argv[] ) throws FileNotFoundException {
|
||||
if( argv.length != 6 ) {
|
||||
System.out.println("PerfectAlignerTestHarness <fasta> <bwt> <rbwt> <sa> <rsa> <bam>");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
File referenceFile = new File(argv[0]);
|
||||
File bwtFile = new File(argv[1]);
|
||||
File rbwtFile = new File(argv[2]);
|
||||
File suffixArrayFile = new File(argv[3]);
|
||||
File reverseSuffixArrayFile = new File(argv[4]);
|
||||
File bamFile = new File(argv[5]);
|
||||
|
||||
align(referenceFile,bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile,bamFile);
|
||||
}
|
||||
|
||||
private static void align(File referenceFile, File bwtFile, File rbwtFile, File suffixArrayFile, File reverseSuffixArrayFile, File bamFile) throws FileNotFoundException {
|
||||
Aligner aligner = new BWAJavaAligner(bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile);
|
||||
int count = 0;
|
||||
|
||||
SAMFileReader reader = new SAMFileReader(bamFile);
|
||||
reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
|
||||
|
||||
int mismatches = 0;
|
||||
int failures = 0;
|
||||
|
||||
for(SAMRecord read: reader) {
|
||||
count++;
|
||||
if( count > 200000 ) break;
|
||||
//if( count < 366000 ) continue;
|
||||
//if( count > 2 ) break;
|
||||
//if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") )
|
||||
// continue;
|
||||
//if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") )
|
||||
// continue;
|
||||
//if( !read.getReadName().endsWith("SL-XBC:1:60:1342:1340#0") )
|
||||
// continue;
|
||||
|
||||
SAMRecord alignmentCleaned = null;
|
||||
try {
|
||||
alignmentCleaned = (SAMRecord)read.clone();
|
||||
}
|
||||
catch( CloneNotSupportedException ex ) {
|
||||
throw new ReviewedStingException("SAMRecord clone not supported", ex);
|
||||
}
|
||||
|
||||
if( alignmentCleaned.getReadNegativeStrandFlag() )
|
||||
alignmentCleaned.setReadBases(BaseUtils.simpleReverseComplement(alignmentCleaned.getReadBases()));
|
||||
|
||||
alignmentCleaned.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
|
||||
alignmentCleaned.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
|
||||
alignmentCleaned.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
|
||||
alignmentCleaned.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
|
||||
|
||||
// Clear everything except flags pertaining to pairing and set 'unmapped' status to true.
|
||||
alignmentCleaned.setFlags(alignmentCleaned.getFlags() & 0x00A1 | 0x000C);
|
||||
|
||||
Iterable<Alignment[]> alignments = aligner.getAllAlignments(alignmentCleaned.getReadBases());
|
||||
if(!alignments.iterator().hasNext() ) {
|
||||
//throw new StingException(String.format("Unable to align read %s to reference; count = %d",read.getReadName(),count));
|
||||
System.out.printf("Unable to align read %s to reference; count = %d%n",read.getReadName(),count);
|
||||
failures++;
|
||||
}
|
||||
|
||||
Alignment foundAlignment = null;
|
||||
for(Alignment[] alignmentsOfQuality: alignments) {
|
||||
for(Alignment alignment: alignmentsOfQuality) {
|
||||
if( read.getReadNegativeStrandFlag() != alignment.isNegativeStrand() )
|
||||
continue;
|
||||
if( read.getAlignmentStart() != alignment.getAlignmentStart() )
|
||||
continue;
|
||||
|
||||
foundAlignment = alignment;
|
||||
}
|
||||
}
|
||||
|
||||
if( foundAlignment != null ) {
|
||||
//System.out.printf("%s: Aligned read to reference at position %d with %d mismatches, %d gap opens, and %d gap extensions.%n", read.getReadName(), foundAlignment.getAlignmentStart(), foundAlignment.getMismatches(), foundAlignment.getGapOpens(), foundAlignment.getGapExtensions());
|
||||
}
|
||||
else {
|
||||
System.out.printf("Error aligning read %s%n", read.getReadName());
|
||||
|
||||
mismatches++;
|
||||
|
||||
IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
|
||||
|
||||
System.out.printf("read = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(read.getReadString(),read.getCigar(),CigarOperator.DELETION),
|
||||
read.getAlignmentStart(),
|
||||
read.getReadNegativeStrandFlag());
|
||||
int numDeletions = numDeletionsInCigar(read.getCigar());
|
||||
String expectedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),read.getAlignmentStart(),read.getAlignmentStart()+read.getReadLength()+numDeletions-1).getBases());
|
||||
System.out.printf("expected ref = %s%n", formatBasesBasedOnCigar(expectedRef,read.getCigar(),CigarOperator.INSERTION));
|
||||
|
||||
for(Alignment[] alignmentsOfQuality: alignments) {
|
||||
for(Alignment alignment: alignmentsOfQuality) {
|
||||
System.out.println();
|
||||
|
||||
Cigar cigar = ((BWAAlignment)alignment).getCigar();
|
||||
|
||||
System.out.printf("read = %s%n", formatBasesBasedOnCigar(read.getReadString(),cigar,CigarOperator.DELETION));
|
||||
|
||||
int deletionCount = ((BWAAlignment)alignment).getNumberOfBasesMatchingState(AlignmentState.DELETION);
|
||||
String alignedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignment.getAlignmentStart(),alignment.getAlignmentStart()+read.getReadLength()+deletionCount-1).getBases());
|
||||
System.out.printf("actual ref = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(alignedRef,cigar,CigarOperator.INSERTION),
|
||||
alignment.getAlignmentStart(),
|
||||
alignment.isNegativeStrand());
|
||||
}
|
||||
}
|
||||
|
||||
//throw new StingException(String.format("Read %s was placed at incorrect location; count = %d%n",read.getReadName(),count));
|
||||
}
|
||||
|
||||
|
||||
if( count % 1000 == 0 )
|
||||
System.out.printf("%d reads examined.%n",count);
|
||||
}
|
||||
|
||||
System.out.printf("%d reads examined; %d mismatches; %d failures.%n",count,mismatches,failures);
|
||||
}
|
||||
|
||||
private static String formatBasesBasedOnCigar( String bases, Cigar cigar, CigarOperator toBlank ) {
|
||||
StringBuilder formatted = new StringBuilder();
|
||||
int readIndex = 0;
|
||||
for(CigarElement cigarElement: cigar.getCigarElements()) {
|
||||
if(cigarElement.getOperator() == toBlank) {
|
||||
int number = cigarElement.getLength();
|
||||
while( number-- > 0 ) formatted.append(' ');
|
||||
}
|
||||
else {
|
||||
int number = cigarElement.getLength();
|
||||
while( number-- > 0 ) formatted.append(bases.charAt(readIndex++));
|
||||
}
|
||||
}
|
||||
return formatted.toString();
|
||||
}
|
||||
|
||||
private static int numDeletionsInCigar( Cigar cigar ) {
|
||||
int numDeletions = 0;
|
||||
for(CigarElement cigarElement: cigar.getCigarElements()) {
|
||||
if(cigarElement.getOperator() == CigarOperator.DELETION)
|
||||
numDeletions += cigarElement.getLength();
|
||||
}
|
||||
return numDeletions;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,150 @@
|
|||
package org.broadinstitute.sting.alignment.bwa.java;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Deque;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Represents a sequence of matches.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class AlignmentMatchSequence implements Cloneable {
|
||||
/**
|
||||
* Stores the particular match entries in the order they occur.
|
||||
*/
|
||||
private Deque<AlignmentMatchSequenceEntry> entries = new ArrayDeque<AlignmentMatchSequenceEntry>();
|
||||
|
||||
/**
|
||||
* Clone the given match sequence.
|
||||
* @return A deep copy of the current match sequence.
|
||||
*/
|
||||
public AlignmentMatchSequence clone() {
|
||||
AlignmentMatchSequence copy = null;
|
||||
try {
|
||||
copy = (AlignmentMatchSequence)super.clone();
|
||||
}
|
||||
catch( CloneNotSupportedException ex ) {
|
||||
throw new ReviewedStingException("Unable to clone AlignmentMatchSequence.");
|
||||
}
|
||||
|
||||
copy.entries = new ArrayDeque<AlignmentMatchSequenceEntry>();
|
||||
for( AlignmentMatchSequenceEntry entry: entries )
|
||||
copy.entries.add(entry.clone());
|
||||
|
||||
return copy;
|
||||
}
|
||||
|
||||
public Cigar convertToCigar(boolean negativeStrand) {
|
||||
Cigar cigar = new Cigar();
|
||||
Iterator<AlignmentMatchSequenceEntry> iterator = negativeStrand ? entries.descendingIterator() : entries.iterator();
|
||||
while( iterator.hasNext() ) {
|
||||
AlignmentMatchSequenceEntry entry = iterator.next();
|
||||
CigarOperator operator;
|
||||
switch( entry.getAlignmentState() ) {
|
||||
case MATCH_MISMATCH: operator = CigarOperator.MATCH_OR_MISMATCH; break;
|
||||
case INSERTION: operator = CigarOperator.INSERTION; break;
|
||||
case DELETION: operator = CigarOperator.DELETION; break;
|
||||
default: throw new ReviewedStingException("convertToCigar: cannot process state: " + entry.getAlignmentState());
|
||||
}
|
||||
cigar.add( new CigarElement(entry.count,operator) );
|
||||
}
|
||||
return cigar;
|
||||
}
|
||||
|
||||
/**
|
||||
* All a new alignment of the given state.
|
||||
* @param state State to add to the sequence.
|
||||
*/
|
||||
public void addNext( AlignmentState state ) {
|
||||
AlignmentMatchSequenceEntry last = entries.peekLast();
|
||||
// If the last entry is the same as this one, increment it. Otherwise, add a new entry.
|
||||
if( last != null && last.alignmentState == state )
|
||||
last.increment();
|
||||
else
|
||||
entries.add(new AlignmentMatchSequenceEntry(state));
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the current state of this alignment (what's the state of the last base?)
|
||||
* @return State of the most recently aligned base.
|
||||
*/
|
||||
public AlignmentState getCurrentState() {
|
||||
if( entries.size() == 0 )
|
||||
return AlignmentState.MATCH_MISMATCH;
|
||||
return entries.peekLast().getAlignmentState();
|
||||
}
|
||||
|
||||
/**
|
||||
* How many bases in the read match the given state.
|
||||
* @param state State to test.
|
||||
* @return number of bases which match that state.
|
||||
*/
|
||||
public int getNumberOfBasesMatchingState(AlignmentState state) {
|
||||
int matches = 0;
|
||||
for( AlignmentMatchSequenceEntry entry: entries ) {
|
||||
if( entry.getAlignmentState() == state )
|
||||
matches += entry.count;
|
||||
}
|
||||
return matches;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores an individual match sequence entry.
|
||||
*/
|
||||
private class AlignmentMatchSequenceEntry implements Cloneable {
|
||||
/**
|
||||
* The state of the alignment throughout a given point in the sequence.
|
||||
*/
|
||||
private final AlignmentState alignmentState;
|
||||
|
||||
/**
|
||||
* The number of bases having this particular state.
|
||||
*/
|
||||
private int count;
|
||||
|
||||
/**
|
||||
* Create a new sequence entry with the given state.
|
||||
* @param alignmentState The state that this sequence should contain.
|
||||
*/
|
||||
AlignmentMatchSequenceEntry( AlignmentState alignmentState ) {
|
||||
this.alignmentState = alignmentState;
|
||||
this.count = 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clone the given match sequence entry.
|
||||
* @return A deep copy of the current match sequence entry.
|
||||
*/
|
||||
public AlignmentMatchSequenceEntry clone() {
|
||||
try {
|
||||
return (AlignmentMatchSequenceEntry)super.clone();
|
||||
}
|
||||
catch( CloneNotSupportedException ex ) {
|
||||
throw new ReviewedStingException("Unable to clone AlignmentMatchSequenceEntry.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the current state of the alignment.
|
||||
* @return The state of the current sequence.
|
||||
*/
|
||||
AlignmentState getAlignmentState() {
|
||||
return alignmentState;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the count of alignments having this particular state.
|
||||
*/
|
||||
void increment() {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
package org.broadinstitute.sting.alignment.bwa.java;
|
||||
|
||||
/**
|
||||
* The state of a given base in the alignment.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public enum AlignmentState {
|
||||
MATCH_MISMATCH,
|
||||
INSERTION,
|
||||
DELETION
|
||||
}
|
||||
|
|
@ -0,0 +1,190 @@
|
|||
package org.broadinstitute.sting.alignment.bwa.java;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import org.broadinstitute.sting.alignment.Alignment;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
* An alignment object to be used incrementally as the BWA aligner
|
||||
* inspects the read.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWAAlignment extends Alignment implements Cloneable {
|
||||
/**
|
||||
* Track the number of alignments that have been created.
|
||||
*/
|
||||
private static long numCreated;
|
||||
|
||||
/**
|
||||
* Which number alignment is this?
|
||||
*/
|
||||
private long creationNumber;
|
||||
|
||||
/**
|
||||
* The aligner performing the alignments.
|
||||
*/
|
||||
protected BWAJavaAligner aligner;
|
||||
|
||||
/**
|
||||
* The sequence of matches/mismatches/insertions/deletions.
|
||||
*/
|
||||
private AlignmentMatchSequence alignmentMatchSequence = new AlignmentMatchSequence();
|
||||
|
||||
/**
|
||||
* Working variable. How many bases have been matched at this point.
|
||||
*/
|
||||
protected int position;
|
||||
|
||||
/**
|
||||
* Working variable. How many mismatches have been encountered at this point.
|
||||
*/
|
||||
private int mismatches;
|
||||
|
||||
/**
|
||||
* Number of gap opens in alignment.
|
||||
*/
|
||||
private int gapOpens;
|
||||
|
||||
/**
|
||||
* Number of gap extensions in alignment.
|
||||
*/
|
||||
private int gapExtensions;
|
||||
|
||||
/**
|
||||
* Working variable. The lower bound of the alignment within the BWT.
|
||||
*/
|
||||
protected long loBound;
|
||||
|
||||
/**
|
||||
* Working variable. The upper bound of the alignment within the BWT.
|
||||
*/
|
||||
protected long hiBound;
|
||||
|
||||
protected void setAlignmentStart(long position) {
|
||||
this.alignmentStart = position;
|
||||
}
|
||||
|
||||
protected void setNegativeStrand(boolean negativeStrand) {
|
||||
this.negativeStrand = negativeStrand;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cache the score.
|
||||
*/
|
||||
private int score;
|
||||
|
||||
public Cigar getCigar() {
|
||||
return alignmentMatchSequence.convertToCigar(isNegativeStrand());
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the current state of this alignment (state of the last base viewed)..
|
||||
* @return Current state of the alignment.
|
||||
*/
|
||||
public AlignmentState getCurrentState() {
|
||||
return alignmentMatchSequence.getCurrentState();
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the given state to the current alignment.
|
||||
* @param state State to add to the given alignment.
|
||||
*/
|
||||
public void addState( AlignmentState state ) {
|
||||
alignmentMatchSequence.addNext(state);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the BWA score of this alignment.
|
||||
* @return BWA-style scores. 0 is best.
|
||||
*/
|
||||
public int getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
public int getMismatches() { return mismatches; }
|
||||
public int getGapOpens() { return gapOpens; }
|
||||
public int getGapExtensions() { return gapExtensions; }
|
||||
|
||||
public void incrementMismatches() {
|
||||
this.mismatches++;
|
||||
updateScore();
|
||||
}
|
||||
|
||||
public void incrementGapOpens() {
|
||||
this.gapOpens++;
|
||||
updateScore();
|
||||
}
|
||||
|
||||
public void incrementGapExtensions() {
|
||||
this.gapExtensions++;
|
||||
updateScore();
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the score based on new information about matches / mismatches.
|
||||
*/
|
||||
private void updateScore() {
|
||||
score = mismatches*aligner.MISMATCH_PENALTY + gapOpens*aligner.GAP_OPEN_PENALTY + gapExtensions*aligner.GAP_EXTENSION_PENALTY;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new alignment with the given parent aligner.
|
||||
* @param aligner Aligner being used.
|
||||
*/
|
||||
public BWAAlignment( BWAJavaAligner aligner ) {
|
||||
this.aligner = aligner;
|
||||
this.creationNumber = numCreated++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clone the alignment.
|
||||
* @return New instance of the alignment.
|
||||
*/
|
||||
public BWAAlignment clone() {
|
||||
BWAAlignment newAlignment = null;
|
||||
try {
|
||||
newAlignment = (BWAAlignment)super.clone();
|
||||
}
|
||||
catch( CloneNotSupportedException ex ) {
|
||||
throw new ReviewedStingException("Unable to clone BWAAlignment.");
|
||||
}
|
||||
newAlignment.creationNumber = numCreated++;
|
||||
newAlignment.alignmentMatchSequence = alignmentMatchSequence.clone();
|
||||
|
||||
return newAlignment;
|
||||
}
|
||||
|
||||
/**
|
||||
* How many bases in the read match the given state.
|
||||
* @param state State to test.
|
||||
* @return number of bases which match that state.
|
||||
*/
|
||||
public int getNumberOfBasesMatchingState(AlignmentState state) {
|
||||
return alignmentMatchSequence.getNumberOfBasesMatchingState(state);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare this alignment to another alignment.
|
||||
* @param rhs Other alignment to which to compare.
|
||||
* @return < 0 if this < other, == 0 if this == other, > 0 if this > other
|
||||
*/
|
||||
public int compareTo(Alignment rhs) {
|
||||
BWAAlignment other = (BWAAlignment)rhs;
|
||||
|
||||
// If the scores are different, disambiguate using the score.
|
||||
if(score != other.score)
|
||||
return score > other.score ? 1 : -1;
|
||||
|
||||
// Otherwise, use the order in which the elements were created.
|
||||
if(creationNumber != other.creationNumber)
|
||||
return creationNumber > other.creationNumber ? -1 : 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("position: %d, strand: %b, state: %s, mismatches: %d, gap opens: %d, gap extensions: %d, loBound: %d, hiBound: %d, score: %d, creationNumber: %d", position, negativeStrand, alignmentMatchSequence.getCurrentState(), mismatches, gapOpens, gapExtensions, loBound, hiBound, getScore(), creationNumber);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,393 @@
|
|||
package org.broadinstitute.sting.alignment.bwa.java;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.alignment.Alignment;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAAligner;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.reference.bwt.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* Create imperfect alignments from the read to the genome represented by the given BWT / suffix array.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWAJavaAligner extends BWAAligner {
|
||||
/**
|
||||
* BWT in the forward direction.
|
||||
*/
|
||||
private BWT forwardBWT;
|
||||
|
||||
/**
|
||||
* BWT in the reverse direction.
|
||||
*/
|
||||
private BWT reverseBWT;
|
||||
|
||||
/**
|
||||
* Suffix array in the forward direction.
|
||||
*/
|
||||
private SuffixArray forwardSuffixArray;
|
||||
|
||||
/**
|
||||
* Suffix array in the reverse direction.
|
||||
*/
|
||||
private SuffixArray reverseSuffixArray;
|
||||
|
||||
/**
|
||||
* Maximum edit distance (-n option from original BWA).
|
||||
*/
|
||||
private final int MAXIMUM_EDIT_DISTANCE = 4;
|
||||
|
||||
/**
|
||||
* Maximum number of gap opens (-o option from original BWA).
|
||||
*/
|
||||
private final int MAXIMUM_GAP_OPENS = 1;
|
||||
|
||||
/**
|
||||
* Maximum number of gap extensions (-e option from original BWA).
|
||||
*/
|
||||
private final int MAXIMUM_GAP_EXTENSIONS = 6;
|
||||
|
||||
/**
|
||||
* Penalty for straight mismatches (-M option from original BWA).
|
||||
*/
|
||||
public final int MISMATCH_PENALTY = 3;
|
||||
|
||||
/**
|
||||
* Penalty for gap opens (-O option from original BWA).
|
||||
*/
|
||||
public final int GAP_OPEN_PENALTY = 11;
|
||||
|
||||
/**
|
||||
* Penalty for gap extensions (-E option from original BWA).
|
||||
*/
|
||||
public final int GAP_EXTENSION_PENALTY = 4;
|
||||
|
||||
/**
|
||||
* Skip the ends of indels.
|
||||
*/
|
||||
public final int INDEL_END_SKIP = 5;
|
||||
|
||||
public BWAJavaAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) {
|
||||
super(null,null);
|
||||
forwardBWT = new BWTReader(forwardBWTFile).read();
|
||||
reverseBWT = new BWTReader(reverseBWTFile).read();
|
||||
forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read();
|
||||
reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read();
|
||||
}
|
||||
|
||||
/**
|
||||
* Close this instance of the BWA pointer and delete its resources.
|
||||
*/
|
||||
@Override
|
||||
public void close() {
|
||||
throw new UnsupportedOperationException("BWA aligner can't currently be closed.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the current parameters of this aligner.
|
||||
* @param configuration New configuration to set.
|
||||
*/
|
||||
public void updateConfiguration(BWAConfiguration configuration) {
|
||||
throw new UnsupportedOperationException("Configuration of the BWA aligner can't currently be changed.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow the aligner to choose one alignment randomly from the pile of best alignments.
|
||||
* @param bases Bases to align.
|
||||
* @return An align
|
||||
*/
|
||||
public Alignment getBestAlignment(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
|
||||
|
||||
/**
|
||||
* Align the read to the reference.
|
||||
* @param read Read to align.
|
||||
* @param header Optional header to drop in place.
|
||||
* @return A list of the alignments.
|
||||
*/
|
||||
public SAMRecord align(final SAMRecord read, final SAMFileHeader header) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
|
||||
|
||||
/**
|
||||
* Get a iterator of alignments, batched by mapping quality.
|
||||
* @param bases List of bases.
|
||||
* @return Iterator to alignments.
|
||||
*/
|
||||
public Iterable<Alignment[]> getAllAlignments(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
|
||||
|
||||
/**
|
||||
* Get a iterator of aligned reads, batched by mapping quality.
|
||||
* @param read Read to align.
|
||||
* @param newHeader Optional new header to use when aligning the read. If present, it must be null.
|
||||
* @return Iterator to alignments.
|
||||
*/
|
||||
public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
|
||||
|
||||
|
||||
public List<Alignment> align( SAMRecord read ) {
|
||||
List<Alignment> successfulMatches = new ArrayList<Alignment>();
|
||||
|
||||
Byte[] uncomplementedBases = normalizeBases(read.getReadBases());
|
||||
Byte[] complementedBases = normalizeBases(Utils.reverse(BaseUtils.simpleReverseComplement(read.getReadBases())));
|
||||
|
||||
List<LowerBound> forwardLowerBounds = LowerBound.create(uncomplementedBases,forwardBWT);
|
||||
List<LowerBound> reverseLowerBounds = LowerBound.create(complementedBases,reverseBWT);
|
||||
|
||||
// Seed the best score with any score that won't overflow on comparison.
|
||||
int bestScore = Integer.MAX_VALUE - MISMATCH_PENALTY;
|
||||
int bestDiff = MAXIMUM_EDIT_DISTANCE+1;
|
||||
int maxDiff = MAXIMUM_EDIT_DISTANCE;
|
||||
|
||||
PriorityQueue<BWAAlignment> alignments = new PriorityQueue<BWAAlignment>();
|
||||
|
||||
// Create a fictional initial alignment, with the position just off the end of the read, and the limits
|
||||
// set as the entire BWT.
|
||||
alignments.add(createSeedAlignment(reverseBWT));
|
||||
alignments.add(createSeedAlignment(forwardBWT));
|
||||
|
||||
while(!alignments.isEmpty()) {
|
||||
BWAAlignment alignment = alignments.remove();
|
||||
|
||||
// From bwtgap.c in the original BWT; if the rank is worse than the best score + the mismatch PENALTY, move on.
|
||||
if( alignment.getScore() > bestScore + MISMATCH_PENALTY )
|
||||
break;
|
||||
|
||||
Byte[] bases = alignment.isNegativeStrand() ? complementedBases : uncomplementedBases;
|
||||
BWT bwt = alignment.isNegativeStrand() ? forwardBWT : reverseBWT;
|
||||
List<LowerBound> lowerBounds = alignment.isNegativeStrand() ? reverseLowerBounds : forwardLowerBounds;
|
||||
|
||||
// if z < D(i) then return {}
|
||||
int mismatches = maxDiff - alignment.getMismatches() - alignment.getGapOpens() - alignment.getGapExtensions();
|
||||
if( alignment.position < lowerBounds.size()-1 && mismatches < lowerBounds.get(alignment.position+1).value )
|
||||
continue;
|
||||
|
||||
if(mismatches == 0) {
|
||||
exactMatch(alignment,bases,bwt);
|
||||
if(alignment.loBound > alignment.hiBound)
|
||||
continue;
|
||||
}
|
||||
|
||||
// Found a valid alignment; store it and move on.
|
||||
if(alignment.position >= read.getReadLength()-1) {
|
||||
for(long bwtIndex = alignment.loBound; bwtIndex <= alignment.hiBound; bwtIndex++) {
|
||||
BWAAlignment finalAlignment = alignment.clone();
|
||||
|
||||
if( finalAlignment.isNegativeStrand() )
|
||||
finalAlignment.setAlignmentStart(forwardSuffixArray.get(bwtIndex) + 1);
|
||||
else {
|
||||
int sizeAlongReference = read.getReadLength() -
|
||||
finalAlignment.getNumberOfBasesMatchingState(AlignmentState.INSERTION) +
|
||||
finalAlignment.getNumberOfBasesMatchingState(AlignmentState.DELETION);
|
||||
finalAlignment.setAlignmentStart(reverseBWT.length() - reverseSuffixArray.get(bwtIndex) - sizeAlongReference + 1);
|
||||
}
|
||||
|
||||
successfulMatches.add(finalAlignment);
|
||||
|
||||
bestScore = Math.min(finalAlignment.getScore(),bestScore);
|
||||
bestDiff = Math.min(finalAlignment.getMismatches()+finalAlignment.getGapOpens()+finalAlignment.getGapExtensions(),bestDiff);
|
||||
maxDiff = bestDiff + 1;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : "");
|
||||
/*
|
||||
System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(),
|
||||
alignment.negativeStrand?1:0,
|
||||
bases.length-alignment.position-1,
|
||||
alignment.getCurrentState().toString().charAt(0),
|
||||
alignment.getMismatches(),
|
||||
alignment.getGapOpens(),
|
||||
alignment.getGapExtensions(),
|
||||
lowerBounds.get(alignment.position+1).value,
|
||||
lowerBounds.get(alignment.position+1).width,
|
||||
alignment.loBound,
|
||||
alignment.hiBound);
|
||||
*/
|
||||
|
||||
// Temporary -- look ahead to see if the next alignment is bounded.
|
||||
boolean allowDifferences = mismatches > 0;
|
||||
boolean allowMismatches = mismatches > 0;
|
||||
|
||||
if( allowDifferences &&
|
||||
alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() &&
|
||||
read.getReadLength()-1-(alignment.position+1) >= INDEL_END_SKIP+alignment.getGapOpens()+alignment.getGapExtensions() ) {
|
||||
if( alignment.getCurrentState() == AlignmentState.MATCH_MISMATCH ) {
|
||||
if( alignment.getGapOpens() < MAXIMUM_GAP_OPENS ) {
|
||||
// Add a potential insertion extension.
|
||||
BWAAlignment insertionAlignment = createInsertionAlignment(alignment);
|
||||
insertionAlignment.incrementGapOpens();
|
||||
alignments.add(insertionAlignment);
|
||||
|
||||
// Add a potential deletion by marking a deletion and augmenting the position.
|
||||
List<BWAAlignment> deletionAlignments = createDeletionAlignments(bwt,alignment);
|
||||
for( BWAAlignment deletionAlignment: deletionAlignments )
|
||||
deletionAlignment.incrementGapOpens();
|
||||
alignments.addAll(deletionAlignments);
|
||||
}
|
||||
}
|
||||
else if( alignment.getCurrentState() == AlignmentState.INSERTION ) {
|
||||
if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) {
|
||||
// Add a potential insertion extension.
|
||||
BWAAlignment insertionAlignment = createInsertionAlignment(alignment);
|
||||
insertionAlignment.incrementGapExtensions();
|
||||
alignments.add(insertionAlignment);
|
||||
}
|
||||
}
|
||||
else if( alignment.getCurrentState() == AlignmentState.DELETION ) {
|
||||
if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) {
|
||||
// Add a potential deletion by marking a deletion and augmenting the position.
|
||||
List<BWAAlignment> deletionAlignments = createDeletionAlignments(bwt,alignment);
|
||||
for( BWAAlignment deletionAlignment: deletionAlignments )
|
||||
deletionAlignment.incrementGapExtensions();
|
||||
alignments.addAll(deletionAlignments);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Mismatches
|
||||
alignments.addAll(createMatchedAlignments(bwt,alignment,bases,allowDifferences&&allowMismatches));
|
||||
}
|
||||
|
||||
return successfulMatches;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an seeding alignment to use as a starting point when traversing.
|
||||
* @param bwt source BWT.
|
||||
* @return Seed alignment.
|
||||
*/
|
||||
private BWAAlignment createSeedAlignment(BWT bwt) {
|
||||
BWAAlignment seed = new BWAAlignment(this);
|
||||
seed.setNegativeStrand(bwt == forwardBWT);
|
||||
seed.position = -1;
|
||||
seed.loBound = 0;
|
||||
seed.hiBound = bwt.length();
|
||||
return seed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new alignments representing direct matches / mismatches.
|
||||
* @param bwt Source BWT with which to work.
|
||||
* @param alignment Alignment for the previous position.
|
||||
* @param bases The bases in the read.
|
||||
* @param allowMismatch Should mismatching bases be allowed?
|
||||
* @return New alignment representing this position if valid; null otherwise.
|
||||
*/
|
||||
private List<BWAAlignment> createMatchedAlignments( BWT bwt, BWAAlignment alignment, Byte[] bases, boolean allowMismatch ) {
|
||||
List<BWAAlignment> newAlignments = new ArrayList<BWAAlignment>();
|
||||
|
||||
List<Byte> baseChoices = new ArrayList<Byte>();
|
||||
Byte thisBase = bases[alignment.position+1];
|
||||
|
||||
if( allowMismatch )
|
||||
baseChoices.addAll(Bases.allOf());
|
||||
else
|
||||
baseChoices.add(thisBase);
|
||||
|
||||
if( thisBase != null ) {
|
||||
// Keep rotating the current base to the last position until we've hit the current base.
|
||||
for( ;; ) {
|
||||
baseChoices.add(baseChoices.remove(0));
|
||||
if( thisBase.equals(baseChoices.get(baseChoices.size()-1)) )
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
for(byte base: baseChoices) {
|
||||
BWAAlignment newAlignment = alignment.clone();
|
||||
|
||||
newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
|
||||
newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
|
||||
|
||||
// If this alignment is valid, skip it.
|
||||
if( newAlignment.loBound > newAlignment.hiBound )
|
||||
continue;
|
||||
|
||||
newAlignment.position++;
|
||||
newAlignment.addState(AlignmentState.MATCH_MISMATCH);
|
||||
if( bases[newAlignment.position] == null || base != bases[newAlignment.position] )
|
||||
newAlignment.incrementMismatches();
|
||||
|
||||
newAlignments.add(newAlignment);
|
||||
}
|
||||
|
||||
return newAlignments;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new alignment representing an insertion at this point in the read.
|
||||
* @param alignment Alignment from which to derive the insertion.
|
||||
* @return New alignment reflecting the insertion.
|
||||
*/
|
||||
private BWAAlignment createInsertionAlignment( BWAAlignment alignment ) {
|
||||
// Add a potential insertion extension.
|
||||
BWAAlignment newAlignment = alignment.clone();
|
||||
newAlignment.position++;
|
||||
newAlignment.addState(AlignmentState.INSERTION);
|
||||
return newAlignment;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create new alignments representing a deletion at this point in the read.
|
||||
* @param bwt source BWT for inferring deletion info.
|
||||
* @param alignment Alignment from which to derive the deletion.
|
||||
* @return New alignments reflecting all possible deletions.
|
||||
*/
|
||||
private List<BWAAlignment> createDeletionAlignments( BWT bwt, BWAAlignment alignment) {
|
||||
List<BWAAlignment> newAlignments = new ArrayList<BWAAlignment>();
|
||||
for(byte base: Bases.instance) {
|
||||
BWAAlignment newAlignment = alignment.clone();
|
||||
|
||||
newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
|
||||
newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
|
||||
|
||||
// If this alignment is valid, skip it.
|
||||
if( newAlignment.loBound > newAlignment.hiBound )
|
||||
continue;
|
||||
|
||||
newAlignment.addState(AlignmentState.DELETION);
|
||||
|
||||
newAlignments.add(newAlignment);
|
||||
}
|
||||
|
||||
return newAlignments;
|
||||
}
|
||||
|
||||
/**
|
||||
* Exactly match the given alignment against the given BWT.
|
||||
* @param alignment Alignment to match.
|
||||
* @param bases Bases to use.
|
||||
* @param bwt BWT to use.
|
||||
*/
|
||||
private void exactMatch( BWAAlignment alignment, Byte[] bases, BWT bwt ) {
|
||||
while( ++alignment.position < bases.length ) {
|
||||
byte base = bases[alignment.position];
|
||||
alignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
|
||||
alignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
|
||||
if( alignment.loBound > alignment.hiBound )
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Make each base into A/C/G/T or null if unknown.
|
||||
* @param bases Base string to normalize.
|
||||
* @return Array of normalized bases.
|
||||
*/
|
||||
private Byte[] normalizeBases( byte[] bases ) {
|
||||
Byte[] normalBases = new Byte[bases.length];
|
||||
for(int i = 0; i < bases.length; i++)
|
||||
normalBases[i] = Bases.fromASCII(bases[i]);
|
||||
return normalBases;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,88 @@
|
|||
package org.broadinstitute.sting.alignment.bwa.java;
|
||||
|
||||
import org.broadinstitute.sting.alignment.reference.bwt.BWT;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* At any point along the given read, what is a good lower bound for the
|
||||
* total number of differences?
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class LowerBound {
|
||||
/**
|
||||
* Lower bound of the suffix array.
|
||||
*/
|
||||
public final long loIndex;
|
||||
|
||||
/**
|
||||
* Upper bound of the suffix array.
|
||||
*/
|
||||
public final long hiIndex;
|
||||
|
||||
/**
|
||||
* Width of the bwt from loIndex -> hiIndex, inclusive.
|
||||
*/
|
||||
public final long width;
|
||||
|
||||
/**
|
||||
* The lower bound at the given point.
|
||||
*/
|
||||
public final int value;
|
||||
|
||||
/**
|
||||
* Create a new lower bound with the given value.
|
||||
* @param loIndex The lower bound of the BWT.
|
||||
* @param hiIndex The upper bound of the BWT.
|
||||
* @param value Value for the lower bound at this site.
|
||||
*/
|
||||
private LowerBound(long loIndex, long hiIndex, int value) {
|
||||
this.loIndex = loIndex;
|
||||
this.hiIndex = hiIndex;
|
||||
this.width = hiIndex - loIndex + 1;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a non-optimal bound according to the algorithm specified in Figure 3 of the BWA paper.
|
||||
* @param bases Bases of the read to use when creating a new BWT.
|
||||
* @param bwt BWT to check against.
|
||||
* @return A list of lower bounds at every point in the reference.
|
||||
*
|
||||
*/
|
||||
public static List<LowerBound> create(Byte[] bases, BWT bwt) {
|
||||
List<LowerBound> bounds = new ArrayList<LowerBound>();
|
||||
|
||||
long loIndex = 0, hiIndex = bwt.length();
|
||||
int mismatches = 0;
|
||||
for( int i = bases.length-1; i >= 0; i-- ) {
|
||||
Byte base = bases[i];
|
||||
|
||||
// Ignore non-ACGT bases.
|
||||
if( base != null ) {
|
||||
loIndex = bwt.counts(base) + bwt.occurrences(base,loIndex-1) + 1;
|
||||
hiIndex = bwt.counts(base) + bwt.occurrences(base,hiIndex);
|
||||
}
|
||||
|
||||
if( base == null || loIndex > hiIndex ) {
|
||||
loIndex = 0;
|
||||
hiIndex = bwt.length();
|
||||
mismatches++;
|
||||
}
|
||||
bounds.add(0,new LowerBound(loIndex,hiIndex,mismatches));
|
||||
}
|
||||
|
||||
return bounds;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a string representation of this bound.
|
||||
* @return String version of this bound.
|
||||
*/
|
||||
public String toString() {
|
||||
return String.format("LowerBound: w = %d, value = %d",width,value);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
/**
|
||||
* Analyses used to validate the correctness and performance the BWA Java bindings.
|
||||
*/
|
||||
package org.broadinstitute.sting.alignment;
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Writes .amb files - a file indicating where 'holes' (indeterminant bases)
|
||||
* exist in the contig. Currently, only empty, placeholder AMBs are supported.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class AMBWriter {
|
||||
/**
|
||||
* Number of holes is fixed at zero.
|
||||
*/
|
||||
private static final int NUM_HOLES = 0;
|
||||
|
||||
/**
|
||||
* Input stream from which to read BWT data.
|
||||
*/
|
||||
private final PrintStream out;
|
||||
|
||||
/**
|
||||
* Create a new ANNWriter targeting the given file.
|
||||
* @param file file into which ANN data should be written.
|
||||
* @throws java.io.IOException if there is a problem opening the output file.
|
||||
*/
|
||||
public AMBWriter(File file) throws IOException {
|
||||
out = new PrintStream(file);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ANNWriter targeting the given OutputStream.
|
||||
* @param stream Stream into which ANN data should be written.
|
||||
*/
|
||||
public AMBWriter(OutputStream stream) {
|
||||
out = new PrintStream(stream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the contents of the given dictionary into the AMB file.
|
||||
* Assumes that there are no holes in the dictionary.
|
||||
* @param dictionary Dictionary to write.
|
||||
*/
|
||||
public void writeEmpty(SAMSequenceDictionary dictionary) {
|
||||
long genomeLength = 0L;
|
||||
for(SAMSequenceRecord sequence: dictionary.getSequences())
|
||||
genomeLength += sequence.getSequenceLength();
|
||||
|
||||
int sequences = dictionary.getSequences().size();
|
||||
|
||||
// Write the header
|
||||
out.printf("%d %d %d%n",genomeLength,sequences,NUM_HOLES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the given output stream.
|
||||
*/
|
||||
public void close() {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Writes .ann files - an alternate sequence dictionary format
|
||||
* used by BWA/C. For best results, the input sequence dictionary
|
||||
* should be created with Picard's CreateSequenceDictionary.jar,
|
||||
* TRUNCATE_NAMES_AT_WHITESPACE=false.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class ANNWriter {
|
||||
/**
|
||||
* BWA uses a fixed seed of 11, written into every file.
|
||||
*/
|
||||
private static final int BNS_SEED = 11;
|
||||
|
||||
/**
|
||||
* A seemingly unused value that appears in every contig in the ANN.
|
||||
*/
|
||||
private static final int GI = 0;
|
||||
|
||||
/**
|
||||
* Input stream from which to read BWT data.
|
||||
*/
|
||||
private final PrintStream out;
|
||||
|
||||
/**
|
||||
* Create a new ANNWriter targeting the given file.
|
||||
* @param file file into which ANN data should be written.
|
||||
* @throws IOException if there is a problem opening the output file.
|
||||
*/
|
||||
public ANNWriter(File file) throws IOException {
|
||||
out = new PrintStream(file);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ANNWriter targeting the given OutputStream.
|
||||
* @param stream Stream into which ANN data should be written.
|
||||
*/
|
||||
public ANNWriter(OutputStream stream) {
|
||||
out = new PrintStream(stream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the contents of the given dictionary into the ANN file.
|
||||
* Assumes that no ambs (blocks of indeterminate base) are present in the dictionary.
|
||||
* @param dictionary Dictionary to write.
|
||||
*/
|
||||
public void write(SAMSequenceDictionary dictionary) {
|
||||
long genomeLength = 0L;
|
||||
for(SAMSequenceRecord sequence: dictionary.getSequences())
|
||||
genomeLength += sequence.getSequenceLength();
|
||||
|
||||
int sequences = dictionary.getSequences().size();
|
||||
|
||||
// Write the header
|
||||
out.printf("%d %d %d%n",genomeLength,sequences,BNS_SEED);
|
||||
|
||||
for(SAMSequenceRecord sequence: dictionary.getSequences()) {
|
||||
String fullSequenceName = sequence.getSequenceName();
|
||||
String trimmedSequenceName = fullSequenceName;
|
||||
String sequenceComment = "(null)";
|
||||
|
||||
long offset = 0;
|
||||
|
||||
// Separate the sequence name from the sequence comment, based on BWA's definition.
|
||||
// BWA's definition appears to accept a zero-length contig name, so mimic that behavior.
|
||||
if(fullSequenceName.indexOf(' ') >= 0) {
|
||||
trimmedSequenceName = fullSequenceName.substring(0,fullSequenceName.indexOf(' '));
|
||||
sequenceComment = fullSequenceName.substring(fullSequenceName.indexOf(' ')+1);
|
||||
}
|
||||
|
||||
// Write the sequence GI (?), name, and comment.
|
||||
out.printf("%d %s %s%n",GI,trimmedSequenceName,sequenceComment);
|
||||
// Write the sequence offset, length, and ambs (currently fixed at 0).
|
||||
out.printf("%d %d %d%n",offset,sequence.getSequenceLength(),0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the given output stream.
|
||||
*/
|
||||
public void close() {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,172 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
* Represents the Burrows-Wheeler Transform of a reference sequence.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWT {
|
||||
/**
|
||||
* Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
|
||||
* For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
|
||||
*/
|
||||
public static final int SEQUENCE_BLOCK_SIZE = 128;
|
||||
|
||||
/**
|
||||
* The inverse SA, used as a placeholder for determining where the special EOL character sits.
|
||||
*/
|
||||
protected final long inverseSA0;
|
||||
|
||||
/**
|
||||
* Cumulative counts for the entire BWT.
|
||||
*/
|
||||
protected final Counts counts;
|
||||
|
||||
/**
|
||||
* The individual sequence blocks, modelling how they appear on disk.
|
||||
*/
|
||||
protected final SequenceBlock[] sequenceBlocks;
|
||||
|
||||
/**
|
||||
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
|
||||
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||
* @param counts Cumulative count of bases, in A,C,G,T order.
|
||||
* @param sequenceBlocks The full BWT sequence, sans the '$'.
|
||||
*/
|
||||
public BWT( long inverseSA0, Counts counts, SequenceBlock[] sequenceBlocks ) {
|
||||
this.inverseSA0 = inverseSA0;
|
||||
this.counts = counts;
|
||||
this.sequenceBlocks = sequenceBlocks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
|
||||
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||
* @param counts Count of bases, in A,C,G,T order.
|
||||
* @param sequence The full BWT sequence, sans the '$'.
|
||||
*/
|
||||
public BWT( long inverseSA0, Counts counts, byte[] sequence ) {
|
||||
this(inverseSA0,counts,generateSequenceBlocks(sequence));
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the full sequence from the list of block.
|
||||
* @return The full BWT string as a byte array.
|
||||
*/
|
||||
public byte[] getSequence() {
|
||||
byte[] sequence = new byte[(int)counts.getTotal()];
|
||||
for( SequenceBlock block: sequenceBlocks )
|
||||
System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength);
|
||||
return sequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
|
||||
* @param base The base.
|
||||
* @return Total counts for all bases lexicographically smaller than this base.
|
||||
*/
|
||||
public long counts(byte base) {
|
||||
return counts.getCumulative(base);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
|
||||
* @param base The base.
|
||||
* @param index The position to search within the BWT.
|
||||
* @return Total counts for all bases lexicographically smaller than this base.
|
||||
*/
|
||||
public long occurrences(byte base,long index) {
|
||||
SequenceBlock block = getSequenceBlock(index);
|
||||
int position = getSequencePosition(index);
|
||||
long accumulator = block.occurrences.get(base);
|
||||
for(int i = 0; i <= position; i++) {
|
||||
if(base == block.sequence[i])
|
||||
accumulator++;
|
||||
}
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
/**
|
||||
* The number of bases in the BWT as a whole.
|
||||
* @return Number of bases.
|
||||
*/
|
||||
public long length() {
|
||||
return counts.getTotal();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new BWT from the given reference sequence.
|
||||
* @param referenceSequence Sequence from which to derive the BWT.
|
||||
* @return reference sequence-derived BWT.
|
||||
*/
|
||||
public static BWT createFromReferenceSequence(byte[] referenceSequence) {
|
||||
SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
|
||||
|
||||
byte[] bwt = new byte[(int)suffixArray.length()-1];
|
||||
int bwtIndex = 0;
|
||||
for(long suffixArrayIndex = 0; suffixArrayIndex < suffixArray.length(); suffixArrayIndex++) {
|
||||
if(suffixArray.get(suffixArrayIndex) == 0)
|
||||
continue;
|
||||
bwt[bwtIndex++] = referenceSequence[(int)suffixArray.get(suffixArrayIndex)-1];
|
||||
}
|
||||
|
||||
return new BWT(suffixArray.inverseSA0,suffixArray.occurrences,bwt);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the base at a given position in the BWT.
|
||||
* @param index The index to use.
|
||||
* @return The base at that location.
|
||||
*/
|
||||
protected byte getBase(long index) {
|
||||
if(index == inverseSA0)
|
||||
throw new ReviewedStingException(String.format("Base at index %d does not have a text representation",index));
|
||||
|
||||
SequenceBlock block = getSequenceBlock(index);
|
||||
int position = getSequencePosition(index);
|
||||
return block.sequence[position];
|
||||
}
|
||||
|
||||
private SequenceBlock getSequenceBlock(long index) {
|
||||
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
|
||||
if(index > inverseSA0) index--;
|
||||
return sequenceBlocks[(int)(index/SEQUENCE_BLOCK_SIZE)];
|
||||
}
|
||||
|
||||
private int getSequencePosition(long index) {
|
||||
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
|
||||
if(index > inverseSA0) index--;
|
||||
return (int)(index%SEQUENCE_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a set of sequence blocks from one long sequence.
|
||||
* @param sequence Sequence from which to derive blocks.
|
||||
* @return Array of sequence blocks containing data from the sequence.
|
||||
*/
|
||||
private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) {
|
||||
Counts occurrences = new Counts();
|
||||
|
||||
int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE);
|
||||
SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks];
|
||||
|
||||
for( int block = 0; block < numSequenceBlocks; block++ ) {
|
||||
int blockStart = block*SEQUENCE_BLOCK_SIZE;
|
||||
int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart);
|
||||
byte[] subsequence = new byte[blockLength];
|
||||
|
||||
System.arraycopy(sequence,blockStart,subsequence,0,blockLength);
|
||||
|
||||
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence);
|
||||
|
||||
for( byte base: subsequence )
|
||||
occurrences.increment(base);
|
||||
}
|
||||
|
||||
return sequenceBlocks;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import org.broadinstitute.sting.alignment.reference.packing.BasePackedInputStream;
|
||||
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteOrder;
|
||||
/**
|
||||
* Reads a BWT from a given file.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWTReader {
|
||||
/**
|
||||
* Input stream from which to read BWT data.
|
||||
*/
|
||||
private FileInputStream inputStream;
|
||||
|
||||
/**
|
||||
* Create a new BWT reader.
|
||||
* @param inputFile File in which the BWT is stored.
|
||||
*/
|
||||
public BWTReader( File inputFile ) {
|
||||
try {
|
||||
this.inputStream = new FileInputStream(inputFile);
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new ReviewedStingException("Unable to open input file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a BWT from the input stream.
|
||||
* @return The BWT stored in the input stream.
|
||||
*/
|
||||
public BWT read() {
|
||||
UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
BasePackedInputStream basePackedInputStream = new BasePackedInputStream<Integer>(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
long inverseSA0;
|
||||
long[] count;
|
||||
SequenceBlock[] sequenceBlocks;
|
||||
|
||||
try {
|
||||
inverseSA0 = uintPackedInputStream.read();
|
||||
count = new long[PackUtils.ALPHABET_SIZE];
|
||||
uintPackedInputStream.read(count);
|
||||
|
||||
long bwtSize = count[PackUtils.ALPHABET_SIZE-1];
|
||||
sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)];
|
||||
|
||||
for( int block = 0; block < sequenceBlocks.length; block++ ) {
|
||||
int sequenceStart = block* BWT.SEQUENCE_BLOCK_SIZE;
|
||||
int sequenceLength = (int)Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart);
|
||||
|
||||
long[] occurrences = new long[PackUtils.ALPHABET_SIZE];
|
||||
byte[] bwt = new byte[sequenceLength];
|
||||
|
||||
uintPackedInputStream.read(occurrences);
|
||||
basePackedInputStream.read(bwt);
|
||||
|
||||
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,new Counts(occurrences,false),bwt);
|
||||
}
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
|
||||
return new BWT(inverseSA0, new Counts(count,true), sequenceBlocks);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the input stream.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
inputStream.close();
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new ReviewedStingException("Unable to close input file", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Generate BWA supplementary files (.ann, .amb) from the command line.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWTSupplementaryFileGenerator {
|
||||
enum SupplementaryFileType { ANN, AMB }
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
if(args.length < 3)
|
||||
usage("Incorrect number of arguments supplied");
|
||||
|
||||
File fastaFile = new File(args[0]);
|
||||
File outputFile = new File(args[1]);
|
||||
SupplementaryFileType outputType = null;
|
||||
try {
|
||||
outputType = Enum.valueOf(SupplementaryFileType.class,args[2]);
|
||||
}
|
||||
catch(IllegalArgumentException ex) {
|
||||
usage("Invalid output type: " + args[2]);
|
||||
}
|
||||
|
||||
ReferenceSequenceFile sequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fastaFile);
|
||||
SAMSequenceDictionary dictionary = sequenceFile.getSequenceDictionary();
|
||||
|
||||
switch(outputType) {
|
||||
case ANN:
|
||||
ANNWriter annWriter = new ANNWriter(outputFile);
|
||||
annWriter.write(dictionary);
|
||||
annWriter.close();
|
||||
break;
|
||||
case AMB:
|
||||
AMBWriter ambWriter = new AMBWriter(outputFile);
|
||||
ambWriter.writeEmpty(dictionary);
|
||||
ambWriter.close();
|
||||
break;
|
||||
default:
|
||||
usage("Unsupported output type: " + outputType);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Print usage information and exit.
|
||||
*/
|
||||
private static void usage(String message) {
|
||||
System.err.println(message);
|
||||
System.err.println("Usage: BWTSupplementaryFileGenerator <fasta> <output file> <output type>");
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import org.broadinstitute.sting.alignment.reference.packing.BasePackedOutputStream;
|
||||
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Writes an in-memory BWT to an outputstream.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWTWriter {
|
||||
/**
|
||||
* Input stream from which to read BWT data.
|
||||
*/
|
||||
private final OutputStream outputStream;
|
||||
|
||||
/**
|
||||
* Create a new BWT writer.
|
||||
* @param outputFile File in which the BWT is stored.
|
||||
*/
|
||||
public BWTWriter( File outputFile ) {
|
||||
try {
|
||||
this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new ReviewedStingException("Unable to open output file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a BWT to the output stream.
|
||||
* @param bwt Transform to be written to the output stream.
|
||||
*/
|
||||
public void write( BWT bwt ) {
|
||||
UnsignedIntPackedOutputStream intPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream<Integer>(Integer.class, outputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
try {
|
||||
intPackedOutputStream.write(bwt.inverseSA0);
|
||||
intPackedOutputStream.write(bwt.counts.toArray(true));
|
||||
|
||||
for( SequenceBlock block: bwt.sequenceBlocks ) {
|
||||
intPackedOutputStream.write(block.occurrences.toArray(false));
|
||||
basePackedOutputStream.write(block.sequence);
|
||||
}
|
||||
|
||||
// The last block is the last set of counts in the structure.
|
||||
intPackedOutputStream.write(bwt.counts.toArray(false));
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the input stream.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
outputStream.close();
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new ReviewedStingException("Unable to close input file", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Enhanced enum representation of a base.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class Bases implements Iterable<Byte>
|
||||
{
|
||||
public static byte A = 'A';
|
||||
public static byte C = 'C';
|
||||
public static byte G = 'G';
|
||||
public static byte T = 'T';
|
||||
|
||||
public static final Bases instance = new Bases();
|
||||
|
||||
private static final List<Byte> allBases;
|
||||
|
||||
/**
|
||||
* Representation of the base broken down by packed value.
|
||||
*/
|
||||
private static final Map<Integer,Byte> basesByPack = new HashMap<Integer,Byte>();
|
||||
|
||||
static {
|
||||
List<Byte> bases = new ArrayList<Byte>();
|
||||
bases.add(A);
|
||||
bases.add(C);
|
||||
bases.add(G);
|
||||
bases.add(T);
|
||||
allBases = Collections.unmodifiableList(bases);
|
||||
|
||||
for(int i = 0; i < allBases.size(); i++)
|
||||
basesByPack.put(i,allBases.get(i));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new base with the given ascii representation and
|
||||
* pack value.
|
||||
*/
|
||||
private Bases() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all possible bases.
|
||||
* @return Byte representation of all bases.
|
||||
*/
|
||||
public static Collection<Byte> allOf() {
|
||||
return allBases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the number of known bases.
|
||||
* @return The number of known bases.
|
||||
*/
|
||||
public static int size() {
|
||||
return allBases.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets an iterator over the total number of known base types.
|
||||
* @return Iterator over all known bases.
|
||||
*/
|
||||
public Iterator<Byte> iterator() {
|
||||
return basesByPack.values().iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the given base from the packed representation.
|
||||
* @param pack Packed representation.
|
||||
* @return base.
|
||||
*/
|
||||
public static byte fromPack( int pack ) { return basesByPack.get(pack); }
|
||||
|
||||
/**
|
||||
* Convert the given base to its packed value.
|
||||
* @param ascii ASCII representation of the base.
|
||||
* @return Packed value.
|
||||
*/
|
||||
public static int toPack( byte ascii )
|
||||
{
|
||||
for( Map.Entry<Integer,Byte> entry: basesByPack.entrySet() ) {
|
||||
if( entry.getValue().equals(ascii) )
|
||||
return entry.getKey();
|
||||
}
|
||||
throw new ReviewedStingException(String.format("Base %c is an invalid base to pack", (char)ascii));
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the ASCII representation of a base to its 'normalized' representation.
|
||||
* @param base The base itself.
|
||||
* @return The byte, if present. Null if unknown.
|
||||
*/
|
||||
public static Byte fromASCII( byte base ) {
|
||||
Byte found = null;
|
||||
for( Byte normalized: allBases ) {
|
||||
if( normalized.equals(base) ) {
|
||||
found = normalized;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return found;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,151 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Counts of how many bases of each type have been seen.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class Counts implements Cloneable {
|
||||
/**
|
||||
* Internal representation of counts, broken down by ASCII value.
|
||||
*/
|
||||
private Map<Byte,Long> counts = new HashMap<Byte,Long>();
|
||||
|
||||
/**
|
||||
* Internal representation of cumulative counts, broken down by ASCII value.
|
||||
*/
|
||||
private Map<Byte,Long> cumulativeCounts = new HashMap<Byte,Long>();
|
||||
|
||||
/**
|
||||
* Create an empty Counts object with values A=0,C=0,G=0,T=0.
|
||||
*/
|
||||
public Counts()
|
||||
{
|
||||
for(byte base: Bases.instance) {
|
||||
counts.put(base,0L);
|
||||
cumulativeCounts.put(base,0L);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a counts data structure with the given initial values.
|
||||
* @param data Count data, broken down by base.
|
||||
* @param cumulative Whether the counts are cumulative, (count_G=numA+numC+numG,for example).
|
||||
*/
|
||||
public Counts( long[] data, boolean cumulative ) {
|
||||
if(cumulative) {
|
||||
long priorCount = 0;
|
||||
for(byte base: Bases.instance) {
|
||||
long count = data[Bases.toPack(base)];
|
||||
counts.put(base,count-priorCount);
|
||||
cumulativeCounts.put(base,priorCount);
|
||||
priorCount = count;
|
||||
}
|
||||
}
|
||||
else {
|
||||
long priorCount = 0;
|
||||
for(byte base: Bases.instance) {
|
||||
long count = data[Bases.toPack(base)];
|
||||
counts.put(base,count);
|
||||
cumulativeCounts.put(base,priorCount);
|
||||
priorCount += count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert to an array for persistence.
|
||||
* @param cumulative Use a cumulative representation.
|
||||
* @return Array of count values.
|
||||
*/
|
||||
public long[] toArray(boolean cumulative) {
|
||||
long[] countArray = new long[counts.size()];
|
||||
if(cumulative) {
|
||||
int index = 0;
|
||||
boolean first = true;
|
||||
for(byte base: Bases.instance) {
|
||||
if(first) {
|
||||
first = false;
|
||||
continue;
|
||||
}
|
||||
countArray[index++] = getCumulative(base);
|
||||
}
|
||||
countArray[countArray.length-1] = getTotal();
|
||||
}
|
||||
else {
|
||||
int index = 0;
|
||||
for(byte base: Bases.instance)
|
||||
countArray[index++] = counts.get(base);
|
||||
}
|
||||
return countArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a unique copy of the current object.
|
||||
* @return A duplicate of this object.
|
||||
*/
|
||||
public Counts clone() {
|
||||
Counts other;
|
||||
try {
|
||||
other = (Counts)super.clone();
|
||||
}
|
||||
catch(CloneNotSupportedException ex) {
|
||||
throw new ReviewedStingException("Unable to clone counts object", ex);
|
||||
}
|
||||
other.counts = new HashMap<Byte,Long>(counts);
|
||||
other.cumulativeCounts = new HashMap<Byte,Long>(cumulativeCounts);
|
||||
return other;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the number of bases seen at the given location.
|
||||
* @param base Base to increment.
|
||||
*/
|
||||
public void increment(byte base) {
|
||||
counts.put(base,counts.get(base)+1);
|
||||
boolean increment = false;
|
||||
for(byte cumulative: Bases.instance) {
|
||||
if(increment) cumulativeCounts.put(cumulative,cumulativeCounts.get(cumulative)+1);
|
||||
increment |= (cumulative == base);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a count of the number of bases seen at a given location.
|
||||
* Note that counts in this case are not cumulative (counts for A,C,G,T
|
||||
* are independent).
|
||||
* @param base Base for which to query counts.
|
||||
* @return Number of bases of this type seen.
|
||||
*/
|
||||
public long get(byte base) {
|
||||
return counts.get(base);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a count of the number of bases seen before this base.
|
||||
* Note that counts in this case are cumulative.
|
||||
* @param base Base for which to query counts.
|
||||
* @return Number of bases of this type seen.
|
||||
*/
|
||||
public long getCumulative(byte base) {
|
||||
return cumulativeCounts.get(base);
|
||||
}
|
||||
|
||||
/**
|
||||
* How many total bases are represented by this count structure?
|
||||
* @return Total bases represented.
|
||||
*/
|
||||
public long getTotal() {
|
||||
int accumulator = 0;
|
||||
for(byte base: Bases.instance) {
|
||||
accumulator += get(base);
|
||||
}
|
||||
return accumulator;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,200 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITHoc THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Create a suffix array data structure.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class CreateBWTFromReference {
|
||||
private byte[] loadReference( File inputFile ) {
|
||||
// Read in the first sequence in the input file
|
||||
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
|
||||
ReferenceSequence sequence = reference.nextSequence();
|
||||
return sequence.getBases();
|
||||
}
|
||||
|
||||
private byte[] loadReverseReference( File inputFile ) {
|
||||
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
|
||||
ReferenceSequence sequence = reference.nextSequence();
|
||||
PackUtils.reverse(sequence.getBases());
|
||||
return sequence.getBases();
|
||||
}
|
||||
|
||||
private Counts countOccurrences( byte[] sequence ) {
|
||||
Counts occurrences = new Counts();
|
||||
for( byte base: sequence )
|
||||
occurrences.increment(base);
|
||||
return occurrences;
|
||||
}
|
||||
|
||||
private long[] createSuffixArray( byte[] sequence ) {
|
||||
return SuffixArray.createFromReferenceSequence(sequence).sequence;
|
||||
}
|
||||
|
||||
private long[] invertSuffixArray( long[] suffixArray ) {
|
||||
long[] inverseSuffixArray = new long[suffixArray.length];
|
||||
for( int i = 0; i < suffixArray.length; i++ )
|
||||
inverseSuffixArray[(int)suffixArray[i]] = i;
|
||||
return inverseSuffixArray;
|
||||
}
|
||||
|
||||
private long[] createCompressedSuffixArray( int[] suffixArray, int[] inverseSuffixArray ) {
|
||||
long[] compressedSuffixArray = new long[suffixArray.length];
|
||||
compressedSuffixArray[0] = inverseSuffixArray[0];
|
||||
for( int i = 1; i < suffixArray.length; i++ )
|
||||
compressedSuffixArray[i] = inverseSuffixArray[suffixArray[i]+1];
|
||||
return compressedSuffixArray;
|
||||
}
|
||||
|
||||
private long[] createInversedCompressedSuffixArray( int[] compressedSuffixArray ) {
|
||||
long[] inverseCompressedSuffixArray = new long[compressedSuffixArray.length];
|
||||
for( int i = 0; i < compressedSuffixArray.length; i++ )
|
||||
inverseCompressedSuffixArray[compressedSuffixArray[i]] = i;
|
||||
return inverseCompressedSuffixArray;
|
||||
}
|
||||
|
||||
public static void main( String argv[] ) throws IOException {
|
||||
if( argv.length != 5 ) {
|
||||
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output rbwt> <output sa> <output rsa>");
|
||||
return;
|
||||
}
|
||||
|
||||
String inputFileName = argv[0];
|
||||
File inputFile = new File(inputFileName);
|
||||
|
||||
String bwtFileName = argv[1];
|
||||
File bwtFile = new File(bwtFileName);
|
||||
|
||||
String rbwtFileName = argv[2];
|
||||
File rbwtFile = new File(rbwtFileName);
|
||||
|
||||
String saFileName = argv[3];
|
||||
File saFile = new File(saFileName);
|
||||
|
||||
String rsaFileName = argv[4];
|
||||
File rsaFile = new File(rsaFileName);
|
||||
|
||||
CreateBWTFromReference creator = new CreateBWTFromReference();
|
||||
|
||||
byte[] sequence = creator.loadReference(inputFile);
|
||||
byte[] reverseSequence = creator.loadReverseReference(inputFile);
|
||||
|
||||
// Count the occurences of each given base.
|
||||
Counts occurrences = creator.countOccurrences(sequence);
|
||||
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Bases.A),
|
||||
occurrences.getCumulative(Bases.C),
|
||||
occurrences.getCumulative(Bases.G),
|
||||
occurrences.getCumulative(Bases.T));
|
||||
|
||||
// Generate the suffix array and print diagnostics.
|
||||
long[] suffixArrayData = creator.createSuffixArray(sequence);
|
||||
long[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence);
|
||||
|
||||
// Invert the suffix array and print diagnostics.
|
||||
long[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
|
||||
long[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData);
|
||||
|
||||
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
|
||||
SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData );
|
||||
|
||||
/*
|
||||
// Create the data structure for the compressed suffix array and print diagnostics.
|
||||
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
|
||||
int reconstructedInverseSA = compressedSuffixArray[0];
|
||||
for( int i = 0; i < 8; i++ ) {
|
||||
System.out.printf("compressedSuffixArray[%d] = %d (SA-1[%d] = %d)%n", i, compressedSuffixArray[i], i, reconstructedInverseSA);
|
||||
reconstructedInverseSA = compressedSuffixArray[reconstructedInverseSA];
|
||||
}
|
||||
|
||||
// Create the data structure for the inverse compressed suffix array and print diagnostics.
|
||||
int[] inverseCompressedSuffixArray = creator.createInversedCompressedSuffixArray(compressedSuffixArray);
|
||||
for( int i = 0; i < 8; i++ ) {
|
||||
System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
|
||||
}
|
||||
*/
|
||||
|
||||
// Create the BWT.
|
||||
BWT bwt = BWT.createFromReferenceSequence(sequence);
|
||||
BWT reverseBWT = BWT.createFromReferenceSequence(reverseSequence);
|
||||
|
||||
byte[] bwtSequence = bwt.getSequence();
|
||||
System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length());
|
||||
|
||||
BWTWriter bwtWriter = new BWTWriter(bwtFile);
|
||||
bwtWriter.write(bwt);
|
||||
bwtWriter.close();
|
||||
|
||||
BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile);
|
||||
reverseBWTWriter.write(reverseBWT);
|
||||
reverseBWTWriter.close();
|
||||
|
||||
/*
|
||||
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
|
||||
saWriter.write(suffixArray);
|
||||
saWriter.close();
|
||||
|
||||
SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
|
||||
reverseSAWriter.write(reverseSuffixArray);
|
||||
reverseSAWriter.close();
|
||||
*/
|
||||
|
||||
File existingBWTFile = new File(inputFileName+".bwt");
|
||||
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
|
||||
BWT existingBWT = existingBWTReader.read();
|
||||
|
||||
byte[] existingBWTSequence = existingBWT.getSequence();
|
||||
System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,80),existingBWT.length());
|
||||
|
||||
for( int i = 0; i < bwt.length(); i++ ) {
|
||||
if( bwtSequence[i] != existingBWTSequence[i] )
|
||||
throw new ReviewedStingException("BWT mismatch at " + i);
|
||||
}
|
||||
|
||||
File existingSAFile = new File(inputFileName+".sa");
|
||||
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT);
|
||||
SuffixArray existingSuffixArray = existingSuffixArrayReader.read();
|
||||
|
||||
for(int i = 0; i < suffixArray.length(); i++) {
|
||||
if( i % 10000 == 0 )
|
||||
System.out.printf("Validating suffix array entry %d%n", i);
|
||||
if( suffixArray.get(i) != existingSuffixArray.get(i) )
|
||||
throw new ReviewedStingException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i)));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
/**
|
||||
* Models a block of bases within the BWT.
|
||||
*/
|
||||
public class SequenceBlock {
|
||||
/**
|
||||
* Start position of this sequence within the BWT.
|
||||
*/
|
||||
public final int sequenceStart;
|
||||
|
||||
/**
|
||||
* Length of this sequence within the BWT.
|
||||
*/
|
||||
public final int sequenceLength;
|
||||
|
||||
|
||||
/**
|
||||
* Occurrences of each letter up to this sequence block.
|
||||
*/
|
||||
public final Counts occurrences;
|
||||
|
||||
/**
|
||||
* Sequence for this segment.
|
||||
*/
|
||||
public final byte[] sequence;
|
||||
|
||||
/**
|
||||
* Create a new block within this BWT.
|
||||
* @param sequenceStart Starting position of this sequence within the BWT.
|
||||
* @param sequenceLength Length of this sequence.
|
||||
* @param occurrences How many of each base has been seen before this sequence began.
|
||||
* @param sequence The actual sequence from the BWT.
|
||||
*/
|
||||
public SequenceBlock( int sequenceStart, int sequenceLength, Counts occurrences, byte[] sequence ) {
|
||||
this.sequenceStart = sequenceStart;
|
||||
this.sequenceLength = sequenceLength;
|
||||
this.occurrences = occurrences;
|
||||
this.sequence = sequence;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,158 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* An in-memory representation of a suffix array.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SuffixArray {
|
||||
public final long inverseSA0;
|
||||
public final Counts occurrences;
|
||||
|
||||
/**
|
||||
* The elements of the sequence actually stored in memory.
|
||||
*/
|
||||
protected final long[] sequence;
|
||||
|
||||
/**
|
||||
* How often are individual elements in the sequence actually stored
|
||||
* in memory, as opposed to being calculated on the fly?
|
||||
*/
|
||||
protected final int sequenceInterval;
|
||||
|
||||
/**
|
||||
* The BWT used to calculate missing portions of the sequence.
|
||||
*/
|
||||
protected final BWT bwt;
|
||||
|
||||
public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence) {
|
||||
this(inverseSA0,occurrences,sequence,1,null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new sequence array with the given inverse SA, occurrences, and values.
|
||||
* @param inverseSA0 Inverse SA entry for the first element.
|
||||
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
|
||||
* @param sequence The full suffix array.
|
||||
* @param sequenceInterval How frequently is the sequence interval stored.
|
||||
* @param bwt bwt used to infer the remaining entries in the BWT.
|
||||
*/
|
||||
public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence, int sequenceInterval, BWT bwt) {
|
||||
this.inverseSA0 = inverseSA0;
|
||||
this.occurrences = occurrences;
|
||||
this.sequence = sequence;
|
||||
this.sequenceInterval = sequenceInterval;
|
||||
this.bwt = bwt;
|
||||
|
||||
if(sequenceInterval != 1 && bwt == null)
|
||||
throw new ReviewedStingException("A BWT must be provided if the sequence interval is not 1");
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the length of the sequence array.
|
||||
* @return Length of the suffix array.
|
||||
*/
|
||||
public long length() {
|
||||
if( bwt != null )
|
||||
return bwt.length()+1;
|
||||
else
|
||||
return sequence.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the suffix array value at a given sequence.
|
||||
* @param index Index at which to retrieve the suffix array vaule.
|
||||
* @return The suffix array value at that entry.
|
||||
*/
|
||||
public long get(long index) {
|
||||
int iterations = 0;
|
||||
while(index%sequenceInterval != 0) {
|
||||
// The inverseSA0 ('$') doesn't have a usable ASCII representation; it must be treated as a special case.
|
||||
if(index == inverseSA0)
|
||||
index = 0;
|
||||
else {
|
||||
byte base = bwt.getBase(index);
|
||||
index = bwt.counts(base) + bwt.occurrences(base,index);
|
||||
}
|
||||
iterations++;
|
||||
}
|
||||
return (sequence[(int)(index/sequenceInterval)]+iterations) % length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a suffix array from a given reference sequence.
|
||||
* @param sequence The reference sequence to use when building the suffix array.
|
||||
* @return a constructed suffix array.
|
||||
*/
|
||||
public static SuffixArray createFromReferenceSequence(byte[] sequence) {
|
||||
// The builder for the suffix array. Use an integer in this case because
|
||||
// Java arrays can only hold an integer.
|
||||
TreeSet<Integer> suffixArrayBuilder = new TreeSet<Integer>(new SuffixArrayComparator(sequence));
|
||||
|
||||
Counts occurrences = new Counts();
|
||||
for( byte base: sequence )
|
||||
occurrences.increment(base);
|
||||
|
||||
// Build out the suffix array using a custom comparator.
|
||||
for( int i = 0; i <= sequence.length; i++ )
|
||||
suffixArrayBuilder.add(i);
|
||||
|
||||
// Copy the suffix array into an array.
|
||||
long[] suffixArray = new long[suffixArrayBuilder.size()];
|
||||
int i = 0;
|
||||
for( Integer element: suffixArrayBuilder )
|
||||
suffixArray[i++] = element;
|
||||
|
||||
// Find the first element in the inverse suffix array.
|
||||
long inverseSA0 = -1;
|
||||
for(i = 0; i < suffixArray.length; i++) {
|
||||
if(suffixArray[i] == 0)
|
||||
inverseSA0 = i;
|
||||
}
|
||||
if(inverseSA0 < 0)
|
||||
throw new ReviewedStingException("Unable to find first inverse SA entry in generated suffix array.");
|
||||
|
||||
return new SuffixArray(inverseSA0,occurrences,suffixArray);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two suffix arrays of the given sequence. Will return whichever string appears
|
||||
* first in lexicographic order.
|
||||
*/
|
||||
private static class SuffixArrayComparator implements Comparator<Integer> {
|
||||
/**
|
||||
* The data source for all suffix arrays.
|
||||
*/
|
||||
private final String sequence;
|
||||
|
||||
/**
|
||||
* Create a new comparator.
|
||||
* @param sequence Reference sequence to use as basis for comparison.
|
||||
*/
|
||||
public SuffixArrayComparator( byte[] sequence ) {
|
||||
// Processing the suffix array tends to be easier as a string.
|
||||
this.sequence = StringUtil.bytesToString(sequence);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare the two given suffix arrays. Criteria for comparison is the lexicographic order of
|
||||
* the two substrings sequence[lhs:], sequence[rhs:].
|
||||
* @param lhs Left-hand side of comparison.
|
||||
* @param rhs Right-hand side of comparison.
|
||||
* @return How the suffix arrays represented by lhs, rhs compare.
|
||||
*/
|
||||
public int compare( Integer lhs, Integer rhs ) {
|
||||
String lhsSuffixArray = sequence.substring(lhs);
|
||||
String rhsSuffixArray = sequence.substring(rhs);
|
||||
return lhsSuffixArray.compareTo(rhsSuffixArray);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* A reader for suffix arrays in permanent storage.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SuffixArrayReader {
|
||||
/**
|
||||
* Input stream from which to read suffix array data.
|
||||
*/
|
||||
private FileInputStream inputStream;
|
||||
|
||||
/**
|
||||
* BWT to use to fill in missing data.
|
||||
*/
|
||||
private BWT bwt;
|
||||
|
||||
/**
|
||||
* Create a new suffix array reader.
|
||||
* @param inputFile File in which the suffix array is stored.
|
||||
* @param bwt BWT to use when filling in missing data.
|
||||
*/
|
||||
public SuffixArrayReader(File inputFile, BWT bwt) {
|
||||
try {
|
||||
this.inputStream = new FileInputStream(inputFile);
|
||||
this.bwt = bwt;
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new ReviewedStingException("Unable to open input file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a suffix array from the input stream.
|
||||
* @return The suffix array stored in the input stream.
|
||||
*/
|
||||
public SuffixArray read() {
|
||||
UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
long inverseSA0;
|
||||
long[] occurrences;
|
||||
long[] suffixArray;
|
||||
int suffixArrayInterval;
|
||||
|
||||
try {
|
||||
inverseSA0 = uintPackedInputStream.read();
|
||||
occurrences = new long[PackUtils.ALPHABET_SIZE];
|
||||
uintPackedInputStream.read(occurrences);
|
||||
// Throw away the suffix array size in bytes and use the occurrences table directly.
|
||||
suffixArrayInterval = (int)uintPackedInputStream.read();
|
||||
suffixArray = new long[(int)((occurrences[occurrences.length-1]+suffixArrayInterval-1)/suffixArrayInterval)];
|
||||
uintPackedInputStream.read(suffixArray);
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
|
||||
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray, suffixArrayInterval, bwt);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Close the input stream.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
inputStream.close();
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new ReviewedStingException("Unable to close input file", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||
|
||||
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Javadoc goes here.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SuffixArrayWriter {
|
||||
/**
|
||||
* Input stream from which to read suffix array data.
|
||||
*/
|
||||
private OutputStream outputStream;
|
||||
|
||||
/**
|
||||
* Create a new suffix array reader.
|
||||
* @param outputFile File in which the suffix array is stored.
|
||||
*/
|
||||
public SuffixArrayWriter( File outputFile ) {
|
||||
try {
|
||||
this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new ReviewedStingException("Unable to open input file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a suffix array to the output stream.
|
||||
* @param suffixArray suffix array to write.
|
||||
*/
|
||||
public void write(SuffixArray suffixArray) {
|
||||
UnsignedIntPackedOutputStream uintPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
try {
|
||||
uintPackedOutputStream.write(suffixArray.inverseSA0);
|
||||
uintPackedOutputStream.write(suffixArray.occurrences.toArray(true));
|
||||
// How frequently the suffix array entry is placed.
|
||||
uintPackedOutputStream.write(1);
|
||||
// Length of the suffix array.
|
||||
uintPackedOutputStream.write(suffixArray.length()-1);
|
||||
uintPackedOutputStream.write(suffixArray.sequence,1,suffixArray.sequence.length-1);
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Close the input stream.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
outputStream.close();
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new ReviewedStingException("Unable to close input file", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
package org.broadinstitute.sting.alignment.reference.packing;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
|
||||
/**
|
||||
* Reads a packed version of the input stream.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BasePackedInputStream<T> {
|
||||
/**
|
||||
* Type of object to unpack.
|
||||
*/
|
||||
private final Class<T> type;
|
||||
|
||||
/**
|
||||
* Ultimate source for packed bases.
|
||||
*/
|
||||
private final FileInputStream targetInputStream;
|
||||
|
||||
/**
|
||||
* Channel source for packed bases.
|
||||
*/
|
||||
private final FileChannel targetInputChannel;
|
||||
|
||||
/**
|
||||
* A fixed-size buffer for word-packed data.
|
||||
*/
|
||||
private final ByteOrder byteOrder;
|
||||
|
||||
/**
|
||||
* How many bases are in a given packed word.
|
||||
*/
|
||||
private final int basesPerPackedWord = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE;
|
||||
|
||||
/**
|
||||
* How many bytes in an integer?
|
||||
*/
|
||||
private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE;
|
||||
|
||||
|
||||
public BasePackedInputStream( Class<T> type, File inputFile, ByteOrder byteOrder ) throws FileNotFoundException {
|
||||
this(type,new FileInputStream(inputFile),byteOrder);
|
||||
}
|
||||
|
||||
public BasePackedInputStream( Class<T> type, FileInputStream inputStream, ByteOrder byteOrder ) {
|
||||
if( type != Integer.class )
|
||||
throw new ReviewedStingException("Only bases packed into 32-bit words are currently supported by this input stream. Type specified: " + type.getName());
|
||||
this.type = type;
|
||||
this.targetInputStream = inputStream;
|
||||
this.targetInputChannel = inputStream.getChannel();
|
||||
this.byteOrder = byteOrder;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the entire contents of the input stream.
|
||||
* @param bwt array into which bases should be read.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void read(byte[] bwt) throws IOException {
|
||||
read(bwt,0,bwt.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the next <code>length</code> bases into the bwt array, starting at the given offset.
|
||||
* @param bwt array holding the given data.
|
||||
* @param offset target position in the bases array into which bytes should be written.
|
||||
* @param length number of bases to read from the stream.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void read(byte[] bwt, int offset, int length) throws IOException {
|
||||
int bufferWidth = ((bwt.length+basesPerPackedWord-1)/basesPerPackedWord)*bytesPerInteger;
|
||||
ByteBuffer buffer = ByteBuffer.allocate(bufferWidth).order(byteOrder);
|
||||
targetInputChannel.read(buffer);
|
||||
targetInputChannel.position(targetInputChannel.position()+buffer.remaining());
|
||||
buffer.flip();
|
||||
|
||||
int packedWord = 0;
|
||||
int i = 0;
|
||||
while(i < length) {
|
||||
if(i % basesPerPackedWord == 0) packedWord = buffer.getInt();
|
||||
int position = basesPerPackedWord - i%basesPerPackedWord - 1;
|
||||
bwt[offset+i++] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
package org.broadinstitute.sting.alignment.reference.packing;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* A general-purpose stream for writing packed bases.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BasePackedOutputStream<T> {
|
||||
/**
|
||||
* Type of object to pack.
|
||||
*/
|
||||
private final Class<T> type;
|
||||
|
||||
/**
|
||||
* How many bases can be stored in the given data structure?
|
||||
*/
|
||||
private final int basesPerType;
|
||||
|
||||
/**
|
||||
* Ultimate target for the packed bases.
|
||||
*/
|
||||
private final OutputStream targetOutputStream;
|
||||
|
||||
/**
|
||||
* A fixed-size buffer for word-packed data.
|
||||
*/
|
||||
private final ByteBuffer buffer;
|
||||
|
||||
public BasePackedOutputStream( Class<T> type, File outputFile, ByteOrder byteOrder ) throws FileNotFoundException {
|
||||
this(type,new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write packed bases to the given output stream.
|
||||
* @param type Type of data to pack bases into.
|
||||
* @param outputStream Output stream to which to write packed bases.
|
||||
* @param byteOrder Switch between big endian / little endian when reading / writing files.
|
||||
*/
|
||||
public BasePackedOutputStream( Class<T> type, OutputStream outputStream, ByteOrder byteOrder) {
|
||||
this.targetOutputStream = outputStream;
|
||||
this.type = type;
|
||||
basesPerType = PackUtils.bitsInType(type)/PackUtils.BITS_PER_BASE;
|
||||
this.buffer = ByteBuffer.allocate(basesPerType/PackUtils.ALPHABET_SIZE).order(byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the given base to the output stream. Will write only this base; no packing will be performed.
|
||||
* @param base List of bases to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( int base ) throws IOException {
|
||||
write( new byte[] { (byte)base } );
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes an array of bases to the target output stream.
|
||||
* @param bases List of bases to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( byte[] bases ) throws IOException {
|
||||
write(bases,0,bases.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a subset of the array of bases to the output stream.
|
||||
* @param bases List of bases to write.
|
||||
* @param offset site at which to start writing.
|
||||
* @param length number of bases to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( byte[] bases, int offset, int length ) throws IOException {
|
||||
int packedBases = 0;
|
||||
int positionInPack = 0;
|
||||
|
||||
for( int base = offset; base < offset+length; base++ ) {
|
||||
packedBases = packBase(bases[base], packedBases, positionInPack);
|
||||
|
||||
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
|
||||
positionInPack = ++positionInPack % basesPerType;
|
||||
if( positionInPack == 0 ) {
|
||||
writePackedBases(packedBases);
|
||||
packedBases = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if( positionInPack > 0 )
|
||||
writePackedBases(packedBases);
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush the contents of the OutputStream to disk.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void flush() throws IOException {
|
||||
targetOutputStream.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the given output stream.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
targetOutputStream.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Pack the given base into the basepack.
|
||||
* @param base The base to pack.
|
||||
* @param basePack Target for the pack operation.
|
||||
* @param position Position within the pack to which to add the base.
|
||||
* @return The packed integer.
|
||||
*/
|
||||
private int packBase( byte base, int basePack, int position ) {
|
||||
basePack |= (PackUtils.packBase(base) << 2*(basesPerType-position-1));
|
||||
return basePack;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the given packed base structure to the output file.
|
||||
* @param packedBases Packed bases to write.
|
||||
* @throws IOException on error writing to the file.
|
||||
*/
|
||||
private void writePackedBases(int packedBases) throws IOException {
|
||||
buffer.rewind();
|
||||
if( type == Integer.class )
|
||||
buffer.putInt(packedBases);
|
||||
else if( type == Byte.class )
|
||||
buffer.put((byte)packedBases);
|
||||
else
|
||||
throw new ReviewedStingException("Cannot pack bases into type " + type.getName());
|
||||
targetOutputStream.write(buffer.array());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment.reference.packing;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Generate a .PAC file from a given reference.
|
||||
*
|
||||
* @author hanna
|
||||
* @version 0.1
|
||||
*/
|
||||
|
||||
public class CreatePACFromReference {
|
||||
public static void main( String argv[] ) throws IOException {
|
||||
if( argv.length != 3 ) {
|
||||
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output pac> <output rpac>");
|
||||
return;
|
||||
}
|
||||
|
||||
// Read in the first sequence in the input file
|
||||
String inputFileName = argv[0];
|
||||
File inputFile = new File(inputFileName);
|
||||
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
|
||||
ReferenceSequence sequence = reference.nextSequence();
|
||||
|
||||
// Target file for output
|
||||
PackUtils.writeReferenceSequence( new File(argv[1]), sequence.getBases() );
|
||||
|
||||
// Reverse the bases in the reference
|
||||
PackUtils.reverse(sequence.getBases());
|
||||
|
||||
// Target file for output
|
||||
PackUtils.writeReferenceSequence( new File(argv[2]), sequence.getBases() );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,135 @@
|
|||
package org.broadinstitute.sting.alignment.reference.packing;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Utilities designed for packing / unpacking bases.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class PackUtils {
|
||||
/**
|
||||
* How many possible bases can be encoded?
|
||||
*/
|
||||
public static final int ALPHABET_SIZE = 4;
|
||||
|
||||
/**
|
||||
* How many bits does it take to store a single base?
|
||||
*/
|
||||
public static final int BITS_PER_BASE = (int)(Math.log(ALPHABET_SIZE)/Math.log(2));
|
||||
|
||||
/**
|
||||
* How many bits fit into a single byte?
|
||||
*/
|
||||
public static final int BITS_PER_BYTE = 8;
|
||||
|
||||
/**
|
||||
* Writes a reference sequence to a PAC file.
|
||||
* @param outputFile Filename for the PAC file.
|
||||
* @param referenceSequence Reference sequence to write.
|
||||
* @throws IOException If there's a problem writing to the output file.
|
||||
*/
|
||||
public static void writeReferenceSequence( File outputFile, byte[] referenceSequence ) throws IOException {
|
||||
OutputStream outputStream = new FileOutputStream(outputFile);
|
||||
|
||||
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
|
||||
basePackedOutputStream.write(referenceSequence);
|
||||
|
||||
outputStream.write(referenceSequence.length%PackUtils.ALPHABET_SIZE);
|
||||
|
||||
outputStream.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* How many bits can a given type hold?
|
||||
* @param type Type to test.
|
||||
* @return Number of bits that the given type can hold.
|
||||
*/
|
||||
public static int bitsInType( Class<?> type ) {
|
||||
try {
|
||||
long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1;
|
||||
long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1;
|
||||
if( typeSize > intTypeSize )
|
||||
throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName());
|
||||
return (int)(Math.log(typeSize)/Math.log(2));
|
||||
}
|
||||
catch( NoSuchFieldException ex ) {
|
||||
throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex);
|
||||
}
|
||||
catch( IllegalAccessException ex ) {
|
||||
throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b.
|
||||
* @param base ASCII value for the base to pack.
|
||||
* @return A byte from 0-3 indicating the base's packed value.
|
||||
*/
|
||||
public static byte packBase(byte base) {
|
||||
switch( base ) {
|
||||
case 'A':
|
||||
return 0;
|
||||
case 'C':
|
||||
return 1;
|
||||
case 'G':
|
||||
return 2;
|
||||
case 'T':
|
||||
return 3;
|
||||
default:
|
||||
throw new ReviewedStingException("Unknown base type: " + base);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a two-bit representation of a base into an ASCII representation of a base.
|
||||
* @param pack Byte from 0-3 indicating which base is represented.
|
||||
* @return An ASCII value representing the packed base.
|
||||
*/
|
||||
public static byte unpackBase(byte pack) {
|
||||
switch( pack ) {
|
||||
case 0:
|
||||
return 'A';
|
||||
case 1:
|
||||
return 'C';
|
||||
case 2:
|
||||
return 'G';
|
||||
case 3:
|
||||
return 'T';
|
||||
default:
|
||||
throw new ReviewedStingException("Unknown pack type: " + pack);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverses an unpacked sequence of bases.
|
||||
* @param bases bases to reverse.
|
||||
*/
|
||||
public static void reverse( byte[] bases ) {
|
||||
for( int i = 0, j = bases.length-1; i < j; i++, j-- ) {
|
||||
byte temp = bases[j];
|
||||
bases[j] = bases[i];
|
||||
bases[i] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a structure of size <code>size</code> that should be split
|
||||
* into <code>partitionSize</code> partitions, how many partitions should
|
||||
* be created? Size of last partition will be <= partitionSize.
|
||||
* @param size Total size of the data structure.
|
||||
* @param partitionSize Size of an individual partition.
|
||||
* @return Number of partitions that would be created.
|
||||
*/
|
||||
public static int numberOfPartitions( long size, long partitionSize ) {
|
||||
return (int)((size+partitionSize-1) / partitionSize);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,104 @@
|
|||
package org.broadinstitute.sting.alignment.reference.packing;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
|
||||
/**
|
||||
* Read a set of integers packed into
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class UnsignedIntPackedInputStream {
|
||||
/**
|
||||
* Ultimate target for the occurrence array.
|
||||
*/
|
||||
private final FileInputStream targetInputStream;
|
||||
|
||||
/**
|
||||
* Target channel from which to pull file data.
|
||||
*/
|
||||
private final FileChannel targetInputChannel;
|
||||
|
||||
/**
|
||||
* The byte order in which integer input data appears.
|
||||
*/
|
||||
private final ByteOrder byteOrder;
|
||||
|
||||
/**
|
||||
* How many bytes are required to store an integer?
|
||||
*/
|
||||
private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE;
|
||||
|
||||
/**
|
||||
* Create a new PackedIntInputStream, writing to the given target file.
|
||||
* @param inputFile target input file.
|
||||
* @param byteOrder Endianness to use when writing a list of integers.
|
||||
* @throws java.io.IOException if an I/O error occurs.
|
||||
*/
|
||||
public UnsignedIntPackedInputStream(File inputFile, ByteOrder byteOrder) throws IOException {
|
||||
this(new FileInputStream(inputFile),byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read ints from the given InputStream.
|
||||
* @param inputStream Input stream from which to read ints.
|
||||
* @param byteOrder Endianness to use when writing a list of integers.
|
||||
*/
|
||||
public UnsignedIntPackedInputStream(FileInputStream inputStream, ByteOrder byteOrder) {
|
||||
this.targetInputStream = inputStream;
|
||||
this.targetInputChannel = inputStream.getChannel();
|
||||
this.byteOrder = byteOrder;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a datum from the input stream.
|
||||
* @return The next input datum in the stream.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public long read() throws IOException {
|
||||
long[] data = new long[1];
|
||||
read(data);
|
||||
return data[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the data from the input stream.
|
||||
* @param data placeholder for input data.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void read( long[] data ) throws IOException {
|
||||
read( data, 0, data.length );
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the data from the input stream, starting at the given offset.
|
||||
* @param data placeholder for input data.
|
||||
* @param offset place in the array to start reading in data.
|
||||
* @param length number of ints to read in.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void read( long[] data, int offset, int length ) throws IOException {
|
||||
ByteBuffer readBuffer = ByteBuffer.allocate(bytesPerInteger*length).order(byteOrder);
|
||||
|
||||
targetInputChannel.read(readBuffer,targetInputChannel.position());
|
||||
readBuffer.flip();
|
||||
targetInputChannel.position(targetInputChannel.position()+readBuffer.remaining());
|
||||
|
||||
int i = 0;
|
||||
while(i < length)
|
||||
data[offset+i++] = readBuffer.getInt() & 0xFFFFFFFFL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the given output stream.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
targetInputStream.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment.reference.packing;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Writes an list of integers to the output file.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class UnsignedIntPackedOutputStream {
|
||||
/**
|
||||
* Ultimate target for the occurrence array.
|
||||
*/
|
||||
private final OutputStream targetOutputStream;
|
||||
|
||||
/**
|
||||
* A fixed-size buffer for int-packed data.
|
||||
*/
|
||||
private final ByteBuffer buffer;
|
||||
|
||||
/**
|
||||
* Create a new PackedIntOutputStream, writing to the given target file.
|
||||
* @param outputFile target output file.
|
||||
* @param byteOrder Endianness to use when writing a list of integers.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public UnsignedIntPackedOutputStream(File outputFile, ByteOrder byteOrder) throws IOException {
|
||||
this(new FileOutputStream(outputFile),byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write packed ints to the given OutputStream.
|
||||
* @param outputStream Output stream to which to write packed ints.
|
||||
* @param byteOrder Endianness to use when writing a list of integers.
|
||||
*/
|
||||
public UnsignedIntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) {
|
||||
this.targetOutputStream = outputStream;
|
||||
buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the data to the output stream.
|
||||
* @param datum datum to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( long datum ) throws IOException {
|
||||
buffer.rewind();
|
||||
buffer.putInt((int)datum);
|
||||
targetOutputStream.write(buffer.array());
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the data to the output stream.
|
||||
* @param data data to write. occurrences.length must match alphabet size.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( long[] data ) throws IOException {
|
||||
for(long datum: data)
|
||||
write(datum);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the given chunk of data to the input stream.
|
||||
* @param data data to write.
|
||||
* @param offset position at which to start.
|
||||
* @param length number of ints to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( long[] data, int offset, int length ) throws IOException {
|
||||
for( int i = offset; i < offset+length; i++ )
|
||||
write(data[i]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush the contents of the OutputStream to disk.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void flush() throws IOException {
|
||||
targetOutputStream.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the given output stream.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
targetOutputStream.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.sequenom;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Create a mask for use with the PickSequenomProbes walker.
|
||||
*/
|
||||
public class CreateSequenomMask extends RodWalker<Integer, Integer> {
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
public void initialize() {}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
int result = 0;
|
||||
for ( VariantContext vc : tracker.getAllVariantContexts(ref) ) {
|
||||
if ( vc.isSNP() ) {
|
||||
GenomeLoc loc = context.getLocation();
|
||||
out.println(loc.getContig() + "\t" + (loc.getStart()-1) + "\t" + loc.getStop());
|
||||
result = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer sum) {
|
||||
logger.info("Found " + sum + " masking sites.");
|
||||
}
|
||||
}
|
||||
|
|
@ -1,332 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.sequenom;
|
||||
|
||||
import org.broad.tribble.bed.BEDCodec;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Generates Sequenom probe information given a single variant track. Emitted is the variant
|
||||
* along with the 200 reference bases on each side of the variant.
|
||||
*/
|
||||
@WalkerName("PickSequenomProbes")
|
||||
@Requires(value={DataSource.REFERENCE})
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
public class PickSequenomProbes extends RodWalker<String, String> {
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
@Argument(required=false, shortName="snp_mask", doc="positions to be masked with N's")
|
||||
protected String SNP_MASK = null;
|
||||
@Argument(required=false, shortName="project_id",doc="If specified, all probenames will be prepended with 'project_id|'")
|
||||
String project_id = null;
|
||||
@Argument(required = false, shortName="omitWindow", doc = "If specified, the window appender will be omitted from the design files (e.g. \"_chr:start-stop\")")
|
||||
boolean omitWindow = false;
|
||||
@Argument(required = false, fullName="usePlinkRODNamingConvention", shortName="nameConvention",doc="Use the naming convention defined in PLINKROD")
|
||||
boolean useNamingConvention = false;
|
||||
@Argument(required = false, fullName="noMaskWindow",shortName="nmw",doc="Do not mask bases within X bases of an event when designing probes")
|
||||
int noMaskWindow = 0;
|
||||
@Argument(required = false, shortName="counter", doc = "If specified, unique count id (ordinal number) is added to the end of each assay name")
|
||||
boolean addCounter = false;
|
||||
|
||||
private byte [] maskFlags = new byte[401];
|
||||
|
||||
private LocationAwareSeekableRODIterator snpMaskIterator=null;
|
||||
|
||||
private GenomeLoc positionOfLastVariant = null;
|
||||
|
||||
private int cnt = 0;
|
||||
private int discarded = 0;
|
||||
|
||||
VariantCollection VCs ; // will keep a set of distinct variants at a given site
|
||||
private List<GenomeLoc> processedVariantsInScope = new LinkedList<GenomeLoc>();
|
||||
|
||||
public void initialize() {
|
||||
if ( SNP_MASK != null ) {
|
||||
logger.info("Loading SNP mask... ");
|
||||
ReferenceOrderedData snp_mask;
|
||||
//if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) {
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe);
|
||||
RMDTrack track = builder.createInstanceOfTrack(BEDCodec.class, new File(SNP_MASK));
|
||||
snpMaskIterator = new SeekableRODIterator(track.getHeader(),
|
||||
track.getSequenceDictionary(),
|
||||
getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
track.getIterator());
|
||||
//} else {
|
||||
// // TODO: fix me when Plink is back
|
||||
// throw new IllegalArgumentException("We currently do not support other snp_mask tracks (like Plink)");
|
||||
//}
|
||||
|
||||
}
|
||||
VCs = new VariantCollection();
|
||||
}
|
||||
|
||||
|
||||
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return "";
|
||||
|
||||
logger.debug("Probing " + ref.getLocus() + " " + ref.getWindow());
|
||||
|
||||
VCs.clear();
|
||||
VCs.addAll( tracker.getAllVariantContexts(ref), ref.getLocus() );
|
||||
|
||||
discarded += VCs.discarded();
|
||||
|
||||
if ( VCs.size() == 0 ) {
|
||||
logger.debug(" Context empty");
|
||||
return "";
|
||||
}
|
||||
|
||||
if ( VCs.size() > 1 ) {
|
||||
logger.debug(" "+VCs.size()+ " variants at the locus");
|
||||
}
|
||||
|
||||
// System.out.print("At locus "+ref.getLocus()+": ");
|
||||
// for ( VariantContext vc : VCs ) {
|
||||
// System.out.println(vc.toString());
|
||||
// }
|
||||
|
||||
// little optimization: since we may have few events at the current site on the reference,
|
||||
// we are going to make sure we compute the mask and ref bases only once for each location and only if we need to
|
||||
boolean haveMaskForWindow = false;
|
||||
boolean haveBasesForWindow = false;
|
||||
String leading_bases = null;
|
||||
String trailing_bases = null;
|
||||
|
||||
StringBuilder assaysForLocus = new StringBuilder(""); // all assays for current locus will be collected here (will be multi-line if multiple events are assayed)
|
||||
|
||||
// get all variant contexts!!!!
|
||||
for ( VariantContext vc : VCs ) {
|
||||
|
||||
// we can only deal with biallelic sites for now
|
||||
if ( !vc.isBiallelic() ) {
|
||||
logger.debug(" Not biallelic; skipped");
|
||||
continue;
|
||||
}
|
||||
|
||||
// we don't want to see the same multi-base event (deletion, DNP etc) multiple times.
|
||||
// All the vcs we are currently seeing are clearly on the same contig as the current reference
|
||||
// poisiton (or we would not see them at all!). All we need to check is if the vc starts at the
|
||||
// current reference position (i.e. it is the first time we see it) or not (i.e. we saw it already).
|
||||
if ( ref.getLocus().getStart() != vc.getStart() )
|
||||
continue;
|
||||
|
||||
if ( ! haveMaskForWindow ) {
|
||||
String contig = context.getLocation().getContig();
|
||||
int offset = context.getLocation().getStart();
|
||||
int true_offset = offset - 200;
|
||||
|
||||
// we have variant; let's load all the snps falling into the current window and prepare the mask array.
|
||||
// we need to do it only once per window, regardless of how many vcs we may have at this location!
|
||||
if ( snpMaskIterator != null ) {
|
||||
// clear the mask
|
||||
for ( int i = 0 ; i < 401; i++ )
|
||||
maskFlags[i] = 0;
|
||||
|
||||
RODRecordList snpList = snpMaskIterator.seekForward(getToolkit().getGenomeLocParser().createGenomeLoc(contig,offset-200,offset+200));
|
||||
if ( snpList != null && snpList.size() != 0 ) {
|
||||
Iterator<GATKFeature> snpsInWindow = snpList.iterator();
|
||||
int i = 0;
|
||||
while ( snpsInWindow.hasNext() ) {
|
||||
GenomeLoc snp = snpsInWindow.next().getLocation();
|
||||
// we don't really want to mask out multi-base indels
|
||||
if ( snp.size() > 1 )
|
||||
continue;
|
||||
logger.debug(" SNP at "+snp.getStart());
|
||||
int offsetInWindow = (int)(snp.getStart() - true_offset);
|
||||
maskFlags[offsetInWindow] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
haveMaskForWindow = true; // if we use masking, we will probably need to recompute the window...
|
||||
}
|
||||
|
||||
if ( ! haveBasesForWindow ) {
|
||||
byte[] context_bases = ref.getBases();
|
||||
for (int i = 0; i < 401; i++) {
|
||||
if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) {
|
||||
context_bases[i] = 'N';
|
||||
}
|
||||
}
|
||||
leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200));
|
||||
trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401));
|
||||
// masked bases are not gonna change for the current window, unless we use windowed masking;
|
||||
// in the latter case the bases (N's) will depend on the event we are currently looking at,
|
||||
// so we better recompute..
|
||||
if ( noMaskWindow == 0 ) haveBasesForWindow = true;
|
||||
}
|
||||
|
||||
|
||||
// below, build single assay line for the current VC:
|
||||
|
||||
String assay_sequence;
|
||||
if ( vc.isSNP() )
|
||||
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
|
||||
else if ( vc.isMNP() )
|
||||
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1);
|
||||
else if ( vc.isInsertion() )
|
||||
assay_sequence = leading_bases + (char)ref.getBase() + "[-/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
|
||||
else if ( vc.isDeletion() )
|
||||
assay_sequence = leading_bases + (char)ref.getBase() + "[" + new String(vc.getReference().getBases()) + "/-]" + trailing_bases.substring(vc.getReference().length());
|
||||
else
|
||||
continue;
|
||||
|
||||
StringBuilder assay_id = new StringBuilder();
|
||||
if ( project_id != null ) {
|
||||
assay_id.append(project_id);
|
||||
assay_id.append('|');
|
||||
}
|
||||
if ( useNamingConvention ) {
|
||||
assay_id.append('c');
|
||||
assay_id.append(context.getLocation().toString().replace(":","_p"));
|
||||
} else {
|
||||
assay_id.append(context.getLocation().toString().replace(':','_'));
|
||||
}
|
||||
if ( vc.isInsertion() ) assay_id.append("_gI");
|
||||
else if ( vc.isDeletion()) assay_id.append("_gD");
|
||||
|
||||
if ( ! omitWindow ) {
|
||||
assay_id.append("_");
|
||||
assay_id.append(ref.getWindow().toString().replace(':', '_'));
|
||||
}
|
||||
++cnt;
|
||||
if ( addCounter ) assay_id.append("_"+cnt);
|
||||
|
||||
assaysForLocus.append(assay_id);
|
||||
assaysForLocus.append('\t');
|
||||
assaysForLocus.append(assay_sequence);
|
||||
assaysForLocus.append('\n');
|
||||
}
|
||||
return assaysForLocus.toString();
|
||||
}
|
||||
|
||||
public String reduceInit() {
|
||||
return "";
|
||||
}
|
||||
|
||||
public String reduce(String data, String sum) {
|
||||
out.print(data);
|
||||
return "";
|
||||
}
|
||||
|
||||
private int getNoMaskWindowRightEnd(VariantContext vc, int window) {
|
||||
if ( window == 0 ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ( vc.isInsertion() ) {
|
||||
return window-1;
|
||||
}
|
||||
|
||||
int max = 0;
|
||||
for (Allele a : vc.getAlleles() ) {
|
||||
if ( vc.isInsertion() ) {
|
||||
logger.debug("Getting length of allele "+a.toString()+" it is "+a.getBases().length+" (ref allele is "+vc.getReference().toString()+")");
|
||||
}
|
||||
if ( a.getBases().length > max ) {
|
||||
max = a.getBases().length;
|
||||
}
|
||||
}
|
||||
return max+window-1;
|
||||
}
|
||||
|
||||
public void onTraversalDone(String sum) {
|
||||
logger.info(cnt+" assay seqences generated");
|
||||
logger.info(discarded+" events were found to be duplicates and discarded (no redundant assays generated)");
|
||||
}
|
||||
|
||||
static class EventComparator implements Comparator<VariantContext> {
|
||||
|
||||
public int compare(VariantContext o1, VariantContext o2) {
|
||||
// if variants start at different positions, they are different. All we actually
|
||||
// care about is detecting the variants that are strictly the same; the actual ordering of distinct variants
|
||||
// (which one we deem less and which one greater) is utterly unimportant. We just need to be consistent.
|
||||
if ( o1.getStart() < o2.getStart() ) return -1;
|
||||
if ( o1.getStart() > o2.getStart() ) return 1;
|
||||
|
||||
if ( o1.getType() != o2.getType() ) return o1.getType().compareTo(o2.getType());
|
||||
|
||||
int refComp = o1.getReference().compareTo(o2.getReference());
|
||||
if ( refComp != 0 ) return refComp;
|
||||
|
||||
return o1.getAlternateAllele(0).compareTo(o2.getAlternateAllele(0));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static class VariantCollection implements Iterable<VariantContext> {
|
||||
TreeSet<VariantContext> variants = new TreeSet(new EventComparator());
|
||||
int discarded = 0;
|
||||
|
||||
public void add(VariantContext vc, GenomeLoc current) {
|
||||
if ( vc.getStart() != current.getStart() ) return; // we add only variants that start at current locus
|
||||
// note that we do not check chr here, since the way this class is used, the mathod is always called with
|
||||
// VCs coming from the same metadata tracker, so they simply can not be on different chrs!
|
||||
if ( !vc.isBiallelic() ) {
|
||||
logger.info(" Non-biallelic variant encountered; skipped");
|
||||
return;
|
||||
}
|
||||
if ( variants.add(vc) == false ) discarded++;
|
||||
}
|
||||
|
||||
public void addAll(Collection<VariantContext> c, GenomeLoc current) {
|
||||
for ( VariantContext vc : c ) add(vc,current);
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
variants.clear();
|
||||
discarded = 0;
|
||||
}
|
||||
|
||||
public int discarded() { return discarded; }
|
||||
|
||||
public int size() { return variants.size(); }
|
||||
|
||||
public Iterator<VariantContext> iterator() { return variants.iterator(); }
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,413 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.validation;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.alignment.Alignment;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: 6/13/11
|
||||
* Time: 2:12 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
@Requires(value={DataSource.REFERENCE}, referenceMetaData={@RMD(name="ProbeIntervals",type=TableFeature.class),
|
||||
@RMD(name="ValidateAlleles",type=VariantContext.class),@RMD(name="MaskAlleles",type=VariantContext.class)})
|
||||
public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
||||
|
||||
@Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false)
|
||||
boolean lowerCaseSNPs = false;
|
||||
|
||||
@Argument(doc="Size of the virtual primer to use for lower-casing regions with low specificity",fullName="virtualPrimerSize",required=false)
|
||||
int virtualPrimerSize = 20;
|
||||
|
||||
@Argument(doc="Monomorphic sites in the mask file will be treated as filtered",fullName="filterMonomorphic",required=false)
|
||||
boolean filterMonomorphic = false;
|
||||
|
||||
@Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false)
|
||||
boolean doNotUseBWA = false;
|
||||
|
||||
GenomeLoc prevInterval;
|
||||
GenomeLoc allelePos;
|
||||
String probeName;
|
||||
StringBuilder sequence;
|
||||
StringBuilder rawSequence;
|
||||
boolean sequenceInvalid;
|
||||
List<String> invReason;
|
||||
int indelCounter;
|
||||
|
||||
@Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
|
||||
"generated by bwa index -d bwtsw. If unspecified, will default " +
|
||||
"to the reference specified via the -R argument.",required=false)
|
||||
private File targetReferenceFile = null;
|
||||
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
BWACAligner aligner = null;
|
||||
|
||||
private SAMFileHeader header = null;
|
||||
|
||||
public void initialize() {
|
||||
if ( ! doNotUseBWA ) {
|
||||
if(targetReferenceFile == null)
|
||||
targetReferenceFile = getToolkit().getArguments().referenceFile;
|
||||
BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
|
||||
BWAConfiguration configuration = new BWAConfiguration();
|
||||
aligner = new BWACAligner(bwtFiles,configuration);
|
||||
header = new SAMFileHeader();
|
||||
SAMSequenceDictionary referenceDictionary =
|
||||
ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
|
||||
header.setSequenceDictionary(referenceDictionary);
|
||||
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
}
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
prevInterval = null;
|
||||
sequence = null;
|
||||
rawSequence = null;
|
||||
sequenceInvalid = false;
|
||||
probeName = null;
|
||||
invReason = null;
|
||||
indelCounter = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null || ! tracker.hasROD("ProbeIntervals")) { return null; }
|
||||
|
||||
GenomeLoc interval = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getLocation();
|
||||
//logger.debug(interval);
|
||||
if ( prevInterval == null || ! interval.equals(prevInterval) ) {
|
||||
// we're in a new interval, we should:
|
||||
// 1) print out previous data
|
||||
// 2) reset internal data
|
||||
// 3) instantiate traversal of this interval
|
||||
|
||||
// step 1:
|
||||
if ( prevInterval != null ) {
|
||||
// there was a previous interval
|
||||
validateSequence(); // ensure the sequence in the region is valid
|
||||
// next line removed in favor of the one after
|
||||
if ( doNotUseBWA ) {
|
||||
lowerRepeats(); // change repeats in sequence to lower case
|
||||
} else {
|
||||
lowerNonUniqueSegments();
|
||||
}
|
||||
print(); // print out the fasta sequence
|
||||
}
|
||||
|
||||
// step 2:
|
||||
prevInterval = interval;
|
||||
allelePos = null;
|
||||
sequence = new StringBuilder();
|
||||
rawSequence = new StringBuilder();
|
||||
sequenceInvalid = false;
|
||||
invReason = new LinkedList<String>();
|
||||
logger.debug(Utils.join("\t",((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues()));
|
||||
probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getValue(1);
|
||||
indelCounter = 0;
|
||||
}
|
||||
|
||||
// step 3 (or 1 if not new):
|
||||
// build up the sequence
|
||||
|
||||
VariantContext mask = tracker.getVariantContext(ref,"MaskAlleles",ref.getLocus());
|
||||
VariantContext validate = tracker.getVariantContext(ref,"ValidateAlleles",ref.getLocus());
|
||||
|
||||
if ( mask == null && validate == null ) {
|
||||
if ( indelCounter > 0 ) {
|
||||
sequence.append('N');
|
||||
indelCounter--;
|
||||
} else {
|
||||
sequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||
}
|
||||
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||
} else if ( validate != null ) {
|
||||
// doesn't matter if there's a mask here too -- this is what we want to validate
|
||||
if ( validate.isFiltered() ) {
|
||||
logger.warn("You are attempting to validate a filtered site. Why are you attempting to validate a filtered site? You should not be attempting to validate a filtered site.");
|
||||
sequenceInvalid = true;
|
||||
invReason.add("SITE_IS_FILTERED");
|
||||
}
|
||||
if ( validate.isIndel() ) {
|
||||
sequence.append(Character.toUpperCase((char)ref.getBase()));
|
||||
rawSequence.append(Character.toUpperCase((char)ref.getBase()));
|
||||
}
|
||||
sequence.append('[');
|
||||
sequence.append(validate.getAlternateAllele(0).toString());
|
||||
sequence.append('/');
|
||||
sequence.append(validate.getReference().toString());
|
||||
sequence.append(']');
|
||||
// do this to the raw sequence to -- the indeces will line up that way
|
||||
rawSequence.append('[');
|
||||
rawSequence.append(validate.getAlternateAllele(0).getBaseString());
|
||||
rawSequence.append('/');
|
||||
rawSequence.append(validate.getReference().getBaseString());
|
||||
rawSequence.append(']');
|
||||
allelePos = ref.getLocus();
|
||||
if ( indelCounter > 0 ) {
|
||||
logger.warn("An indel event overlaps the event to be validated. This completely invalidates the probe.");
|
||||
sequenceInvalid = true;
|
||||
invReason.add("INDEL_OVERLAPS_VALIDATION_SITE");
|
||||
if ( validate.isSNP() ) {
|
||||
indelCounter--;
|
||||
} else {
|
||||
indelCounter -= validate.getEnd()-validate.getStart();
|
||||
}
|
||||
}
|
||||
} else /* (mask != null && validate == null ) */ {
|
||||
if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) {
|
||||
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
|
||||
logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
|
||||
sequenceInvalid = true;
|
||||
invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION");
|
||||
// note: indelCounter could be > 0 (could have small deletion within larger one). This always selects
|
||||
// the larger event.
|
||||
int indelCounterNew = mask.isInsertion() ? 2 : mask.getEnd()-mask.getStart();
|
||||
if ( indelCounterNew > indelCounter ) {
|
||||
indelCounter = indelCounterNew;
|
||||
}
|
||||
//sequence.append((char) ref.getBase());
|
||||
//sequence.append(mask.isInsertion() ? 'I' : 'D');
|
||||
sequence.append("N");
|
||||
indelCounter--;
|
||||
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||
} else if ( indelCounter > 0 ) {
|
||||
// previous section resets the indel counter. Doesn't matter if there's a SNP underlying this, we just want to append an 'N' and move on.
|
||||
sequence.append('N');
|
||||
indelCounter--;
|
||||
rawSequence.append(Character.toUpperCase((char)ref.getBase()));
|
||||
} else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )){
|
||||
logger.debug("SNP in mask found at " + ref.getLocus().toString());
|
||||
|
||||
if ( lowerCaseSNPs ) {
|
||||
sequence.append(Character.toLowerCase((char) ref.getBase()));
|
||||
} else {
|
||||
sequence.append((char) BaseUtils.N);
|
||||
}
|
||||
|
||||
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||
} else if ( mask.isSNP() ) {
|
||||
logger.debug("SNP in mask found at "+ref.getLocus().toString()+" but was either filtered or monomorphic");
|
||||
sequence.append((Character.toUpperCase((char) ref.getBase())));
|
||||
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer i, Integer j) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer fin ) {
|
||||
validateSequence();
|
||||
if ( doNotUseBWA ) {
|
||||
lowerRepeats();
|
||||
} else {
|
||||
lowerNonUniqueSegments();
|
||||
}
|
||||
print();
|
||||
}
|
||||
|
||||
public void validateSequence() {
|
||||
// code for ensuring primer sequence is valid goes here
|
||||
|
||||
// validate that there are no masked sites near to the variant site
|
||||
String seq = sequence.toString();
|
||||
int start = seq.indexOf('[') - 4;
|
||||
int end = seq.indexOf(']') + 5;
|
||||
|
||||
if ( start < 50 ) {
|
||||
logger.warn("There is not enough sequence before the start position of the probed allele for adequate probe design. This site will not be designed.");
|
||||
sequenceInvalid = true;
|
||||
invReason.add("START_TOO_CLOSE");
|
||||
} else if ( end > seq.length() - 50 ) {
|
||||
logger.warn("There is not enough sequence after the end position of the probed allele fore adequate probe design. This site will not be desinged. ");
|
||||
sequenceInvalid = true;
|
||||
invReason.add("END_TOO_CLOSE");
|
||||
} else {
|
||||
boolean maskNearVariantSite = false;
|
||||
for ( int i = start; i < end; i++ ) {
|
||||
maskNearVariantSite |= (seq.charAt(i) == 'N' || Character.isLowerCase(seq.charAt(i)));
|
||||
}
|
||||
|
||||
if ( maskNearVariantSite ) {
|
||||
logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed.");
|
||||
sequenceInvalid = true;
|
||||
invReason.add("VARIANT_TOO_NEAR_PROBE");
|
||||
}
|
||||
}
|
||||
|
||||
if ( seq.indexOf("[") != seq.lastIndexOf("[") ) {
|
||||
logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap.");
|
||||
sequenceInvalid = true;
|
||||
invReason.add("MULTIPLE_PROBES");
|
||||
}
|
||||
|
||||
if ( seq.indexOf("[") < 0 ) {
|
||||
logger.warn("No variants in region were found. This site will not be designed.");
|
||||
sequenceInvalid = true;
|
||||
invReason.add("NO_VARIANTS_FOUND");
|
||||
}
|
||||
}
|
||||
|
||||
public void lowerNonUniqueSegments() {
|
||||
if ( ! invReason.contains("MULTIPLE_PROBES") && !invReason.contains("NO_VARIANTS_FOUND") ) {
|
||||
String leftFlank = rawSequence.toString().split("\\[")[0];
|
||||
String rightFlank = rawSequence.toString().split("\\]")[1];
|
||||
List<Integer> badLeft = getBadIndeces(leftFlank);
|
||||
List<Integer> badRight = getBadIndeces(rightFlank);
|
||||
// propagate lowercases into the printed sequence
|
||||
for ( int idx = 0; idx < leftFlank.length(); idx++ ) {
|
||||
while ( badLeft.size() > 0 && idx > badLeft.get(0) + virtualPrimerSize ) {
|
||||
badLeft.remove(0);
|
||||
}
|
||||
|
||||
if ( badLeft.size() > 0 && badLeft.get(0) <= idx && idx <= badLeft.get(0) + virtualPrimerSize ) {
|
||||
sequence.setCharAt(idx,Character.toLowerCase(sequence.charAt(idx)));
|
||||
}
|
||||
}
|
||||
|
||||
int offset = 1 + rawSequence.indexOf("]");
|
||||
for ( int i= 0; i < rightFlank.length(); i++ ) {
|
||||
int idx = i + offset;
|
||||
while ( badRight.size() > 0 && i > badRight.get(0) + virtualPrimerSize ) {
|
||||
//logger.debug("Removing "+badRight.get(0)+" because "+(badRight.get(0)+virtualPrimerSize)+" < "+i);
|
||||
badRight.remove(0);
|
||||
}
|
||||
|
||||
if ( badRight.size() > 0 && badRight.get(0) <= i && i <= badRight.get(0) + virtualPrimerSize ) {
|
||||
//logger.debug("Resetting character on right flank: "+idx+" "+i+" offset="+offset);
|
||||
//logger.debug(sequence);
|
||||
sequence.setCharAt(idx,Character.toLowerCase(sequence.charAt(idx)));
|
||||
//logger.debug(sequence);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<Integer> getBadIndeces(String sequence) {
|
||||
|
||||
List<Integer> badLeftIndeces = new ArrayList<Integer>(sequence.length()-virtualPrimerSize);
|
||||
for ( int i = 0; i < sequence.length()-virtualPrimerSize ; i++ ) {
|
||||
String toAlign = sequence.substring(i,i+virtualPrimerSize);
|
||||
Iterable<Alignment[]> allAlignments = aligner.getAllAlignments(toAlign.getBytes());
|
||||
for ( Alignment[] alignments : allAlignments ) {
|
||||
if ( alignments.length > 1 ) {
|
||||
if ( alignments[0].getMappingQuality() == 0 ) {
|
||||
// this region is bad -- multiple MQ alignments
|
||||
badLeftIndeces.add(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return badLeftIndeces;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Note- this is an old function - a proxy for identifying regions with low specificity to genome. Saved in case the alignment-based version
|
||||
* turns out to be worse than just doing a simple repeat-lowering method.
|
||||
*/
|
||||
public void lowerRepeats() {
|
||||
// convert to lower case low-complexity repeats, e.g. tandem k-mers
|
||||
final int K_LIM = 8;
|
||||
String seq = sequence.toString();
|
||||
StringBuilder newSequence = new StringBuilder();
|
||||
int start_pos = 0;
|
||||
while( start_pos < seq.length() ) {
|
||||
boolean broke = false;
|
||||
for ( int length = K_LIM; length > 1; length -- ) {
|
||||
//logger.debug(String.format("start1: %d end1: %d start2: %d end2: %d str: %d",start_pos,start_pos+length,start_pos+length,start_pos+2*length,seq.length()));
|
||||
if ( start_pos + 2*length> seq.length() ) {
|
||||
continue;
|
||||
}
|
||||
if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) {
|
||||
newSequence.append(seq.substring(start_pos,start_pos+length).toLowerCase());
|
||||
newSequence.append(seq.substring(start_pos+length,start_pos+2*length).toLowerCase());
|
||||
start_pos += 2*length;
|
||||
broke = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( ! broke ) {
|
||||
newSequence.append(seq.substring(start_pos,start_pos+1));
|
||||
start_pos++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ( seq.indexOf("[") != seq.lastIndexOf("[") ) {
|
||||
return;
|
||||
}
|
||||
|
||||
sequence = newSequence;
|
||||
}
|
||||
|
||||
public boolean equalsIgnoreNs(String one, String two) {
|
||||
if ( one.length() != two.length() ) { return false; }
|
||||
for ( int idx = 0; idx < one.length(); idx++ ) {
|
||||
if ( Character.toUpperCase(one.charAt(idx)) != Character.toUpperCase(two.charAt(idx)) ) {
|
||||
if ( Character.toUpperCase(one.charAt(idx)) != 'N' && Character.toUpperCase(two.charAt(idx)) != 'N' ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//logger.debug(String.format("one: %s two: %s",one,two));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public void print() {
|
||||
String valid;
|
||||
if ( sequenceInvalid ) {
|
||||
valid = "";
|
||||
while ( invReason.size() > 0 ) {
|
||||
String reason = invReason.get(0);
|
||||
invReason.remove(reason);
|
||||
int num = 1;
|
||||
while ( invReason.contains(reason) ) {
|
||||
num++;
|
||||
invReason.remove(reason);
|
||||
}
|
||||
valid += String.format("%s=%d,",reason,num);
|
||||
}
|
||||
} else {
|
||||
valid = "Valid";
|
||||
}
|
||||
|
||||
String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
|
||||
out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import org.testng.annotations.Test;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Integration tests for the aligner.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class AlignerIntegrationTest extends WalkerTest {
|
||||
@Test
|
||||
public void testBasicAlignment() {
|
||||
String md5 = "34eb4323742999d6d250a0aaa803c6d5";
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + GATKDataLocation + "bwa/human_b36_both.fasta" +
|
||||
" -T Align" +
|
||||
" -I " + validationDataLocation + "NA12878_Pilot1_20.trimmed.unmapped.bam" +
|
||||
" -o %s",
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
executeTest("testBasicAlignment", spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.sequenom;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class PickSequenomProbesIntegrationTest extends WalkerTest {
|
||||
@Test
|
||||
public void testProbes() {
|
||||
String testVCF = validationDataLocation + "complexExample.vcf4";
|
||||
String testArgs = "-R " + b36KGReference + " -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B:input,VCF "+testVCF+" -o %s";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||
Arrays.asList("6b5409cc78960f1be855536ed89ea9dd"));
|
||||
executeTest("Test probes", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbesUsingDbSNPMask() {
|
||||
|
||||
String md5 = "46d53491af1d3aa0ee1f1e13d68b732d";
|
||||
String testVCF = validationDataLocation + "pickSeqIntegrationTest.vcf";
|
||||
|
||||
String testArgs = "-snp_mask " + validationDataLocation + "pickSeqIntegrationTest.bed -R "
|
||||
+ b36KGReference + " -omitWindow -nameConvention "
|
||||
+ "-project_id 1kgp3_s4_lf -T PickSequenomProbes -B:input,VCF "+testVCF+" -o %s";
|
||||
WalkerTestSpec spec1 = new WalkerTestSpec(testArgs, 1, Arrays.asList(md5));
|
||||
executeTest("Test probes", spec1);
|
||||
|
||||
testArgs += " -nmw 1";
|
||||
WalkerTestSpec spec2 = new WalkerTestSpec(testArgs, 1, Arrays.asList(md5));
|
||||
executeTest("Test probes", spec2);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.validation;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: Ghost
|
||||
* Date: 7/19/11
|
||||
* Time: 7:39 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ValidationAmpliconsIntegrationTest extends WalkerTest {
|
||||
|
||||
@Test
|
||||
public void testWikiExample() {
|
||||
String siteVCF = validationDataLocation + "sites_to_validate.vcf";
|
||||
String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf";
|
||||
String intervalTable = validationDataLocation + "amplicon_interval_table1.table";
|
||||
String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s";
|
||||
testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF;
|
||||
testArgs += " --virtualPrimerSize 30";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||
Arrays.asList("27f9450afa132888a8994167f0035fd7"));
|
||||
executeTest("Test probes", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWikiExampleNoBWA() {
|
||||
String siteVCF = validationDataLocation + "sites_to_validate.vcf";
|
||||
String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf";
|
||||
String intervalTable = validationDataLocation + "amplicon_interval_table1.table";
|
||||
String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s";
|
||||
testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF;
|
||||
testArgs += " --virtualPrimerSize 30 --doNotUseBWA";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||
Arrays.asList("f2611ff1d9cd5bedaad003251fed8bc1"));
|
||||
executeTest("Test probes", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWikiExampleMonoFilter() {
|
||||
String siteVCF = validationDataLocation + "sites_to_validate.vcf";
|
||||
String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf";
|
||||
String intervalTable = validationDataLocation + "amplicon_interval_table1.table";
|
||||
String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s";
|
||||
testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF;
|
||||
testArgs += " --virtualPrimerSize 30 --filterMonomorphic";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||
Arrays.asList("77b3f30e38fedad812125bdf6cf3255f"));
|
||||
executeTest("Test probes", spec);
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue