Merge branch 'master' of ssh://copper.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
e2d0a2c459
|
|
@ -0,0 +1,70 @@
|
||||||
|
#include "MainTools.h"
|
||||||
|
#include "Basevector.h"
|
||||||
|
#include "lookup/LookAlign.h"
|
||||||
|
#include "lookup/SerialQltout.h"
|
||||||
|
|
||||||
|
unsigned int MatchingEnd(look_align &la, vecbasevector &candidates, vecbasevector &ref) {
|
||||||
|
//la.PrintParseable(cout);
|
||||||
|
|
||||||
|
for (int i = 0; i < candidates.size(); i++) {
|
||||||
|
look_align newla = la;
|
||||||
|
|
||||||
|
if (newla.rc1) { candidates[i].ReverseComplement(); }
|
||||||
|
newla.ResetFromAlign(newla.a, candidates[i], ref[la.target_id]);
|
||||||
|
|
||||||
|
//newla.PrintParseable(cout, &candidates[i], &ref[newla.target_id]);
|
||||||
|
//cout << newla.Errors() << " " << la.Errors() << endl;
|
||||||
|
|
||||||
|
if (newla.Errors() == la.Errors()) {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//FatalErr("Query id " + ToString(la.query_id) + " had no matches.");
|
||||||
|
|
||||||
|
return candidates.size() + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
RunTime();
|
||||||
|
|
||||||
|
BeginCommandArguments;
|
||||||
|
CommandArgument_String(ALIGNS);
|
||||||
|
CommandArgument_String(FASTB_END_1);
|
||||||
|
CommandArgument_String(FASTB_END_2);
|
||||||
|
CommandArgument_String(REFERENCE);
|
||||||
|
|
||||||
|
CommandArgument_String(ALIGNS_END_1_OUT);
|
||||||
|
CommandArgument_String(ALIGNS_END_2_OUT);
|
||||||
|
EndCommandArguments;
|
||||||
|
|
||||||
|
vecbasevector ref(REFERENCE);
|
||||||
|
vecbasevector reads1(FASTB_END_1);
|
||||||
|
vecbasevector reads2(FASTB_END_2);
|
||||||
|
|
||||||
|
ofstream aligns1stream(ALIGNS_END_1_OUT.c_str());
|
||||||
|
ofstream aligns2stream(ALIGNS_END_2_OUT.c_str());
|
||||||
|
|
||||||
|
basevector bv;
|
||||||
|
|
||||||
|
SerialQltout sqltout(ALIGNS);
|
||||||
|
look_align la;
|
||||||
|
while (sqltout.Next(la)) {
|
||||||
|
vecbasevector candidates(2);
|
||||||
|
candidates[0] = reads1[la.query_id];
|
||||||
|
candidates[1] = reads2[la.query_id];
|
||||||
|
|
||||||
|
unsigned int matchingend = MatchingEnd(la, candidates, ref);
|
||||||
|
if (matchingend < 2) {
|
||||||
|
bv = (matchingend == 0) ? reads1[la.query_id] : reads2[la.query_id];
|
||||||
|
|
||||||
|
//la.PrintParseable(cout, &bv, &ref[la.target_id]);
|
||||||
|
la.PrintParseable(((matchingend == 0) ? aligns1stream : aligns2stream), &bv, &ref[la.target_id]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
aligns1stream.close();
|
||||||
|
aligns2stream.close();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
CXX=g++
|
||||||
|
CXXFLAGS=-g -Wall -O2 -m64 -fPIC
|
||||||
|
|
||||||
|
.cpp.o:
|
||||||
|
$(CXX) -c $(CXXFLAGS) -I$(BWA_HOME) -I$(JAVA_INCLUDE) $< -o $@
|
||||||
|
|
||||||
|
all: init lib
|
||||||
|
|
||||||
|
init:
|
||||||
|
@echo Please make sure the following platforms are set correctly on your machine.
|
||||||
|
@echo BWA_HOME=$(BWA_HOME)
|
||||||
|
@echo JAVA_INCLUDE=$(JAVA_INCLUDE)
|
||||||
|
@echo TARGET_LIB=$(TARGET_LIB)
|
||||||
|
@echo EXTRA_LIBS=$(EXTRA_LIBS)
|
||||||
|
@echo LIBTOOL_COMMAND=$(LIBTOOL_COMMAND)
|
||||||
|
|
||||||
|
lib: org_broadinstitute_sting_alignment_bwa_c_BWACAligner.o bwa_gateway.o
|
||||||
|
$(LIBTOOL_COMMAND) $? -o $(TARGET_LIB) -L$(BWA_HOME) -lbwacore $(EXTRA_LIBS)
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm *.o libbwa.*
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
#!/bin/sh
|
||||||
|
export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa"
|
||||||
|
export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux"
|
||||||
|
export TARGET_LIB="libbwa.so"
|
||||||
|
export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread"
|
||||||
|
export LIBTOOL_COMMAND="g++ -shared -Wl,-soname,libbwa.so"
|
||||||
|
make
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
#!/bin/sh
|
||||||
|
export BWA_HOME="/Users/mhanna/src/bwa"
|
||||||
|
export JAVA_INCLUDE="/System/Library/Frameworks/JavaVM.framework/Headers"
|
||||||
|
export TARGET_LIB="libbwa.dylib"
|
||||||
|
export EXTRA_LIBS="-lc -lz -lsupc++"
|
||||||
|
export LIBTOOL_COMMAND="libtool -dynamic"
|
||||||
|
make
|
||||||
|
|
@ -0,0 +1,268 @@
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#include "bwase.h"
|
||||||
|
#include "bwa_gateway.h"
|
||||||
|
|
||||||
|
BWA::BWA(const char* ann_filename,
|
||||||
|
const char* amb_filename,
|
||||||
|
const char* pac_filename,
|
||||||
|
const char* forward_bwt_filename,
|
||||||
|
const char* forward_sa_filename,
|
||||||
|
const char* reverse_bwt_filename,
|
||||||
|
const char* reverse_sa_filename)
|
||||||
|
{
|
||||||
|
// Load the bns (?) and reference
|
||||||
|
bns = bns_restore_core(ann_filename,amb_filename,pac_filename);
|
||||||
|
reference = new ubyte_t[bns->l_pac/4+1];
|
||||||
|
rewind(bns->fp_pac);
|
||||||
|
fread(reference, 1, bns->l_pac/4+1, bns->fp_pac);
|
||||||
|
fclose(bns->fp_pac);
|
||||||
|
bns->fp_pac = NULL;
|
||||||
|
|
||||||
|
// Load the BWTs (both directions) and suffix arrays (both directions)
|
||||||
|
bwts[0] = bwt_restore_bwt(forward_bwt_filename);
|
||||||
|
bwt_restore_sa(forward_sa_filename, bwts[0]);
|
||||||
|
bwts[1] = bwt_restore_bwt(reverse_bwt_filename);
|
||||||
|
bwt_restore_sa(reverse_sa_filename, bwts[1]);
|
||||||
|
load_default_options();
|
||||||
|
|
||||||
|
// initialize the bwase subsystem
|
||||||
|
bwase_initialize();
|
||||||
|
}
|
||||||
|
|
||||||
|
BWA::~BWA() {
|
||||||
|
delete[] reference;
|
||||||
|
bns_destroy(bns);
|
||||||
|
bwt_destroy(bwts[0]);
|
||||||
|
bwt_destroy(bwts[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count)
|
||||||
|
{
|
||||||
|
bwa_seq_t* sequence = create_sequence(bases, read_length);
|
||||||
|
|
||||||
|
// Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln).
|
||||||
|
// This method will destroy the contents of seq and rseq.
|
||||||
|
bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options);
|
||||||
|
|
||||||
|
paths = new bwt_aln1_t[sequence->n_aln];
|
||||||
|
memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t));
|
||||||
|
num_paths = sequence->n_aln;
|
||||||
|
|
||||||
|
// Call aln2seq to initialize the type of match present.
|
||||||
|
bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
|
||||||
|
best_path_count = sequence->c1;
|
||||||
|
second_best_path_count = sequence->c2;
|
||||||
|
|
||||||
|
bwa_free_read_seq(1,sequence);
|
||||||
|
}
|
||||||
|
|
||||||
|
Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) {
|
||||||
|
bwa_seq_t* sequence = create_sequence(bases,read_length);
|
||||||
|
|
||||||
|
// Calculate paths.
|
||||||
|
bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options);
|
||||||
|
|
||||||
|
// Check for no alignments found and return null.
|
||||||
|
if(sequence->n_aln == 0) {
|
||||||
|
bwa_free_read_seq(1,sequence);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// bwa_cal_sa_reg_gap destroys the bases / read length. Copy them back in.
|
||||||
|
copy_bases_into_sequence(sequence,bases,read_length);
|
||||||
|
|
||||||
|
// Pick best alignment and propagate its information into the sequence.
|
||||||
|
bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
|
||||||
|
|
||||||
|
// Generate the best alignment from the sequence.
|
||||||
|
Alignment* alignment = new Alignment;
|
||||||
|
*alignment = generate_final_alignment_from_sequence(sequence);
|
||||||
|
|
||||||
|
bwa_free_read_seq(1,sequence);
|
||||||
|
|
||||||
|
return alignment;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BWA::generate_alignments_from_paths(const char* bases,
|
||||||
|
const unsigned read_length,
|
||||||
|
bwt_aln1_t* paths,
|
||||||
|
const unsigned num_paths,
|
||||||
|
const unsigned best_count,
|
||||||
|
const unsigned second_best_count,
|
||||||
|
Alignment*& alignments,
|
||||||
|
unsigned& num_alignments)
|
||||||
|
{
|
||||||
|
bwa_seq_t* sequence = create_sequence(bases,read_length);
|
||||||
|
|
||||||
|
sequence->aln = paths;
|
||||||
|
sequence->n_aln = num_paths;
|
||||||
|
|
||||||
|
// (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself.
|
||||||
|
bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
|
||||||
|
|
||||||
|
// But overwrite key parts of the sequence in case the user passed back only a smaller subset
|
||||||
|
// of the paths.
|
||||||
|
sequence->c1 = best_count;
|
||||||
|
sequence->c2 = second_best_count;
|
||||||
|
sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
|
||||||
|
|
||||||
|
num_alignments = 0;
|
||||||
|
for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++)
|
||||||
|
num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1;
|
||||||
|
|
||||||
|
alignments = new Alignment[num_alignments];
|
||||||
|
unsigned alignment_idx = 0;
|
||||||
|
|
||||||
|
for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
|
||||||
|
// Stub in a 'working' path, so that only the desired alignment is local-aligned.
|
||||||
|
const bwt_aln1_t* path = paths + path_idx;
|
||||||
|
bwt_aln1_t working_path = *path;
|
||||||
|
|
||||||
|
// Loop through all alignments, aligning each one individually.
|
||||||
|
for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) {
|
||||||
|
working_path.k = working_path.l = sa_idx;
|
||||||
|
sequence->aln = &working_path;
|
||||||
|
sequence->n_aln = 1;
|
||||||
|
|
||||||
|
sequence->sa = sa_idx;
|
||||||
|
sequence->strand = path->a;
|
||||||
|
sequence->score = path->score;
|
||||||
|
|
||||||
|
// Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse.
|
||||||
|
// TODO: Fix the interface to bwa_refine_gapped so its easier to work with.
|
||||||
|
if(alignment_idx > 0)
|
||||||
|
seq_reverse(sequence->len, sequence->seq, 0);
|
||||||
|
|
||||||
|
// Copy the local alignment data into the alignment object.
|
||||||
|
*(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence);
|
||||||
|
|
||||||
|
alignment_idx++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sequence->aln = NULL;
|
||||||
|
sequence->n_aln = 0;
|
||||||
|
|
||||||
|
bwa_free_read_seq(1,sequence);
|
||||||
|
}
|
||||||
|
|
||||||
|
Alignment BWA::generate_final_alignment_from_sequence(bwa_seq_t* sequence) {
|
||||||
|
// Calculate the local coordinate and local alignment.
|
||||||
|
bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr);
|
||||||
|
bwa_refine_gapped(bns, 1, sequence, reference, NULL);
|
||||||
|
|
||||||
|
// Copy the local alignment data into the alignment object.
|
||||||
|
Alignment alignment;
|
||||||
|
|
||||||
|
// Populate basic path info
|
||||||
|
alignment.edit_distance = sequence->nm;
|
||||||
|
alignment.num_mismatches = sequence->n_mm;
|
||||||
|
alignment.num_gap_opens = sequence->n_gapo;
|
||||||
|
alignment.num_gap_extensions = sequence->n_gape;
|
||||||
|
alignment.num_best = sequence->c1;
|
||||||
|
alignment.num_second_best = sequence->c2;
|
||||||
|
|
||||||
|
// Final alignment position.
|
||||||
|
alignment.type = sequence->type;
|
||||||
|
bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig);
|
||||||
|
alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1;
|
||||||
|
alignment.negative_strand = sequence->strand;
|
||||||
|
alignment.mapping_quality = sequence->mapQ;
|
||||||
|
|
||||||
|
// Cigar step.
|
||||||
|
alignment.cigar = NULL;
|
||||||
|
if(sequence->cigar) {
|
||||||
|
alignment.cigar = new uint16_t[sequence->n_cigar];
|
||||||
|
memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t));
|
||||||
|
}
|
||||||
|
alignment.n_cigar = sequence->n_cigar;
|
||||||
|
|
||||||
|
// MD tag with a better breakdown of differences in the cigar
|
||||||
|
alignment.md = strdup(sequence->md);
|
||||||
|
delete[] sequence->md;
|
||||||
|
sequence->md = NULL;
|
||||||
|
|
||||||
|
return alignment;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BWA::load_default_options()
|
||||||
|
{
|
||||||
|
options.s_mm = 3;
|
||||||
|
options.s_gapo = 11;
|
||||||
|
options.s_gape = 4;
|
||||||
|
options.mode = 3;
|
||||||
|
options.indel_end_skip = 5;
|
||||||
|
options.max_del_occ = 10;
|
||||||
|
options.max_entries = 2000000;
|
||||||
|
options.fnr = 0.04;
|
||||||
|
options.max_diff = -1;
|
||||||
|
options.max_gapo = 1;
|
||||||
|
options.max_gape = 6;
|
||||||
|
options.max_seed_diff = 2;
|
||||||
|
options.seed_len = 2147483647;
|
||||||
|
options.n_threads = 1;
|
||||||
|
options.max_top2 = 30;
|
||||||
|
options.trim_qual = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BWA::set_max_edit_distance(float edit_distance) {
|
||||||
|
if(edit_distance > 0 && edit_distance < 1) {
|
||||||
|
options.fnr = edit_distance;
|
||||||
|
options.max_diff = -1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
options.fnr = -1.0;
|
||||||
|
options.max_diff = (int)edit_distance;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void BWA::set_max_gap_opens(int max_gap_opens) { options.max_gapo = max_gap_opens; }
|
||||||
|
void BWA::set_max_gap_extensions(int max_gap_extensions) { options.max_gape = max_gap_extensions; }
|
||||||
|
void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_skip = indel_range; }
|
||||||
|
void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; }
|
||||||
|
void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; }
|
||||||
|
void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a sequence with a set of reasonable initial defaults.
|
||||||
|
* Will leave seq and rseq empty.
|
||||||
|
*/
|
||||||
|
bwa_seq_t* BWA::create_sequence(const char* bases, const unsigned read_length)
|
||||||
|
{
|
||||||
|
bwa_seq_t* sequence = new bwa_seq_t;
|
||||||
|
|
||||||
|
sequence->tid = -1;
|
||||||
|
|
||||||
|
sequence->name = 0;
|
||||||
|
|
||||||
|
copy_bases_into_sequence(sequence, bases, read_length);
|
||||||
|
|
||||||
|
sequence->qual = 0;
|
||||||
|
sequence->aln = 0;
|
||||||
|
sequence->md = 0;
|
||||||
|
|
||||||
|
sequence->cigar = NULL;
|
||||||
|
sequence->n_cigar = 0;
|
||||||
|
|
||||||
|
sequence->multi = NULL;
|
||||||
|
sequence->n_multi = 0;
|
||||||
|
|
||||||
|
return sequence;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BWA::copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length)
|
||||||
|
{
|
||||||
|
// seq, rseq will ultimately be freed by bwa_cal_sa_reg_gap
|
||||||
|
sequence->seq = new ubyte_t[read_length];
|
||||||
|
sequence->rseq = new ubyte_t[read_length];
|
||||||
|
for(unsigned i = 0; i < read_length; i++) sequence->seq[i] = nst_nt4_table[(unsigned)bases[i]];
|
||||||
|
memcpy(sequence->rseq,sequence->seq,read_length);
|
||||||
|
|
||||||
|
// BWA expects the read bases to arrive reversed.
|
||||||
|
seq_reverse(read_length,sequence->seq,0);
|
||||||
|
seq_reverse(read_length,sequence->rseq,1);
|
||||||
|
|
||||||
|
sequence->full_len = sequence->len = read_length;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,82 @@
|
||||||
|
#ifndef BWA_GATEWAY
|
||||||
|
#define BWA_GATEWAY
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
|
||||||
|
#include "bntseq.h"
|
||||||
|
#include "bwt.h"
|
||||||
|
#include "bwtaln.h"
|
||||||
|
|
||||||
|
class Alignment {
|
||||||
|
public:
|
||||||
|
uint32_t type;
|
||||||
|
int contig;
|
||||||
|
bwtint_t pos;
|
||||||
|
bool negative_strand;
|
||||||
|
uint32_t mapping_quality;
|
||||||
|
|
||||||
|
uint16_t *cigar;
|
||||||
|
int n_cigar;
|
||||||
|
|
||||||
|
uint8_t num_mismatches;
|
||||||
|
uint8_t num_gap_opens;
|
||||||
|
uint8_t num_gap_extensions;
|
||||||
|
uint16_t edit_distance;
|
||||||
|
|
||||||
|
uint32_t num_best;
|
||||||
|
uint32_t num_second_best;
|
||||||
|
|
||||||
|
char* md;
|
||||||
|
};
|
||||||
|
|
||||||
|
class BWA {
|
||||||
|
private:
|
||||||
|
bntseq_t *bns;
|
||||||
|
ubyte_t* reference;
|
||||||
|
bwt_t* bwts[2];
|
||||||
|
gap_opt_t options;
|
||||||
|
|
||||||
|
void load_default_options();
|
||||||
|
bwa_seq_t* create_sequence(const char* bases, const unsigned read_length);
|
||||||
|
void copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length);
|
||||||
|
Alignment generate_final_alignment_from_sequence(bwa_seq_t* sequence);
|
||||||
|
|
||||||
|
public:
|
||||||
|
BWA(const char* ann_filename,
|
||||||
|
const char* amb_filename,
|
||||||
|
const char* pac_filename,
|
||||||
|
const char* forward_bwt_filename,
|
||||||
|
const char* forward_sa_filename,
|
||||||
|
const char* reverse_bwt_filename,
|
||||||
|
const char* reverse_sa_filename);
|
||||||
|
~BWA();
|
||||||
|
|
||||||
|
// Parameterize the aligner.
|
||||||
|
void set_max_edit_distance(float edit_distance);
|
||||||
|
void set_max_gap_opens(int max_gap_opens);
|
||||||
|
void set_max_gap_extensions(int max_gap_extensions);
|
||||||
|
void set_disallow_indel_within_range(int indel_range);
|
||||||
|
void set_mismatch_penalty(int penalty);
|
||||||
|
void set_gap_open_penalty(int penalty);
|
||||||
|
void set_gap_extension_penalty(int penalty);
|
||||||
|
|
||||||
|
// Perform the alignment
|
||||||
|
Alignment* generate_single_alignment(const char* bases,
|
||||||
|
const unsigned read_length);
|
||||||
|
void find_paths(const char* bases,
|
||||||
|
const unsigned read_length,
|
||||||
|
bwt_aln1_t*& paths,
|
||||||
|
unsigned& num_paths,
|
||||||
|
unsigned& best_path_count,
|
||||||
|
unsigned& second_best_path_count);
|
||||||
|
void generate_alignments_from_paths(const char* bases,
|
||||||
|
const unsigned read_length,
|
||||||
|
bwt_aln1_t* paths,
|
||||||
|
const unsigned num_paths,
|
||||||
|
const unsigned best_count,
|
||||||
|
const unsigned second_best_count,
|
||||||
|
Alignment*& alignments,
|
||||||
|
unsigned& num_alignments);
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BWA_GATEWAY
|
||||||
Binary file not shown.
|
|
@ -0,0 +1,437 @@
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#include "bntseq.h"
|
||||||
|
#include "bwt.h"
|
||||||
|
#include "bwtaln.h"
|
||||||
|
#include "bwa_gateway.h"
|
||||||
|
#include "org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h"
|
||||||
|
|
||||||
|
typedef void (BWA::*int_setter)(int value);
|
||||||
|
typedef void (BWA::*float_setter)(float value);
|
||||||
|
|
||||||
|
static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment);
|
||||||
|
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name);
|
||||||
|
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter);
|
||||||
|
static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter);
|
||||||
|
static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message);
|
||||||
|
|
||||||
|
JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create(JNIEnv* env, jobject instance, jobject bwtFiles, jobject configuration)
|
||||||
|
{
|
||||||
|
jstring java_ann = get_configuration_file(env,bwtFiles,"annFile");
|
||||||
|
if(java_ann == NULL) return 0L;
|
||||||
|
jstring java_amb = get_configuration_file(env,bwtFiles,"ambFile");
|
||||||
|
if(java_amb == NULL) return 0L;
|
||||||
|
jstring java_pac = get_configuration_file(env,bwtFiles,"pacFile");
|
||||||
|
if(java_pac == NULL) return 0L;
|
||||||
|
jstring java_forward_bwt = get_configuration_file(env,bwtFiles,"forwardBWTFile");
|
||||||
|
if(java_forward_bwt == NULL) return 0L;
|
||||||
|
jstring java_forward_sa = get_configuration_file(env,bwtFiles,"forwardSAFile");
|
||||||
|
if(java_forward_sa == NULL) return 0L;
|
||||||
|
jstring java_reverse_bwt = get_configuration_file(env,bwtFiles,"reverseBWTFile");
|
||||||
|
if(java_reverse_bwt == NULL) return 0L;
|
||||||
|
jstring java_reverse_sa = get_configuration_file(env,bwtFiles,"reverseSAFile");
|
||||||
|
if(java_reverse_sa == NULL) return 0L;
|
||||||
|
|
||||||
|
const char* ann_filename = env->GetStringUTFChars(java_ann,JNI_FALSE);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
const char* amb_filename = env->GetStringUTFChars(java_amb,JNI_FALSE);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
const char* pac_filename = env->GetStringUTFChars(java_pac,JNI_FALSE);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
const char* forward_bwt_filename = env->GetStringUTFChars(java_forward_bwt,JNI_FALSE);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
const char* forward_sa_filename = env->GetStringUTFChars(java_forward_sa,JNI_FALSE);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
const char* reverse_bwt_filename = env->GetStringUTFChars(java_reverse_bwt,JNI_FALSE);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
const char* reverse_sa_filename = env->GetStringUTFChars(java_reverse_sa,JNI_FALSE);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
|
||||||
|
BWA* bwa = new BWA(ann_filename,
|
||||||
|
amb_filename,
|
||||||
|
pac_filename,
|
||||||
|
forward_bwt_filename,
|
||||||
|
forward_sa_filename,
|
||||||
|
reverse_bwt_filename,
|
||||||
|
reverse_sa_filename);
|
||||||
|
|
||||||
|
Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(env,instance,(jlong)bwa,configuration);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
|
||||||
|
env->ReleaseStringUTFChars(java_ann,ann_filename);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
env->ReleaseStringUTFChars(java_amb,amb_filename);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
env->ReleaseStringUTFChars(java_pac,pac_filename);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
env->ReleaseStringUTFChars(java_forward_bwt,forward_bwt_filename);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
env->ReleaseStringUTFChars(java_forward_sa,forward_sa_filename);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
env->ReleaseStringUTFChars(java_reverse_bwt,reverse_bwt_filename);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
env->ReleaseStringUTFChars(java_reverse_sa,reverse_sa_filename);
|
||||||
|
if(env->ExceptionCheck()) return 0L;
|
||||||
|
|
||||||
|
return (jlong)bwa;
|
||||||
|
}
|
||||||
|
|
||||||
|
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy(JNIEnv* env, jobject instance, jlong java_bwa)
|
||||||
|
{
|
||||||
|
BWA* bwa = (BWA*)java_bwa;
|
||||||
|
delete bwa;
|
||||||
|
}
|
||||||
|
|
||||||
|
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(JNIEnv *env, jobject instance, jlong java_bwa, jobject configuration) {
|
||||||
|
BWA* bwa = (BWA*)java_bwa;
|
||||||
|
set_float_configuration_param(env, configuration, "maximumEditDistance", bwa, &BWA::set_max_edit_distance);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
set_int_configuration_param(env, configuration, "maximumGapOpens", bwa, &BWA::set_max_gap_opens);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
set_int_configuration_param(env, configuration, "maximumGapExtensions", bwa, &BWA::set_max_gap_extensions);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
set_int_configuration_param(env, configuration, "disallowIndelWithinRange", bwa, &BWA::set_disallow_indel_within_range);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
set_int_configuration_param(env, configuration, "mismatchPenalty", bwa, &BWA::set_mismatch_penalty);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
set_int_configuration_param(env, configuration, "gapOpenPenalty", bwa, &BWA::set_gap_open_penalty);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
set_int_configuration_param(env, configuration, "gapExtensionPenalty", bwa, &BWA::set_gap_extension_penalty);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
}
|
||||||
|
|
||||||
|
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases)
|
||||||
|
{
|
||||||
|
BWA* bwa = (BWA*)java_bwa;
|
||||||
|
|
||||||
|
const jsize read_length = env->GetArrayLength(java_bases);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
|
||||||
|
if(read_bases == NULL) return NULL;
|
||||||
|
|
||||||
|
bwt_aln1_t* paths = NULL;
|
||||||
|
unsigned num_paths = 0;
|
||||||
|
|
||||||
|
unsigned best_path_count, second_best_path_count;
|
||||||
|
bwa->find_paths((const char*)read_bases,read_length,paths,num_paths,best_path_count,second_best_path_count);
|
||||||
|
|
||||||
|
jobjectArray java_paths = env->NewObjectArray(num_paths, env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"), NULL);
|
||||||
|
if(java_paths == NULL) return NULL;
|
||||||
|
|
||||||
|
for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
|
||||||
|
bwt_aln1_t& path = *(paths + path_idx);
|
||||||
|
|
||||||
|
jclass java_path_class = env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath");
|
||||||
|
if(java_path_class == NULL) return NULL;
|
||||||
|
|
||||||
|
jmethodID java_path_constructor = env->GetMethodID(java_path_class, "<init>", "(IIIZJJIII)V");
|
||||||
|
if(java_path_constructor == NULL) return NULL;
|
||||||
|
|
||||||
|
// Note that k/l are being cast to long. Bad things will happen if JNI assumes that they're ints.
|
||||||
|
jobject java_path = env->NewObject(java_path_class,
|
||||||
|
java_path_constructor,
|
||||||
|
path.n_mm,
|
||||||
|
path.n_gapo,
|
||||||
|
path.n_gape,
|
||||||
|
path.a,
|
||||||
|
(jlong)path.k,
|
||||||
|
(jlong)path.l,
|
||||||
|
path.score,
|
||||||
|
best_path_count,
|
||||||
|
second_best_path_count);
|
||||||
|
if(java_path == NULL) return NULL;
|
||||||
|
|
||||||
|
env->SetObjectArrayElement(java_paths,path_idx,java_path);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
env->DeleteLocalRef(java_path_class);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
delete[] paths;
|
||||||
|
|
||||||
|
env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
|
||||||
|
|
||||||
|
return env->ExceptionCheck() ? NULL : java_paths;
|
||||||
|
}
|
||||||
|
|
||||||
|
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases, jobjectArray java_paths)
|
||||||
|
{
|
||||||
|
BWA* bwa = (BWA*)java_bwa;
|
||||||
|
|
||||||
|
const jsize read_length = env->GetArrayLength(java_bases);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
|
||||||
|
if(read_bases == NULL) return NULL;
|
||||||
|
|
||||||
|
const jsize num_paths = env->GetArrayLength(java_paths);
|
||||||
|
bwt_aln1_t* paths = new bwt_aln1_t[num_paths];
|
||||||
|
unsigned best_count = 0, second_best_count = 0;
|
||||||
|
|
||||||
|
for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
|
||||||
|
jobject java_path = env->GetObjectArrayElement(java_paths,path_idx);
|
||||||
|
jclass java_path_class = env->GetObjectClass(java_path);
|
||||||
|
if(java_path_class == NULL) return NULL;
|
||||||
|
|
||||||
|
bwt_aln1_t& path = *(paths + path_idx);
|
||||||
|
|
||||||
|
jfieldID mismatches_field = env->GetFieldID(java_path_class, "numMismatches", "I");
|
||||||
|
if(mismatches_field == NULL) return NULL;
|
||||||
|
path.n_mm = env->GetIntField(java_path,mismatches_field);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jfieldID gap_opens_field = env->GetFieldID(java_path_class, "numGapOpens", "I");
|
||||||
|
if(gap_opens_field == NULL) return NULL;
|
||||||
|
path.n_gapo = env->GetIntField(java_path,gap_opens_field);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jfieldID gap_extensions_field = env->GetFieldID(java_path_class, "numGapExtensions", "I");
|
||||||
|
if(gap_extensions_field == NULL) return NULL;
|
||||||
|
path.n_gape = env->GetIntField(java_path,gap_extensions_field);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jfieldID negative_strand_field = env->GetFieldID(java_path_class, "negativeStrand", "Z");
|
||||||
|
if(negative_strand_field == NULL) return NULL;
|
||||||
|
path.a = env->GetBooleanField(java_path,negative_strand_field);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jfieldID k_field = env->GetFieldID(java_path_class, "k", "J");
|
||||||
|
if(k_field == NULL) return NULL;
|
||||||
|
path.k = env->GetLongField(java_path,k_field);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jfieldID l_field = env->GetFieldID(java_path_class, "l", "J");
|
||||||
|
if(l_field == NULL) return NULL;
|
||||||
|
path.l = env->GetLongField(java_path,l_field);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jfieldID score_field = env->GetFieldID(java_path_class, "score", "I");
|
||||||
|
if(score_field == NULL) return NULL;
|
||||||
|
path.score = env->GetIntField(java_path,score_field);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jfieldID best_count_field = env->GetFieldID(java_path_class, "bestCount", "I");
|
||||||
|
if(best_count_field == NULL) return NULL;
|
||||||
|
best_count = env->GetIntField(java_path,best_count_field);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jfieldID second_best_count_field = env->GetFieldID(java_path_class, "secondBestCount", "I");
|
||||||
|
if(second_best_count_field == NULL) return NULL;
|
||||||
|
second_best_count = env->GetIntField(java_path,second_best_count_field);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
Alignment* alignments = NULL;
|
||||||
|
unsigned num_alignments = 0;
|
||||||
|
bwa->generate_alignments_from_paths((const char*)read_bases,read_length,paths,num_paths,best_count,second_best_count,alignments,num_alignments);
|
||||||
|
|
||||||
|
jobjectArray java_alignments = env->NewObjectArray(num_alignments, env->FindClass("org/broadinstitute/sting/alignment/Alignment"), NULL);
|
||||||
|
if(java_alignments == NULL) return NULL;
|
||||||
|
|
||||||
|
for(unsigned alignment_idx = 0; alignment_idx < (unsigned)num_alignments; alignment_idx++) {
|
||||||
|
Alignment& alignment = *(alignments + alignment_idx);
|
||||||
|
jobject java_alignment = convert_to_java_alignment(env,read_bases,read_length,alignment);
|
||||||
|
if(java_alignment == NULL) return NULL;
|
||||||
|
env->SetObjectArrayElement(java_alignments,alignment_idx,java_alignment);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
delete[] alignments;
|
||||||
|
delete[] paths;
|
||||||
|
|
||||||
|
env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
|
||||||
|
|
||||||
|
return env->ExceptionCheck() ? NULL : java_alignments;
|
||||||
|
}
|
||||||
|
|
||||||
|
JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases) {
|
||||||
|
BWA* bwa = (BWA*)java_bwa;
|
||||||
|
|
||||||
|
const jsize read_length = env->GetArrayLength(java_bases);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
|
||||||
|
if(read_bases == NULL) return NULL;
|
||||||
|
|
||||||
|
Alignment* best_alignment = bwa->generate_single_alignment((const char*)read_bases,read_length);
|
||||||
|
jobject java_best_alignment = (best_alignment != NULL) ? convert_to_java_alignment(env,read_bases,read_length,*best_alignment) : NULL;
|
||||||
|
delete best_alignment;
|
||||||
|
|
||||||
|
env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
|
||||||
|
|
||||||
|
return java_best_alignment;
|
||||||
|
}
|
||||||
|
|
||||||
|
static jobject convert_to_java_alignment(JNIEnv *env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment) {
|
||||||
|
unsigned cigar_length;
|
||||||
|
if(alignment.type == BWA_TYPE_NO_MATCH) cigar_length = 0;
|
||||||
|
else if(!alignment.cigar) cigar_length = 1;
|
||||||
|
else cigar_length = alignment.n_cigar;
|
||||||
|
|
||||||
|
jcharArray java_cigar_operators = env->NewCharArray(cigar_length);
|
||||||
|
if(java_cigar_operators == NULL) return NULL;
|
||||||
|
jintArray java_cigar_lengths = env->NewIntArray(cigar_length);
|
||||||
|
if(java_cigar_lengths == NULL) return NULL;
|
||||||
|
|
||||||
|
if(alignment.cigar) {
|
||||||
|
for(unsigned cigar_idx = 0; cigar_idx < (unsigned)alignment.n_cigar; ++cigar_idx) {
|
||||||
|
jchar cigar_operator = "MIDS"[alignment.cigar[cigar_idx]>>14];
|
||||||
|
jint cigar_length = alignment.cigar[cigar_idx]&0x3fff;
|
||||||
|
|
||||||
|
env->SetCharArrayRegion(java_cigar_operators,cigar_idx,1,&cigar_operator);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
env->SetIntArrayRegion(java_cigar_lengths,cigar_idx,1,&cigar_length);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if(alignment.type != BWA_TYPE_NO_MATCH) {
|
||||||
|
jchar cigar_operator = 'M';
|
||||||
|
env->SetCharArrayRegion(java_cigar_operators,0,1,&cigar_operator);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
env->SetIntArrayRegion(java_cigar_lengths,0,1,&read_length);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delete[] alignment.cigar;
|
||||||
|
|
||||||
|
jclass java_alignment_class = env->FindClass("org/broadinstitute/sting/alignment/Alignment");
|
||||||
|
if(java_alignment_class == NULL) return NULL;
|
||||||
|
|
||||||
|
jmethodID java_alignment_constructor = env->GetMethodID(java_alignment_class, "<init>", "(IIZI[C[IILjava/lang/String;IIIII)V");
|
||||||
|
if(java_alignment_constructor == NULL) return NULL;
|
||||||
|
|
||||||
|
jstring java_md = env->NewStringUTF(alignment.md);
|
||||||
|
if(java_md == NULL) return NULL;
|
||||||
|
delete[] alignment.md;
|
||||||
|
|
||||||
|
jobject java_alignment = env->NewObject(java_alignment_class,
|
||||||
|
java_alignment_constructor,
|
||||||
|
alignment.contig,
|
||||||
|
alignment.pos,
|
||||||
|
alignment.negative_strand,
|
||||||
|
alignment.mapping_quality,
|
||||||
|
java_cigar_operators,
|
||||||
|
java_cigar_lengths,
|
||||||
|
alignment.edit_distance,
|
||||||
|
java_md,
|
||||||
|
alignment.num_mismatches,
|
||||||
|
alignment.num_gap_opens,
|
||||||
|
alignment.num_gap_extensions,
|
||||||
|
alignment.num_best,
|
||||||
|
alignment.num_second_best);
|
||||||
|
if(java_alignment == NULL) return NULL;
|
||||||
|
|
||||||
|
env->DeleteLocalRef(java_alignment_class);
|
||||||
|
if(env->ExceptionCheck()) return NULL;
|
||||||
|
|
||||||
|
return java_alignment;
|
||||||
|
}
|
||||||
|
|
||||||
|
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name) {
|
||||||
|
jclass configuration_class = env->GetObjectClass(configuration);
|
||||||
|
if(configuration_class == NULL) return NULL;
|
||||||
|
|
||||||
|
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/io/File;");
|
||||||
|
if(configuration_field == NULL) return NULL;
|
||||||
|
|
||||||
|
jobject configuration_file = (jobject)env->GetObjectField(configuration,configuration_field);
|
||||||
|
|
||||||
|
jclass file_class = env->FindClass("java/io/File");
|
||||||
|
if(file_class == NULL) return NULL;
|
||||||
|
|
||||||
|
jmethodID path_extractor = env->GetMethodID(file_class,"getAbsolutePath", "()Ljava/lang/String;");
|
||||||
|
if(path_extractor == NULL) return NULL;
|
||||||
|
|
||||||
|
jstring path = (jstring)env->CallObjectMethod(configuration_file,path_extractor);
|
||||||
|
if(path == NULL) return NULL;
|
||||||
|
|
||||||
|
env->DeleteLocalRef(configuration_class);
|
||||||
|
env->DeleteLocalRef(file_class);
|
||||||
|
env->DeleteLocalRef(configuration_file);
|
||||||
|
|
||||||
|
return path;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) {
|
||||||
|
jclass configuration_class = env->GetObjectClass(configuration);
|
||||||
|
if(configuration_class == NULL) return;
|
||||||
|
|
||||||
|
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Integer;");
|
||||||
|
if(configuration_field == NULL) return;
|
||||||
|
|
||||||
|
jobject boxed_value = env->GetObjectField(configuration,configuration_field);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
|
||||||
|
if(boxed_value != NULL) {
|
||||||
|
jclass int_box_class = env->FindClass("java/lang/Integer");
|
||||||
|
if(int_box_class == NULL) return;
|
||||||
|
|
||||||
|
jmethodID int_extractor = env->GetMethodID(int_box_class,"intValue", "()I");
|
||||||
|
if(int_extractor == NULL) return;
|
||||||
|
|
||||||
|
jint value = env->CallIntMethod(boxed_value,int_extractor);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
|
||||||
|
if(value < 0)
|
||||||
|
{
|
||||||
|
throw_config_value_exception(env,field_name,"cannot be set to a negative value");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
(bwa->*setter)(value);
|
||||||
|
|
||||||
|
env->DeleteLocalRef(int_box_class);
|
||||||
|
}
|
||||||
|
|
||||||
|
env->DeleteLocalRef(boxed_value);
|
||||||
|
env->DeleteLocalRef(configuration_class);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter)
|
||||||
|
{
|
||||||
|
jclass configuration_class = env->GetObjectClass(configuration);
|
||||||
|
if(configuration_class == NULL) return;
|
||||||
|
|
||||||
|
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Float;");
|
||||||
|
if(configuration_field == NULL) return;
|
||||||
|
|
||||||
|
jobject boxed_value = env->GetObjectField(configuration,configuration_field);
|
||||||
|
if(boxed_value != NULL) {
|
||||||
|
jclass float_box_class = env->FindClass("java/lang/Float");
|
||||||
|
if(float_box_class == NULL) return;
|
||||||
|
|
||||||
|
jmethodID float_extractor = env->GetMethodID(float_box_class,"floatValue", "()F");
|
||||||
|
if(float_extractor == NULL) return;
|
||||||
|
|
||||||
|
jfloat value = env->CallFloatMethod(boxed_value,float_extractor);
|
||||||
|
if(env->ExceptionCheck()) return;
|
||||||
|
|
||||||
|
if(value < 0)
|
||||||
|
{
|
||||||
|
throw_config_value_exception(env,field_name,"cannot be set to a negative value");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
(bwa->*setter)(value);
|
||||||
|
|
||||||
|
env->DeleteLocalRef(float_box_class);
|
||||||
|
}
|
||||||
|
|
||||||
|
env->DeleteLocalRef(boxed_value);
|
||||||
|
env->DeleteLocalRef(configuration_class);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message)
|
||||||
|
{
|
||||||
|
char* buffer = new char[strlen(field_name)+1+strlen(message)+1];
|
||||||
|
sprintf(buffer,"%s %s",field_name,message);
|
||||||
|
jclass sting_exception_class = env->FindClass("org/broadinstitute/sting/utils/StingException");
|
||||||
|
if(sting_exception_class == NULL) return;
|
||||||
|
env->ThrowNew(sting_exception_class, buffer);
|
||||||
|
delete[] buffer;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,61 @@
|
||||||
|
/* DO NOT EDIT THIS FILE - it is machine generated */
|
||||||
|
#include <jni.h>
|
||||||
|
/* Header for class org_broadinstitute_sting_alignment_bwa_c_BWACAligner */
|
||||||
|
|
||||||
|
#ifndef _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||||
|
#define _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
/*
|
||||||
|
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||||
|
* Method: create
|
||||||
|
* Signature: (Lorg/broadinstitute/sting/alignment/bwa/BWTFiles;Lorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)J
|
||||||
|
*/
|
||||||
|
JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create
|
||||||
|
(JNIEnv *, jobject, jobject, jobject);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||||
|
* Method: updateConfiguration
|
||||||
|
* Signature: (JLorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)V
|
||||||
|
*/
|
||||||
|
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration
|
||||||
|
(JNIEnv *, jobject, jlong, jobject);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||||
|
* Method: destroy
|
||||||
|
* Signature: (J)V
|
||||||
|
*/
|
||||||
|
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy
|
||||||
|
(JNIEnv *, jobject, jlong);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||||
|
* Method: getPaths
|
||||||
|
* Signature: (J[B)[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;
|
||||||
|
*/
|
||||||
|
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths
|
||||||
|
(JNIEnv *, jobject, jlong, jbyteArray);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||||
|
* Method: convertPathsToAlignments
|
||||||
|
* Signature: (J[B[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;)[Lorg/broadinstitute/sting/alignment/Alignment;
|
||||||
|
*/
|
||||||
|
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments
|
||||||
|
(JNIEnv *, jobject, jlong, jbyteArray, jobjectArray);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
|
||||||
|
* Method: getBestAlignment
|
||||||
|
* Signature: (J[B)Lorg/broadinstitute/sting/alignment/Alignment;
|
||||||
|
*/
|
||||||
|
JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment
|
||||||
|
(JNIEnv *, jobject, jlong, jbyteArray);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
CC=gcc
|
||||||
|
CCFLAGS=-Wall -dynamiclib -arch i386 -arch x86_64
|
||||||
|
|
||||||
|
libenvironhack.dylib: libenvironhack.c
|
||||||
|
$(CC) $(CCFLAGS) -init _init_environ $< -o $@
|
||||||
|
|
||||||
|
all: libenvironhack.dylib
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f libenvironhack.dylib
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
LSF 7.0.6 on the mac is missing the unsatisfied exported symbol for environ which was removed on MacOS X 10.5+.
|
||||||
|
nm $LSF_LIBDIR/liblsf.dylib | grep environ
|
||||||
|
See "man environ" for more info, along with http://lists.apple.com/archives/java-dev/2007/Dec/msg00096.html
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <crt_externs.h>
|
||||||
|
|
||||||
|
char **environ = (char **)0;
|
||||||
|
|
||||||
|
void init_environ(void) {
|
||||||
|
environ = (*_NSGetEnviron());
|
||||||
|
}
|
||||||
Binary file not shown.
|
|
@ -0,0 +1,49 @@
|
||||||
|
package org.broadinstitute.sting.alignment;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create perfect alignments from the read to the genome represented by the given BWT / suffix array.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public interface Aligner {
|
||||||
|
/**
|
||||||
|
* Close this instance of the BWA pointer and delete its resources.
|
||||||
|
*/
|
||||||
|
public void close();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allow the aligner to choose one alignment randomly from the pile of best alignments.
|
||||||
|
* @param bases Bases to align.
|
||||||
|
* @return An align
|
||||||
|
*/
|
||||||
|
public Alignment getBestAlignment(final byte[] bases);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Align the read to the reference.
|
||||||
|
* @param read Read to align.
|
||||||
|
* @param header Optional header to drop in place.
|
||||||
|
* @return A list of the alignments.
|
||||||
|
*/
|
||||||
|
public SAMRecord align(final SAMRecord read, final SAMFileHeader header);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a iterator of alignments, batched by mapping quality.
|
||||||
|
* @param bases List of bases.
|
||||||
|
* @return Iterator to alignments.
|
||||||
|
*/
|
||||||
|
public Iterable<Alignment[]> getAllAlignments(final byte[] bases);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a iterator of aligned reads, batched by mapping quality.
|
||||||
|
* @param read Read to align.
|
||||||
|
* @param newHeader Optional new header to use when aligning the read. If present, it must be null.
|
||||||
|
* @return Iterator to alignments.
|
||||||
|
*/
|
||||||
|
public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,221 @@
|
||||||
|
package org.broadinstitute.sting.alignment;
|
||||||
|
|
||||||
|
import net.sf.samtools.*;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents an alignment of a read to a site in the reference genome.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class Alignment {
|
||||||
|
protected int contigIndex;
|
||||||
|
protected long alignmentStart;
|
||||||
|
protected boolean negativeStrand;
|
||||||
|
protected int mappingQuality;
|
||||||
|
|
||||||
|
protected char[] cigarOperators;
|
||||||
|
protected int[] cigarLengths;
|
||||||
|
|
||||||
|
protected int editDistance;
|
||||||
|
protected String mismatchingPositions;
|
||||||
|
|
||||||
|
protected int numMismatches;
|
||||||
|
protected int numGapOpens;
|
||||||
|
protected int numGapExtensions;
|
||||||
|
protected int bestCount;
|
||||||
|
protected int secondBestCount;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the index of the given contig.
|
||||||
|
* @return the inde
|
||||||
|
*/
|
||||||
|
public int getContigIndex() { return contigIndex; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the starting position for the given alignment.
|
||||||
|
* @return Starting position.
|
||||||
|
*/
|
||||||
|
public long getAlignmentStart() { return alignmentStart; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Is the given alignment on the reverse strand?
|
||||||
|
* @return True if the alignment is on the reverse strand.
|
||||||
|
*/
|
||||||
|
public boolean isNegativeStrand() { return negativeStrand; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the score of this alignment.
|
||||||
|
* @return The score.
|
||||||
|
*/
|
||||||
|
public int getMappingQuality() { return mappingQuality; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the edit distance; will eventually end up in the NM SAM tag
|
||||||
|
* if this alignment makes it that far.
|
||||||
|
* @return The edit distance.
|
||||||
|
*/
|
||||||
|
public int getEditDistance() { return editDistance; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A string representation of which positions mismatch; contents of MD tag.
|
||||||
|
* @return String representation of mismatching positions.
|
||||||
|
*/
|
||||||
|
public String getMismatchingPositions() { return mismatchingPositions; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the number of mismatches in the read.
|
||||||
|
* @return Number of mismatches.
|
||||||
|
*/
|
||||||
|
public int getNumMismatches() { return numMismatches; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of gap opens.
|
||||||
|
* @return Number of gap opens.
|
||||||
|
*/
|
||||||
|
public int getNumGapOpens() { return numGapOpens; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of gap extensions.
|
||||||
|
* @return Number of gap extensions.
|
||||||
|
*/
|
||||||
|
public int getNumGapExtensions() { return numGapExtensions; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of best alignments.
|
||||||
|
* @return Number of top scoring alignments.
|
||||||
|
*/
|
||||||
|
public int getBestCount() { return bestCount; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of second best alignments.
|
||||||
|
* @return Number of second best scoring alignments.
|
||||||
|
*/
|
||||||
|
public int getSecondBestCount() { return secondBestCount; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the cigar for this alignment.
|
||||||
|
* @return sam-jdk formatted alignment.
|
||||||
|
*/
|
||||||
|
public Cigar getCigar() {
|
||||||
|
Cigar cigar = new Cigar();
|
||||||
|
for(int i = 0; i < cigarOperators.length; i++) {
|
||||||
|
CigarOperator operator = CigarOperator.characterToEnum(cigarOperators[i]);
|
||||||
|
cigar.add(new CigarElement(cigarLengths[i],operator));
|
||||||
|
}
|
||||||
|
return cigar;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Temporarily implement getCigarString() for debugging; the TextCigarCodec is unfortunately
|
||||||
|
* package-protected.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public String getCigarString() {
|
||||||
|
Cigar cigar = getCigar();
|
||||||
|
if(cigar.isEmpty()) return "*";
|
||||||
|
|
||||||
|
StringBuilder cigarString = new StringBuilder();
|
||||||
|
for(CigarElement element: cigar.getCigarElements()) {
|
||||||
|
cigarString.append(element.getLength());
|
||||||
|
cigarString.append(element.getOperator());
|
||||||
|
}
|
||||||
|
return cigarString.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stub for inheritance.
|
||||||
|
*/
|
||||||
|
public Alignment() {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new alignment object.
|
||||||
|
* @param contigIndex The contig to which this read aligned.
|
||||||
|
* @param alignmentStart The point within the contig to which this read aligned.
|
||||||
|
* @param negativeStrand Forward or reverse alignment of the given read.
|
||||||
|
* @param mappingQuality How good does BWA think this mapping is?
|
||||||
|
* @param cigarOperators The ordered operators in the cigar string.
|
||||||
|
* @param cigarLengths The lengths to which each operator applies.
|
||||||
|
* @param editDistance The edit distance (cumulative) of the read.
|
||||||
|
* @param mismatchingPositions String representation of which bases in the read mismatch.
|
||||||
|
* @param numMismatches Number of total mismatches in the read.
|
||||||
|
* @param numGapOpens Number of gap opens in the read.
|
||||||
|
* @param numGapExtensions Number of gap extensions in the read.
|
||||||
|
* @param bestCount Number of best alignments in the read.
|
||||||
|
* @param secondBestCount Number of second best alignments in the read.
|
||||||
|
*/
|
||||||
|
public Alignment(int contigIndex,
|
||||||
|
int alignmentStart,
|
||||||
|
boolean negativeStrand,
|
||||||
|
int mappingQuality,
|
||||||
|
char[] cigarOperators,
|
||||||
|
int[] cigarLengths,
|
||||||
|
int editDistance,
|
||||||
|
String mismatchingPositions,
|
||||||
|
int numMismatches,
|
||||||
|
int numGapOpens,
|
||||||
|
int numGapExtensions,
|
||||||
|
int bestCount,
|
||||||
|
int secondBestCount) {
|
||||||
|
this.contigIndex = contigIndex;
|
||||||
|
this.alignmentStart = alignmentStart;
|
||||||
|
this.negativeStrand = negativeStrand;
|
||||||
|
this.mappingQuality = mappingQuality;
|
||||||
|
this.cigarOperators = cigarOperators;
|
||||||
|
this.cigarLengths = cigarLengths;
|
||||||
|
this.editDistance = editDistance;
|
||||||
|
this.mismatchingPositions = mismatchingPositions;
|
||||||
|
this.numMismatches = numMismatches;
|
||||||
|
this.numGapOpens = numGapOpens;
|
||||||
|
this.numGapExtensions = numGapExtensions;
|
||||||
|
this.bestCount = bestCount;
|
||||||
|
this.secondBestCount = secondBestCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a read directly from an alignment.
|
||||||
|
* @param alignment The alignment to convert to a read.
|
||||||
|
* @param unmappedRead Source of the unmapped read. Should have bases, quality scores, and flags.
|
||||||
|
* @param newSAMHeader The new SAM header to use in creating this read. Can be null, but if so, the sequence
|
||||||
|
* dictionary in the
|
||||||
|
* @return A mapped alignment.
|
||||||
|
*/
|
||||||
|
public static SAMRecord convertToRead(Alignment alignment, SAMRecord unmappedRead, SAMFileHeader newSAMHeader) {
|
||||||
|
SAMRecord read;
|
||||||
|
try {
|
||||||
|
read = (SAMRecord)unmappedRead.clone();
|
||||||
|
}
|
||||||
|
catch(CloneNotSupportedException ex) {
|
||||||
|
throw new ReviewedStingException("Unable to create aligned read from template.");
|
||||||
|
}
|
||||||
|
|
||||||
|
if(newSAMHeader != null)
|
||||||
|
read.setHeader(newSAMHeader);
|
||||||
|
|
||||||
|
// If we're realigning a previously aligned record, strip out the placement of the alignment.
|
||||||
|
read.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
|
||||||
|
read.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
|
||||||
|
read.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
|
||||||
|
read.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
|
||||||
|
|
||||||
|
if(alignment != null) {
|
||||||
|
read.setReadUnmappedFlag(false);
|
||||||
|
read.setReferenceIndex(alignment.getContigIndex());
|
||||||
|
read.setAlignmentStart((int)alignment.getAlignmentStart());
|
||||||
|
read.setReadNegativeStrandFlag(alignment.isNegativeStrand());
|
||||||
|
read.setMappingQuality(alignment.getMappingQuality());
|
||||||
|
read.setCigar(alignment.getCigar());
|
||||||
|
if(alignment.isNegativeStrand()) {
|
||||||
|
read.setReadBases(BaseUtils.simpleReverseComplement(read.getReadBases()));
|
||||||
|
read.setBaseQualities(Utils.reverse(read.getBaseQualities()));
|
||||||
|
}
|
||||||
|
read.setAttribute("NM",alignment.getEditDistance());
|
||||||
|
read.setAttribute("MD",alignment.getMismatchingPositions());
|
||||||
|
}
|
||||||
|
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,157 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.alignment;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them
|
||||||
|
* of their alignment data, realigning them, and making sure one of the best resulting realignments matches the original
|
||||||
|
* alignment from the input file.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class AlignmentValidationWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
/**
|
||||||
|
* The supporting BWT index generated using BWT.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
|
||||||
|
private String prefix = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The instance used to generate alignments.
|
||||||
|
*/
|
||||||
|
private BWACAligner aligner = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void initialize() {
|
||||||
|
if(prefix == null)
|
||||||
|
prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
|
||||||
|
BWTFiles bwtFiles = new BWTFiles(prefix);
|
||||||
|
BWAConfiguration configuration = new BWAConfiguration();
|
||||||
|
aligner = new BWACAligner(bwtFiles,configuration);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Aligns a read to the given reference.
|
||||||
|
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||||
|
* @param read Read to align.
|
||||||
|
* @return Number of reads aligned by this map (aka 1).
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
//logger.info(String.format("examining read %s", read.getReadName()));
|
||||||
|
|
||||||
|
byte[] bases = read.getReadBases();
|
||||||
|
if(read.getReadNegativeStrandFlag()) bases = BaseUtils.simpleReverseComplement(bases);
|
||||||
|
|
||||||
|
boolean matches = true;
|
||||||
|
Iterable<Alignment[]> alignments = aligner.getAllAlignments(bases);
|
||||||
|
Iterator<Alignment[]> alignmentIterator = alignments.iterator();
|
||||||
|
|
||||||
|
if(!alignmentIterator.hasNext()) {
|
||||||
|
matches = read.getReadUnmappedFlag();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Alignment[] alignmentsOfBestQuality = alignmentIterator.next();
|
||||||
|
for(Alignment alignment: alignmentsOfBestQuality) {
|
||||||
|
matches = (alignment.getContigIndex() == read.getReferenceIndex());
|
||||||
|
matches &= (alignment.getAlignmentStart() == read.getAlignmentStart());
|
||||||
|
matches &= (alignment.isNegativeStrand() == read.getReadNegativeStrandFlag());
|
||||||
|
matches &= (alignment.getCigar().equals(read.getCigar()));
|
||||||
|
matches &= (alignment.getMappingQuality() == read.getMappingQuality());
|
||||||
|
if(matches) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!matches) {
|
||||||
|
logger.error("Found mismatch!");
|
||||||
|
logger.error(String.format("Read %s:",read.getReadName()));
|
||||||
|
logger.error(String.format(" Contig index: %d",read.getReferenceIndex()));
|
||||||
|
logger.error(String.format(" Alignment start: %d", read.getAlignmentStart()));
|
||||||
|
logger.error(String.format(" Negative strand: %b", read.getReadNegativeStrandFlag()));
|
||||||
|
logger.error(String.format(" Cigar: %s%n", read.getCigarString()));
|
||||||
|
logger.error(String.format(" Mapping quality: %s%n", read.getMappingQuality()));
|
||||||
|
for(Alignment[] alignmentsByScore: alignments) {
|
||||||
|
for(int i = 0; i < alignmentsByScore.length; i++) {
|
||||||
|
logger.error(String.format("Alignment %d:",i));
|
||||||
|
logger.error(String.format(" Contig index: %d",alignmentsByScore[i].getContigIndex()));
|
||||||
|
logger.error(String.format(" Alignment start: %d", alignmentsByScore[i].getAlignmentStart()));
|
||||||
|
logger.error(String.format(" Negative strand: %b", alignmentsByScore[i].isNegativeStrand()));
|
||||||
|
logger.error(String.format(" Cigar: %s", alignmentsByScore[i].getCigarString()));
|
||||||
|
logger.error(String.format(" Mapping quality: %s%n", alignmentsByScore[i].getMappingQuality()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw new ReviewedStingException(String.format("Read %s mismatches!", read.getReadName()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initial value for reduce. In this case, validated reads will be counted.
|
||||||
|
* @return 0, indicating no reads yet validated.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the number of reads processed.
|
||||||
|
* @param value Number of reads processed by this map.
|
||||||
|
* @param sum Number of reads processed before this map.
|
||||||
|
* @return Number of reads processed up to and including this map.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return value + sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cleanup.
|
||||||
|
* @param result Number of reads processed.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
aligner.close();
|
||||||
|
super.onTraversalDone(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,134 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.alignment;
|
||||||
|
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format.
|
||||||
|
* Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
@WalkerName("Align")
|
||||||
|
public class AlignmentWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
@Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
|
||||||
|
"generated by bwa index -d bwtsw. If unspecified, will default " +
|
||||||
|
"to the reference specified via the -R argument.",required=false)
|
||||||
|
private File targetReferenceFile = null;
|
||||||
|
|
||||||
|
@Output
|
||||||
|
private StingSAMFileWriter out = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The actual aligner.
|
||||||
|
*/
|
||||||
|
private BWACAligner aligner = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* New header to use, if desired.
|
||||||
|
*/
|
||||||
|
private SAMFileHeader header;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void initialize() {
|
||||||
|
if(targetReferenceFile == null)
|
||||||
|
targetReferenceFile = getToolkit().getArguments().referenceFile;
|
||||||
|
BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
|
||||||
|
BWAConfiguration configuration = new BWAConfiguration();
|
||||||
|
aligner = new BWACAligner(bwtFiles,configuration);
|
||||||
|
|
||||||
|
// Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted.
|
||||||
|
header = getToolkit().getSAMFileHeader().clone();
|
||||||
|
SAMSequenceDictionary referenceDictionary =
|
||||||
|
ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
|
||||||
|
header.setSequenceDictionary(referenceDictionary);
|
||||||
|
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||||
|
|
||||||
|
out.writeHeader(header);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Aligns a read to the given reference.
|
||||||
|
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||||
|
* @param read Read to align.
|
||||||
|
* @return Number of alignments found for this read.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
SAMRecord alignedRead = aligner.align(read,header);
|
||||||
|
out.addAlignment(alignedRead);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initial value for reduce. In this case, alignments will be counted.
|
||||||
|
* @return 0, indicating no alignments yet found.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the number of alignments found.
|
||||||
|
* @param value Number of alignments found by this map.
|
||||||
|
* @param sum Number of alignments found before this map.
|
||||||
|
* @return Number of alignments found up to and including this map.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return value + sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cleanup.
|
||||||
|
* @param result Number of reads processed.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
aligner.close();
|
||||||
|
super.onTraversalDone(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,128 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.alignment;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.SortedMap;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the
|
||||||
|
* frequency of that number of placements.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class CountBestAlignmentsWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
/**
|
||||||
|
* The supporting BWT index generated using BWT.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
|
||||||
|
private String prefix = null;
|
||||||
|
|
||||||
|
@Output
|
||||||
|
private PrintStream out = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The actual aligner.
|
||||||
|
*/
|
||||||
|
private Aligner aligner = null;
|
||||||
|
|
||||||
|
private SortedMap<Integer,Integer> alignmentFrequencies = new TreeMap<Integer,Integer>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void initialize() {
|
||||||
|
if(prefix == null)
|
||||||
|
prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
|
||||||
|
BWTFiles bwtFiles = new BWTFiles(prefix);
|
||||||
|
BWAConfiguration configuration = new BWAConfiguration();
|
||||||
|
aligner = new BWACAligner(bwtFiles,configuration);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Aligns a read to the given reference.
|
||||||
|
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||||
|
* @param read Read to align.
|
||||||
|
* @return Number of alignments found for this read.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
|
Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
|
||||||
|
if(alignmentIterator.hasNext()) {
|
||||||
|
int numAlignments = alignmentIterator.next().length;
|
||||||
|
if(alignmentFrequencies.containsKey(numAlignments))
|
||||||
|
alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
|
||||||
|
else
|
||||||
|
alignmentFrequencies.put(numAlignments,1);
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initial value for reduce. In this case, validated reads will be counted.
|
||||||
|
* @return 0, indicating no reads yet validated.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the number of reads processed.
|
||||||
|
* @param value Number of reads processed by this map.
|
||||||
|
* @param sum Number of reads processed before this map.
|
||||||
|
* @return Number of reads processed up to and including this map.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return value + sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cleanup.
|
||||||
|
* @param result Number of reads processed.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
aligner.close();
|
||||||
|
for(Map.Entry<Integer,Integer> alignmentFrequency: alignmentFrequencies.entrySet())
|
||||||
|
out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
|
||||||
|
super.onTraversalDone(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.alignment.Aligner;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Align reads using BWA.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public abstract class BWAAligner implements Aligner {
|
||||||
|
/**
|
||||||
|
* The supporting files used by BWA.
|
||||||
|
*/
|
||||||
|
protected BWTFiles bwtFiles;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The current configuration for the BWA aligner.
|
||||||
|
*/
|
||||||
|
protected BWAConfiguration configuration;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new BWAAligner. Purpose of this call is to ensure that all BWA constructors accept the correct
|
||||||
|
* parameters.
|
||||||
|
* @param bwtFiles The many files representing BWTs persisted to disk.
|
||||||
|
* @param configuration Configuration parameters for the alignment.
|
||||||
|
*/
|
||||||
|
public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
|
||||||
|
this.bwtFiles = bwtFiles;
|
||||||
|
this.configuration = configuration;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update the configuration passed to the BWA aligner.
|
||||||
|
* @param configuration New configuration to set.
|
||||||
|
*/
|
||||||
|
public abstract void updateConfiguration(BWAConfiguration configuration);
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Configuration for the BWA/C aligner.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWAConfiguration {
|
||||||
|
/**
|
||||||
|
* The maximum edit distance used by BWA.
|
||||||
|
*/
|
||||||
|
public Float maximumEditDistance = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many gap opens are acceptable within this alignment?
|
||||||
|
*/
|
||||||
|
public Integer maximumGapOpens = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many gap extensions are acceptable within this alignment?
|
||||||
|
*/
|
||||||
|
public Integer maximumGapExtensions = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Do we disallow indels within a certain range from the start / end?
|
||||||
|
*/
|
||||||
|
public Integer disallowIndelWithinRange = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* What is the scoring penalty for a mismatch?
|
||||||
|
*/
|
||||||
|
public Integer mismatchPenalty = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* What is the scoring penalty for a gap open?
|
||||||
|
*/
|
||||||
|
public Integer gapOpenPenalty = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* What is the scoring penalty for a gap extension?
|
||||||
|
*/
|
||||||
|
public Integer gapExtensionPenalty = null;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,234 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
|
import net.sf.samtools.util.StringUtil;
|
||||||
|
import org.broadinstitute.sting.alignment.reference.bwt.*;
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Support files for BWT.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWTFiles {
|
||||||
|
/**
|
||||||
|
* ANN (?) file name.
|
||||||
|
*/
|
||||||
|
public final File annFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* AMB (?) file name.
|
||||||
|
*/
|
||||||
|
public final File ambFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Packed reference sequence file.
|
||||||
|
*/
|
||||||
|
public final File pacFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverse of packed reference sequence file.
|
||||||
|
*/
|
||||||
|
public final File rpacFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Forward BWT file.
|
||||||
|
*/
|
||||||
|
public final File forwardBWTFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Forward suffix array file.
|
||||||
|
*/
|
||||||
|
public final File forwardSAFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverse BWT file.
|
||||||
|
*/
|
||||||
|
public final File reverseBWTFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverse suffix array file.
|
||||||
|
*/
|
||||||
|
public final File reverseSAFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Where these files autogenerated on the fly?
|
||||||
|
*/
|
||||||
|
public final boolean autogenerated;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new BWA configuration file using the given prefix.
|
||||||
|
* @param prefix Prefix to use when creating the configuration. Must not be null.
|
||||||
|
*/
|
||||||
|
public BWTFiles(String prefix) {
|
||||||
|
if(prefix == null)
|
||||||
|
throw new ReviewedStingException("Prefix must not be null.");
|
||||||
|
annFile = new File(prefix + ".ann");
|
||||||
|
ambFile = new File(prefix + ".amb");
|
||||||
|
pacFile = new File(prefix + ".pac");
|
||||||
|
rpacFile = new File(prefix + ".rpac");
|
||||||
|
forwardBWTFile = new File(prefix + ".bwt");
|
||||||
|
forwardSAFile = new File(prefix + ".sa");
|
||||||
|
reverseBWTFile = new File(prefix + ".rbwt");
|
||||||
|
reverseSAFile = new File(prefix + ".rsa");
|
||||||
|
autogenerated = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Hand-create a new BWTFiles object, specifying a unique file object for each type.
|
||||||
|
* @param annFile ANN (alternate dictionary) file.
|
||||||
|
* @param ambFile AMB (holes) files.
|
||||||
|
* @param pacFile Packed representation of the forward reference sequence.
|
||||||
|
* @param forwardBWTFile BWT representation of the forward reference sequence.
|
||||||
|
* @param forwardSAFile SA representation of the forward reference sequence.
|
||||||
|
* @param rpacFile Packed representation of the reversed reference sequence.
|
||||||
|
* @param reverseBWTFile BWT representation of the reversed reference sequence.
|
||||||
|
* @param reverseSAFile SA representation of the reversed reference sequence.
|
||||||
|
*/
|
||||||
|
private BWTFiles(File annFile,
|
||||||
|
File ambFile,
|
||||||
|
File pacFile,
|
||||||
|
File forwardBWTFile,
|
||||||
|
File forwardSAFile,
|
||||||
|
File rpacFile,
|
||||||
|
File reverseBWTFile,
|
||||||
|
File reverseSAFile) {
|
||||||
|
this.annFile = annFile;
|
||||||
|
this.ambFile = ambFile;
|
||||||
|
this.pacFile = pacFile;
|
||||||
|
this.forwardBWTFile = forwardBWTFile;
|
||||||
|
this.forwardSAFile = forwardSAFile;
|
||||||
|
this.rpacFile = rpacFile;
|
||||||
|
this.reverseBWTFile = reverseBWTFile;
|
||||||
|
this.reverseSAFile = reverseSAFile;
|
||||||
|
autogenerated = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close out this files object, in the process deleting any temporary filse
|
||||||
|
* that were created.
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
if(autogenerated) {
|
||||||
|
boolean success = true;
|
||||||
|
success = annFile.delete();
|
||||||
|
success &= ambFile.delete();
|
||||||
|
success &= pacFile.delete();
|
||||||
|
success &= forwardBWTFile.delete();
|
||||||
|
success &= forwardSAFile.delete();
|
||||||
|
success &= rpacFile.delete();
|
||||||
|
success &= reverseBWTFile.delete();
|
||||||
|
success &= reverseSAFile.delete();
|
||||||
|
|
||||||
|
if(!success)
|
||||||
|
throw new ReviewedStingException("Unable to clean up autogenerated representation");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new set of BWT files from the given reference sequence.
|
||||||
|
* @param referenceSequence Sequence from which to build metadata.
|
||||||
|
* @return A new object representing encoded representations of each sequence.
|
||||||
|
*/
|
||||||
|
public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) {
|
||||||
|
byte[] normalizedReferenceSequence = new byte[referenceSequence.length];
|
||||||
|
System.arraycopy(referenceSequence,0,normalizedReferenceSequence,0,referenceSequence.length);
|
||||||
|
normalizeReferenceSequence(normalizedReferenceSequence);
|
||||||
|
|
||||||
|
File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile;
|
||||||
|
try {
|
||||||
|
// Write the ann and amb for this reference sequence.
|
||||||
|
annFile = File.createTempFile("bwt",".ann");
|
||||||
|
ambFile = File.createTempFile("bwt",".amb");
|
||||||
|
|
||||||
|
SAMSequenceDictionary dictionary = new SAMSequenceDictionary();
|
||||||
|
dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length));
|
||||||
|
|
||||||
|
ANNWriter annWriter = new ANNWriter(annFile);
|
||||||
|
annWriter.write(dictionary);
|
||||||
|
annWriter.close();
|
||||||
|
|
||||||
|
AMBWriter ambWriter = new AMBWriter(ambFile);
|
||||||
|
ambWriter.writeEmpty(dictionary);
|
||||||
|
ambWriter.close();
|
||||||
|
|
||||||
|
// Write the encoded files for the forward version of this reference sequence.
|
||||||
|
pacFile = File.createTempFile("bwt",".pac");
|
||||||
|
bwtFile = File.createTempFile("bwt",".bwt");
|
||||||
|
saFile = File.createTempFile("bwt",".sa");
|
||||||
|
|
||||||
|
writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile);
|
||||||
|
|
||||||
|
// Write the encoded files for the reverse version of this reference sequence.
|
||||||
|
byte[] reverseReferenceSequence = Utils.reverse(normalizedReferenceSequence);
|
||||||
|
|
||||||
|
rpacFile = File.createTempFile("bwt",".rpac");
|
||||||
|
rbwtFile = File.createTempFile("bwt",".rbwt");
|
||||||
|
rsaFile = File.createTempFile("bwt",".rsa");
|
||||||
|
|
||||||
|
writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile);
|
||||||
|
}
|
||||||
|
catch(IOException ex) {
|
||||||
|
throw new ReviewedStingException("Unable to write autogenerated reference sequence to temporary files");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure that, at the very least, all temporary files are deleted on exit.
|
||||||
|
annFile.deleteOnExit();
|
||||||
|
ambFile.deleteOnExit();
|
||||||
|
pacFile.deleteOnExit();
|
||||||
|
bwtFile.deleteOnExit();
|
||||||
|
saFile.deleteOnExit();
|
||||||
|
rpacFile.deleteOnExit();
|
||||||
|
rbwtFile.deleteOnExit();
|
||||||
|
rsaFile.deleteOnExit();
|
||||||
|
|
||||||
|
return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write the encoded form of the reference sequence. In the case of BWA, the encoded reference
|
||||||
|
* sequence is the reference itself in PAC format, the BWT, and the suffix array.
|
||||||
|
* @param referenceSequence The reference sequence to encode.
|
||||||
|
* @param pacFile Target for the PAC-encoded reference.
|
||||||
|
* @param bwtFile Target for the BWT representation of the reference.
|
||||||
|
* @param suffixArrayFile Target for the suffix array encoding of the reference.
|
||||||
|
* @throws java.io.IOException In case of issues writing to the file.
|
||||||
|
*/
|
||||||
|
private static void writeEncodedReferenceSequence(byte[] referenceSequence,
|
||||||
|
File pacFile,
|
||||||
|
File bwtFile,
|
||||||
|
File suffixArrayFile) throws IOException {
|
||||||
|
PackUtils.writeReferenceSequence(pacFile,referenceSequence);
|
||||||
|
|
||||||
|
BWT bwt = BWT.createFromReferenceSequence(referenceSequence);
|
||||||
|
BWTWriter bwtWriter = new BWTWriter(bwtFile);
|
||||||
|
bwtWriter.write(bwt);
|
||||||
|
bwtWriter.close();
|
||||||
|
|
||||||
|
SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
|
||||||
|
SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile);
|
||||||
|
suffixArrayWriter.write(suffixArray);
|
||||||
|
suffixArrayWriter.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert the given reference sequence into a form suitable for building into
|
||||||
|
* on-the-fly sequences.
|
||||||
|
* @param referenceSequence The reference sequence to normalize.
|
||||||
|
* @throws org.broadinstitute.sting.utils.exceptions.ReviewedStingException if normalized sequence cannot be generated.
|
||||||
|
*/
|
||||||
|
private static void normalizeReferenceSequence(byte[] referenceSequence) {
|
||||||
|
StringUtil.toUpperCase(referenceSequence);
|
||||||
|
for(byte base: referenceSequence) {
|
||||||
|
if(base != 'A' && base != 'C' && base != 'G' && base != 'T')
|
||||||
|
throw new ReviewedStingException(String.format("Base type %c is not supported when building references on-the-fly",(char)base));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,259 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa.c;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.alignment.Alignment;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWAAligner;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An aligner using the BWA/C implementation.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWACAligner extends BWAAligner {
|
||||||
|
static {
|
||||||
|
System.loadLibrary("bwa");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A pointer to the C++ object representing the BWA engine.
|
||||||
|
*/
|
||||||
|
private long thunkPointer = 0;
|
||||||
|
|
||||||
|
public BWACAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
|
||||||
|
super(bwtFiles,configuration);
|
||||||
|
if(thunkPointer != 0)
|
||||||
|
throw new ReviewedStingException("BWA/C attempting to reinitialize.");
|
||||||
|
|
||||||
|
if(!bwtFiles.annFile.exists()) throw new ReviewedStingException("ANN file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||||
|
if(!bwtFiles.ambFile.exists()) throw new ReviewedStingException("AMB file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||||
|
if(!bwtFiles.pacFile.exists()) throw new ReviewedStingException("PAC file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||||
|
if(!bwtFiles.forwardBWTFile.exists()) throw new ReviewedStingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||||
|
if(!bwtFiles.forwardSAFile.exists()) throw new ReviewedStingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||||
|
if(!bwtFiles.reverseBWTFile.exists()) throw new ReviewedStingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||||
|
if(!bwtFiles.reverseSAFile.exists()) throw new ReviewedStingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it.");
|
||||||
|
|
||||||
|
thunkPointer = create(bwtFiles,configuration);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an aligner object using an array of bytes as a reference.
|
||||||
|
* @param referenceSequence Reference sequence to encode ad-hoc.
|
||||||
|
* @param configuration Configuration for the given aligner.
|
||||||
|
*/
|
||||||
|
public BWACAligner(byte[] referenceSequence, BWAConfiguration configuration) {
|
||||||
|
this(BWTFiles.createFromReferenceSequence(referenceSequence),configuration);
|
||||||
|
// Now that the temporary files are created, the temporary files can be destroyed.
|
||||||
|
bwtFiles.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update the configuration passed to the BWA aligner.
|
||||||
|
* @param configuration New configuration to set.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void updateConfiguration(BWAConfiguration configuration) {
|
||||||
|
if(thunkPointer == 0)
|
||||||
|
throw new ReviewedStingException("BWA/C: attempting to update configuration of uninitialized aligner.");
|
||||||
|
updateConfiguration(thunkPointer,configuration);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close this instance of the BWA pointer and delete its resources.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
if(thunkPointer == 0)
|
||||||
|
throw new ReviewedStingException("BWA/C close attempted, but BWA/C is not properly initialized.");
|
||||||
|
destroy(thunkPointer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allow the aligner to choose one alignment randomly from the pile of best alignments.
|
||||||
|
* @param bases Bases to align.
|
||||||
|
* @return An align
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Alignment getBestAlignment(final byte[] bases) {
|
||||||
|
if(thunkPointer == 0)
|
||||||
|
throw new ReviewedStingException("BWA/C getBestAlignment attempted, but BWA/C is not properly initialized.");
|
||||||
|
return getBestAlignment(thunkPointer,bases);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the best aligned read, chosen randomly from the pile of best alignments.
|
||||||
|
* @param read Read to align.
|
||||||
|
* @param newHeader New header to apply to this SAM file. Can be null, but if so, read header must be valid.
|
||||||
|
* @return Read with injected alignment data.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public SAMRecord align(final SAMRecord read, final SAMFileHeader newHeader) {
|
||||||
|
if(bwtFiles.autogenerated)
|
||||||
|
throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable");
|
||||||
|
return Alignment.convertToRead(getBestAlignment(read.getReadBases()),read,newHeader);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a iterator of alignments, batched by mapping quality.
|
||||||
|
* @param bases List of bases.
|
||||||
|
* @return Iterator to alignments.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Iterable<Alignment[]> getAllAlignments(final byte[] bases) {
|
||||||
|
final BWAPath[] paths = getPaths(bases);
|
||||||
|
return new Iterable<Alignment[]>() {
|
||||||
|
public Iterator<Alignment[]> iterator() {
|
||||||
|
return new Iterator<Alignment[]>() {
|
||||||
|
/**
|
||||||
|
* The last position accessed.
|
||||||
|
*/
|
||||||
|
private int position = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether all alignments have been seen based on the current position.
|
||||||
|
* @return True if any more alignments are pending. False otherwise.
|
||||||
|
*/
|
||||||
|
public boolean hasNext() { return position < paths.length; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the next cross-section of alignments, based on mapping quality.
|
||||||
|
* @return Array of the next set of alignments of a given mapping quality.
|
||||||
|
*/
|
||||||
|
public Alignment[] next() {
|
||||||
|
if(position >= paths.length)
|
||||||
|
throw new UnsupportedOperationException("Out of alignments to return.");
|
||||||
|
int score = paths[position].score;
|
||||||
|
int startingPosition = position;
|
||||||
|
while(position < paths.length && paths[position].score == score) position++;
|
||||||
|
return convertPathsToAlignments(bases,Arrays.copyOfRange(paths,startingPosition,position));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unsupported.
|
||||||
|
*/
|
||||||
|
public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); }
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a iterator of aligned reads, batched by mapping quality.
|
||||||
|
* @param read Read to align.
|
||||||
|
* @param newHeader Optional new header to use when aligning the read. If present, it must be null.
|
||||||
|
* @return Iterator to alignments.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader) {
|
||||||
|
if(bwtFiles.autogenerated)
|
||||||
|
throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable");
|
||||||
|
final Iterable<Alignment[]> alignments = getAllAlignments(read.getReadBases());
|
||||||
|
return new Iterable<SAMRecord[]>() {
|
||||||
|
public Iterator<SAMRecord[]> iterator() {
|
||||||
|
final Iterator<Alignment[]> alignmentIterator = alignments.iterator();
|
||||||
|
return new Iterator<SAMRecord[]>() {
|
||||||
|
/**
|
||||||
|
* Whether all alignments have been seen based on the current position.
|
||||||
|
* @return True if any more alignments are pending. False otherwise.
|
||||||
|
*/
|
||||||
|
public boolean hasNext() { return alignmentIterator.hasNext(); }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the next cross-section of alignments, based on mapping quality.
|
||||||
|
* @return Array of the next set of alignments of a given mapping quality.
|
||||||
|
*/
|
||||||
|
public SAMRecord[] next() {
|
||||||
|
Alignment[] alignmentsOfQuality = alignmentIterator.next();
|
||||||
|
SAMRecord[] reads = new SAMRecord[alignmentsOfQuality.length];
|
||||||
|
for(int i = 0; i < alignmentsOfQuality.length; i++) {
|
||||||
|
reads[i] = Alignment.convertToRead(alignmentsOfQuality[i],read,newHeader);
|
||||||
|
}
|
||||||
|
return reads;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unsupported.
|
||||||
|
*/
|
||||||
|
public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); }
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the paths associated with the given base string.
|
||||||
|
* @param bases List of bases.
|
||||||
|
* @return A set of paths through the BWA.
|
||||||
|
*/
|
||||||
|
public BWAPath[] getPaths(byte[] bases) {
|
||||||
|
if(thunkPointer == 0)
|
||||||
|
throw new ReviewedStingException("BWA/C getPaths attempted, but BWA/C is not properly initialized.");
|
||||||
|
return getPaths(thunkPointer,bases);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a pointer to the BWA/C thunk.
|
||||||
|
* @param files BWT source files.
|
||||||
|
* @param configuration Configuration of the aligner.
|
||||||
|
* @return Pointer to the BWA/C thunk.
|
||||||
|
*/
|
||||||
|
protected native long create(BWTFiles files, BWAConfiguration configuration);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update the configuration passed to the BWA aligner. For internal use only.
|
||||||
|
* @param thunkPointer pointer to BWA object.
|
||||||
|
* @param configuration New configuration to set.
|
||||||
|
*/
|
||||||
|
protected native void updateConfiguration(long thunkPointer, BWAConfiguration configuration);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Destroy the BWA/C thunk.
|
||||||
|
* @param thunkPointer Pointer to the allocated thunk.
|
||||||
|
*/
|
||||||
|
protected native void destroy(long thunkPointer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Do the extra steps involved in converting a local alignment to a global alignment.
|
||||||
|
* @param bases ASCII representation of byte array.
|
||||||
|
* @param paths Paths through the current BWT.
|
||||||
|
* @return A list of alignments.
|
||||||
|
*/
|
||||||
|
protected Alignment[] convertPathsToAlignments(byte[] bases, BWAPath[] paths) {
|
||||||
|
if(thunkPointer == 0)
|
||||||
|
throw new ReviewedStingException("BWA/C convertPathsToAlignments attempted, but BWA/C is not properly initialized.");
|
||||||
|
return convertPathsToAlignments(thunkPointer,bases,paths);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Caller to the path generation functionality within BWA/C. Call this method's getPaths() wrapper (above) instead.
|
||||||
|
* @param thunkPointer pointer to the C++ object managing BWA/C.
|
||||||
|
* @param bases ASCII representation of byte array.
|
||||||
|
* @return A list of paths through the specified BWT.
|
||||||
|
*/
|
||||||
|
protected native BWAPath[] getPaths(long thunkPointer, byte[] bases);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Do the extra steps involved in converting a local alignment to a global alignment.
|
||||||
|
* Call this method's convertPathsToAlignments() wrapper (above) instead.
|
||||||
|
* @param thunkPointer pointer to the C++ object managing BWA/C.
|
||||||
|
* @param bases ASCII representation of byte array.
|
||||||
|
* @param paths Paths through the current BWT.
|
||||||
|
* @return A list of alignments.
|
||||||
|
*/
|
||||||
|
protected native Alignment[] convertPathsToAlignments(long thunkPointer, byte[] bases, BWAPath[] paths);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the best alignment from BWA/C, randomly selected from all best-aligned reads.
|
||||||
|
* @param thunkPointer Pointer to BWA thunk.
|
||||||
|
* @param bases bases to align.
|
||||||
|
* @return The best alignment from BWA/C.
|
||||||
|
*/
|
||||||
|
protected native Alignment getBestAlignment(long thunkPointer, byte[] bases);
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,101 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.alignment.bwa.c;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Models a BWA path.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWAPath {
|
||||||
|
/**
|
||||||
|
* Number of mismatches encountered along this path.
|
||||||
|
*/
|
||||||
|
public final int numMismatches;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of gap opens encountered along this path.
|
||||||
|
*/
|
||||||
|
public final int numGapOpens;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of gap extensions along this path.
|
||||||
|
*/
|
||||||
|
public final int numGapExtensions;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether this alignment was found on the positive or negative strand.
|
||||||
|
*/
|
||||||
|
public final boolean negativeStrand;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Starting coordinate in the BWT.
|
||||||
|
*/
|
||||||
|
public final long k;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ending coordinate in the BWT.
|
||||||
|
*/
|
||||||
|
public final long l;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The score of this path.
|
||||||
|
*/
|
||||||
|
public final int score;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of best alignments seen along this path.
|
||||||
|
*/
|
||||||
|
public final int bestCount;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of second best alignments seen along this path.
|
||||||
|
*/
|
||||||
|
public final int secondBestCount;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new path with the given attributes.
|
||||||
|
* @param numMismatches Number of mismatches along path.
|
||||||
|
* @param numGapOpens Number of gap opens along path.
|
||||||
|
* @param numGapExtensions Number of gap extensions along path.
|
||||||
|
* @param k Index to first coordinate within BWT.
|
||||||
|
* @param l Index to last coordinate within BWT.
|
||||||
|
* @param score Score of this alignment. Not the mapping quality.
|
||||||
|
*/
|
||||||
|
public BWAPath(int numMismatches, int numGapOpens, int numGapExtensions, boolean negativeStrand, long k, long l, int score, int bestCount, int secondBestCount) {
|
||||||
|
this.numMismatches = numMismatches;
|
||||||
|
this.numGapOpens = numGapOpens;
|
||||||
|
this.numGapExtensions = numGapExtensions;
|
||||||
|
this.negativeStrand = negativeStrand;
|
||||||
|
this.k = k;
|
||||||
|
this.l = l;
|
||||||
|
this.score = score;
|
||||||
|
this.bestCount = bestCount;
|
||||||
|
this.secondBestCount = secondBestCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,164 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa.java;
|
||||||
|
|
||||||
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
|
import net.sf.samtools.*;
|
||||||
|
import org.broadinstitute.sting.alignment.Aligner;
|
||||||
|
import org.broadinstitute.sting.alignment.Alignment;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A test harness to ensure that the perfect aligner works.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class AlignerTestHarness {
|
||||||
|
public static void main( String argv[] ) throws FileNotFoundException {
|
||||||
|
if( argv.length != 6 ) {
|
||||||
|
System.out.println("PerfectAlignerTestHarness <fasta> <bwt> <rbwt> <sa> <rsa> <bam>");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
File referenceFile = new File(argv[0]);
|
||||||
|
File bwtFile = new File(argv[1]);
|
||||||
|
File rbwtFile = new File(argv[2]);
|
||||||
|
File suffixArrayFile = new File(argv[3]);
|
||||||
|
File reverseSuffixArrayFile = new File(argv[4]);
|
||||||
|
File bamFile = new File(argv[5]);
|
||||||
|
|
||||||
|
align(referenceFile,bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile,bamFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void align(File referenceFile, File bwtFile, File rbwtFile, File suffixArrayFile, File reverseSuffixArrayFile, File bamFile) throws FileNotFoundException {
|
||||||
|
Aligner aligner = new BWAJavaAligner(bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile);
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
SAMFileReader reader = new SAMFileReader(bamFile);
|
||||||
|
reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
|
||||||
|
|
||||||
|
int mismatches = 0;
|
||||||
|
int failures = 0;
|
||||||
|
|
||||||
|
for(SAMRecord read: reader) {
|
||||||
|
count++;
|
||||||
|
if( count > 200000 ) break;
|
||||||
|
//if( count < 366000 ) continue;
|
||||||
|
//if( count > 2 ) break;
|
||||||
|
//if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") )
|
||||||
|
// continue;
|
||||||
|
//if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") )
|
||||||
|
// continue;
|
||||||
|
//if( !read.getReadName().endsWith("SL-XBC:1:60:1342:1340#0") )
|
||||||
|
// continue;
|
||||||
|
|
||||||
|
SAMRecord alignmentCleaned = null;
|
||||||
|
try {
|
||||||
|
alignmentCleaned = (SAMRecord)read.clone();
|
||||||
|
}
|
||||||
|
catch( CloneNotSupportedException ex ) {
|
||||||
|
throw new ReviewedStingException("SAMRecord clone not supported", ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
if( alignmentCleaned.getReadNegativeStrandFlag() )
|
||||||
|
alignmentCleaned.setReadBases(BaseUtils.simpleReverseComplement(alignmentCleaned.getReadBases()));
|
||||||
|
|
||||||
|
alignmentCleaned.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
|
||||||
|
alignmentCleaned.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
|
||||||
|
alignmentCleaned.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
|
||||||
|
alignmentCleaned.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
|
||||||
|
|
||||||
|
// Clear everything except flags pertaining to pairing and set 'unmapped' status to true.
|
||||||
|
alignmentCleaned.setFlags(alignmentCleaned.getFlags() & 0x00A1 | 0x000C);
|
||||||
|
|
||||||
|
Iterable<Alignment[]> alignments = aligner.getAllAlignments(alignmentCleaned.getReadBases());
|
||||||
|
if(!alignments.iterator().hasNext() ) {
|
||||||
|
//throw new StingException(String.format("Unable to align read %s to reference; count = %d",read.getReadName(),count));
|
||||||
|
System.out.printf("Unable to align read %s to reference; count = %d%n",read.getReadName(),count);
|
||||||
|
failures++;
|
||||||
|
}
|
||||||
|
|
||||||
|
Alignment foundAlignment = null;
|
||||||
|
for(Alignment[] alignmentsOfQuality: alignments) {
|
||||||
|
for(Alignment alignment: alignmentsOfQuality) {
|
||||||
|
if( read.getReadNegativeStrandFlag() != alignment.isNegativeStrand() )
|
||||||
|
continue;
|
||||||
|
if( read.getAlignmentStart() != alignment.getAlignmentStart() )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
foundAlignment = alignment;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( foundAlignment != null ) {
|
||||||
|
//System.out.printf("%s: Aligned read to reference at position %d with %d mismatches, %d gap opens, and %d gap extensions.%n", read.getReadName(), foundAlignment.getAlignmentStart(), foundAlignment.getMismatches(), foundAlignment.getGapOpens(), foundAlignment.getGapExtensions());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
System.out.printf("Error aligning read %s%n", read.getReadName());
|
||||||
|
|
||||||
|
mismatches++;
|
||||||
|
|
||||||
|
IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
|
||||||
|
|
||||||
|
System.out.printf("read = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(read.getReadString(),read.getCigar(),CigarOperator.DELETION),
|
||||||
|
read.getAlignmentStart(),
|
||||||
|
read.getReadNegativeStrandFlag());
|
||||||
|
int numDeletions = numDeletionsInCigar(read.getCigar());
|
||||||
|
String expectedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),read.getAlignmentStart(),read.getAlignmentStart()+read.getReadLength()+numDeletions-1).getBases());
|
||||||
|
System.out.printf("expected ref = %s%n", formatBasesBasedOnCigar(expectedRef,read.getCigar(),CigarOperator.INSERTION));
|
||||||
|
|
||||||
|
for(Alignment[] alignmentsOfQuality: alignments) {
|
||||||
|
for(Alignment alignment: alignmentsOfQuality) {
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
Cigar cigar = ((BWAAlignment)alignment).getCigar();
|
||||||
|
|
||||||
|
System.out.printf("read = %s%n", formatBasesBasedOnCigar(read.getReadString(),cigar,CigarOperator.DELETION));
|
||||||
|
|
||||||
|
int deletionCount = ((BWAAlignment)alignment).getNumberOfBasesMatchingState(AlignmentState.DELETION);
|
||||||
|
String alignedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignment.getAlignmentStart(),alignment.getAlignmentStart()+read.getReadLength()+deletionCount-1).getBases());
|
||||||
|
System.out.printf("actual ref = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(alignedRef,cigar,CigarOperator.INSERTION),
|
||||||
|
alignment.getAlignmentStart(),
|
||||||
|
alignment.isNegativeStrand());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//throw new StingException(String.format("Read %s was placed at incorrect location; count = %d%n",read.getReadName(),count));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if( count % 1000 == 0 )
|
||||||
|
System.out.printf("%d reads examined.%n",count);
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.printf("%d reads examined; %d mismatches; %d failures.%n",count,mismatches,failures);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String formatBasesBasedOnCigar( String bases, Cigar cigar, CigarOperator toBlank ) {
|
||||||
|
StringBuilder formatted = new StringBuilder();
|
||||||
|
int readIndex = 0;
|
||||||
|
for(CigarElement cigarElement: cigar.getCigarElements()) {
|
||||||
|
if(cigarElement.getOperator() == toBlank) {
|
||||||
|
int number = cigarElement.getLength();
|
||||||
|
while( number-- > 0 ) formatted.append(' ');
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int number = cigarElement.getLength();
|
||||||
|
while( number-- > 0 ) formatted.append(bases.charAt(readIndex++));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return formatted.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int numDeletionsInCigar( Cigar cigar ) {
|
||||||
|
int numDeletions = 0;
|
||||||
|
for(CigarElement cigarElement: cigar.getCigarElements()) {
|
||||||
|
if(cigarElement.getOperator() == CigarOperator.DELETION)
|
||||||
|
numDeletions += cigarElement.getLength();
|
||||||
|
}
|
||||||
|
return numDeletions;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,150 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa.java;
|
||||||
|
|
||||||
|
import net.sf.samtools.Cigar;
|
||||||
|
import net.sf.samtools.CigarElement;
|
||||||
|
import net.sf.samtools.CigarOperator;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.ArrayDeque;
|
||||||
|
import java.util.Deque;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents a sequence of matches.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class AlignmentMatchSequence implements Cloneable {
|
||||||
|
/**
|
||||||
|
* Stores the particular match entries in the order they occur.
|
||||||
|
*/
|
||||||
|
private Deque<AlignmentMatchSequenceEntry> entries = new ArrayDeque<AlignmentMatchSequenceEntry>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clone the given match sequence.
|
||||||
|
* @return A deep copy of the current match sequence.
|
||||||
|
*/
|
||||||
|
public AlignmentMatchSequence clone() {
|
||||||
|
AlignmentMatchSequence copy = null;
|
||||||
|
try {
|
||||||
|
copy = (AlignmentMatchSequence)super.clone();
|
||||||
|
}
|
||||||
|
catch( CloneNotSupportedException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to clone AlignmentMatchSequence.");
|
||||||
|
}
|
||||||
|
|
||||||
|
copy.entries = new ArrayDeque<AlignmentMatchSequenceEntry>();
|
||||||
|
for( AlignmentMatchSequenceEntry entry: entries )
|
||||||
|
copy.entries.add(entry.clone());
|
||||||
|
|
||||||
|
return copy;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Cigar convertToCigar(boolean negativeStrand) {
|
||||||
|
Cigar cigar = new Cigar();
|
||||||
|
Iterator<AlignmentMatchSequenceEntry> iterator = negativeStrand ? entries.descendingIterator() : entries.iterator();
|
||||||
|
while( iterator.hasNext() ) {
|
||||||
|
AlignmentMatchSequenceEntry entry = iterator.next();
|
||||||
|
CigarOperator operator;
|
||||||
|
switch( entry.getAlignmentState() ) {
|
||||||
|
case MATCH_MISMATCH: operator = CigarOperator.MATCH_OR_MISMATCH; break;
|
||||||
|
case INSERTION: operator = CigarOperator.INSERTION; break;
|
||||||
|
case DELETION: operator = CigarOperator.DELETION; break;
|
||||||
|
default: throw new ReviewedStingException("convertToCigar: cannot process state: " + entry.getAlignmentState());
|
||||||
|
}
|
||||||
|
cigar.add( new CigarElement(entry.count,operator) );
|
||||||
|
}
|
||||||
|
return cigar;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* All a new alignment of the given state.
|
||||||
|
* @param state State to add to the sequence.
|
||||||
|
*/
|
||||||
|
public void addNext( AlignmentState state ) {
|
||||||
|
AlignmentMatchSequenceEntry last = entries.peekLast();
|
||||||
|
// If the last entry is the same as this one, increment it. Otherwise, add a new entry.
|
||||||
|
if( last != null && last.alignmentState == state )
|
||||||
|
last.increment();
|
||||||
|
else
|
||||||
|
entries.add(new AlignmentMatchSequenceEntry(state));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the current state of this alignment (what's the state of the last base?)
|
||||||
|
* @return State of the most recently aligned base.
|
||||||
|
*/
|
||||||
|
public AlignmentState getCurrentState() {
|
||||||
|
if( entries.size() == 0 )
|
||||||
|
return AlignmentState.MATCH_MISMATCH;
|
||||||
|
return entries.peekLast().getAlignmentState();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bases in the read match the given state.
|
||||||
|
* @param state State to test.
|
||||||
|
* @return number of bases which match that state.
|
||||||
|
*/
|
||||||
|
public int getNumberOfBasesMatchingState(AlignmentState state) {
|
||||||
|
int matches = 0;
|
||||||
|
for( AlignmentMatchSequenceEntry entry: entries ) {
|
||||||
|
if( entry.getAlignmentState() == state )
|
||||||
|
matches += entry.count;
|
||||||
|
}
|
||||||
|
return matches;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stores an individual match sequence entry.
|
||||||
|
*/
|
||||||
|
private class AlignmentMatchSequenceEntry implements Cloneable {
|
||||||
|
/**
|
||||||
|
* The state of the alignment throughout a given point in the sequence.
|
||||||
|
*/
|
||||||
|
private final AlignmentState alignmentState;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of bases having this particular state.
|
||||||
|
*/
|
||||||
|
private int count;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new sequence entry with the given state.
|
||||||
|
* @param alignmentState The state that this sequence should contain.
|
||||||
|
*/
|
||||||
|
AlignmentMatchSequenceEntry( AlignmentState alignmentState ) {
|
||||||
|
this.alignmentState = alignmentState;
|
||||||
|
this.count = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clone the given match sequence entry.
|
||||||
|
* @return A deep copy of the current match sequence entry.
|
||||||
|
*/
|
||||||
|
public AlignmentMatchSequenceEntry clone() {
|
||||||
|
try {
|
||||||
|
return (AlignmentMatchSequenceEntry)super.clone();
|
||||||
|
}
|
||||||
|
catch( CloneNotSupportedException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to clone AlignmentMatchSequenceEntry.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the current state of the alignment.
|
||||||
|
* @return The state of the current sequence.
|
||||||
|
*/
|
||||||
|
AlignmentState getAlignmentState() {
|
||||||
|
return alignmentState;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Increment the count of alignments having this particular state.
|
||||||
|
*/
|
||||||
|
void increment() {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa.java;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The state of a given base in the alignment.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public enum AlignmentState {
|
||||||
|
MATCH_MISMATCH,
|
||||||
|
INSERTION,
|
||||||
|
DELETION
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,190 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa.java;
|
||||||
|
|
||||||
|
import net.sf.samtools.Cigar;
|
||||||
|
import org.broadinstitute.sting.alignment.Alignment;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An alignment object to be used incrementally as the BWA aligner
|
||||||
|
* inspects the read.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWAAlignment extends Alignment implements Cloneable {
|
||||||
|
/**
|
||||||
|
* Track the number of alignments that have been created.
|
||||||
|
*/
|
||||||
|
private static long numCreated;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Which number alignment is this?
|
||||||
|
*/
|
||||||
|
private long creationNumber;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The aligner performing the alignments.
|
||||||
|
*/
|
||||||
|
protected BWAJavaAligner aligner;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The sequence of matches/mismatches/insertions/deletions.
|
||||||
|
*/
|
||||||
|
private AlignmentMatchSequence alignmentMatchSequence = new AlignmentMatchSequence();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Working variable. How many bases have been matched at this point.
|
||||||
|
*/
|
||||||
|
protected int position;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Working variable. How many mismatches have been encountered at this point.
|
||||||
|
*/
|
||||||
|
private int mismatches;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of gap opens in alignment.
|
||||||
|
*/
|
||||||
|
private int gapOpens;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of gap extensions in alignment.
|
||||||
|
*/
|
||||||
|
private int gapExtensions;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Working variable. The lower bound of the alignment within the BWT.
|
||||||
|
*/
|
||||||
|
protected long loBound;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Working variable. The upper bound of the alignment within the BWT.
|
||||||
|
*/
|
||||||
|
protected long hiBound;
|
||||||
|
|
||||||
|
protected void setAlignmentStart(long position) {
|
||||||
|
this.alignmentStart = position;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setNegativeStrand(boolean negativeStrand) {
|
||||||
|
this.negativeStrand = negativeStrand;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cache the score.
|
||||||
|
*/
|
||||||
|
private int score;
|
||||||
|
|
||||||
|
public Cigar getCigar() {
|
||||||
|
return alignmentMatchSequence.convertToCigar(isNegativeStrand());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the current state of this alignment (state of the last base viewed)..
|
||||||
|
* @return Current state of the alignment.
|
||||||
|
*/
|
||||||
|
public AlignmentState getCurrentState() {
|
||||||
|
return alignmentMatchSequence.getCurrentState();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds the given state to the current alignment.
|
||||||
|
* @param state State to add to the given alignment.
|
||||||
|
*/
|
||||||
|
public void addState( AlignmentState state ) {
|
||||||
|
alignmentMatchSequence.addNext(state);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the BWA score of this alignment.
|
||||||
|
* @return BWA-style scores. 0 is best.
|
||||||
|
*/
|
||||||
|
public int getScore() {
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMismatches() { return mismatches; }
|
||||||
|
public int getGapOpens() { return gapOpens; }
|
||||||
|
public int getGapExtensions() { return gapExtensions; }
|
||||||
|
|
||||||
|
public void incrementMismatches() {
|
||||||
|
this.mismatches++;
|
||||||
|
updateScore();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void incrementGapOpens() {
|
||||||
|
this.gapOpens++;
|
||||||
|
updateScore();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void incrementGapExtensions() {
|
||||||
|
this.gapExtensions++;
|
||||||
|
updateScore();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates the score based on new information about matches / mismatches.
|
||||||
|
*/
|
||||||
|
private void updateScore() {
|
||||||
|
score = mismatches*aligner.MISMATCH_PENALTY + gapOpens*aligner.GAP_OPEN_PENALTY + gapExtensions*aligner.GAP_EXTENSION_PENALTY;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new alignment with the given parent aligner.
|
||||||
|
* @param aligner Aligner being used.
|
||||||
|
*/
|
||||||
|
public BWAAlignment( BWAJavaAligner aligner ) {
|
||||||
|
this.aligner = aligner;
|
||||||
|
this.creationNumber = numCreated++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clone the alignment.
|
||||||
|
* @return New instance of the alignment.
|
||||||
|
*/
|
||||||
|
public BWAAlignment clone() {
|
||||||
|
BWAAlignment newAlignment = null;
|
||||||
|
try {
|
||||||
|
newAlignment = (BWAAlignment)super.clone();
|
||||||
|
}
|
||||||
|
catch( CloneNotSupportedException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to clone BWAAlignment.");
|
||||||
|
}
|
||||||
|
newAlignment.creationNumber = numCreated++;
|
||||||
|
newAlignment.alignmentMatchSequence = alignmentMatchSequence.clone();
|
||||||
|
|
||||||
|
return newAlignment;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bases in the read match the given state.
|
||||||
|
* @param state State to test.
|
||||||
|
* @return number of bases which match that state.
|
||||||
|
*/
|
||||||
|
public int getNumberOfBasesMatchingState(AlignmentState state) {
|
||||||
|
return alignmentMatchSequence.getNumberOfBasesMatchingState(state);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare this alignment to another alignment.
|
||||||
|
* @param rhs Other alignment to which to compare.
|
||||||
|
* @return < 0 if this < other, == 0 if this == other, > 0 if this > other
|
||||||
|
*/
|
||||||
|
public int compareTo(Alignment rhs) {
|
||||||
|
BWAAlignment other = (BWAAlignment)rhs;
|
||||||
|
|
||||||
|
// If the scores are different, disambiguate using the score.
|
||||||
|
if(score != other.score)
|
||||||
|
return score > other.score ? 1 : -1;
|
||||||
|
|
||||||
|
// Otherwise, use the order in which the elements were created.
|
||||||
|
if(creationNumber != other.creationNumber)
|
||||||
|
return creationNumber > other.creationNumber ? -1 : 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return String.format("position: %d, strand: %b, state: %s, mismatches: %d, gap opens: %d, gap extensions: %d, loBound: %d, hiBound: %d, score: %d, creationNumber: %d", position, negativeStrand, alignmentMatchSequence.getCurrentState(), mismatches, gapOpens, gapExtensions, loBound, hiBound, getScore(), creationNumber);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,393 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa.java;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.alignment.Alignment;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWAAligner;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||||
|
import org.broadinstitute.sting.alignment.reference.bwt.*;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.PriorityQueue;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create imperfect alignments from the read to the genome represented by the given BWT / suffix array.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWAJavaAligner extends BWAAligner {
|
||||||
|
/**
|
||||||
|
* BWT in the forward direction.
|
||||||
|
*/
|
||||||
|
private BWT forwardBWT;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* BWT in the reverse direction.
|
||||||
|
*/
|
||||||
|
private BWT reverseBWT;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Suffix array in the forward direction.
|
||||||
|
*/
|
||||||
|
private SuffixArray forwardSuffixArray;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Suffix array in the reverse direction.
|
||||||
|
*/
|
||||||
|
private SuffixArray reverseSuffixArray;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maximum edit distance (-n option from original BWA).
|
||||||
|
*/
|
||||||
|
private final int MAXIMUM_EDIT_DISTANCE = 4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maximum number of gap opens (-o option from original BWA).
|
||||||
|
*/
|
||||||
|
private final int MAXIMUM_GAP_OPENS = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maximum number of gap extensions (-e option from original BWA).
|
||||||
|
*/
|
||||||
|
private final int MAXIMUM_GAP_EXTENSIONS = 6;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Penalty for straight mismatches (-M option from original BWA).
|
||||||
|
*/
|
||||||
|
public final int MISMATCH_PENALTY = 3;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Penalty for gap opens (-O option from original BWA).
|
||||||
|
*/
|
||||||
|
public final int GAP_OPEN_PENALTY = 11;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Penalty for gap extensions (-E option from original BWA).
|
||||||
|
*/
|
||||||
|
public final int GAP_EXTENSION_PENALTY = 4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Skip the ends of indels.
|
||||||
|
*/
|
||||||
|
public final int INDEL_END_SKIP = 5;
|
||||||
|
|
||||||
|
public BWAJavaAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) {
|
||||||
|
super(null,null);
|
||||||
|
forwardBWT = new BWTReader(forwardBWTFile).read();
|
||||||
|
reverseBWT = new BWTReader(reverseBWTFile).read();
|
||||||
|
forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read();
|
||||||
|
reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close this instance of the BWA pointer and delete its resources.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
throw new UnsupportedOperationException("BWA aligner can't currently be closed.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update the current parameters of this aligner.
|
||||||
|
* @param configuration New configuration to set.
|
||||||
|
*/
|
||||||
|
public void updateConfiguration(BWAConfiguration configuration) {
|
||||||
|
throw new UnsupportedOperationException("Configuration of the BWA aligner can't currently be changed.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allow the aligner to choose one alignment randomly from the pile of best alignments.
|
||||||
|
* @param bases Bases to align.
|
||||||
|
* @return An align
|
||||||
|
*/
|
||||||
|
public Alignment getBestAlignment(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Align the read to the reference.
|
||||||
|
* @param read Read to align.
|
||||||
|
* @param header Optional header to drop in place.
|
||||||
|
* @return A list of the alignments.
|
||||||
|
*/
|
||||||
|
public SAMRecord align(final SAMRecord read, final SAMFileHeader header) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a iterator of alignments, batched by mapping quality.
|
||||||
|
* @param bases List of bases.
|
||||||
|
* @return Iterator to alignments.
|
||||||
|
*/
|
||||||
|
public Iterable<Alignment[]> getAllAlignments(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a iterator of aligned reads, batched by mapping quality.
|
||||||
|
* @param read Read to align.
|
||||||
|
* @param newHeader Optional new header to use when aligning the read. If present, it must be null.
|
||||||
|
* @return Iterator to alignments.
|
||||||
|
*/
|
||||||
|
public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
|
||||||
|
|
||||||
|
|
||||||
|
public List<Alignment> align( SAMRecord read ) {
|
||||||
|
List<Alignment> successfulMatches = new ArrayList<Alignment>();
|
||||||
|
|
||||||
|
Byte[] uncomplementedBases = normalizeBases(read.getReadBases());
|
||||||
|
Byte[] complementedBases = normalizeBases(Utils.reverse(BaseUtils.simpleReverseComplement(read.getReadBases())));
|
||||||
|
|
||||||
|
List<LowerBound> forwardLowerBounds = LowerBound.create(uncomplementedBases,forwardBWT);
|
||||||
|
List<LowerBound> reverseLowerBounds = LowerBound.create(complementedBases,reverseBWT);
|
||||||
|
|
||||||
|
// Seed the best score with any score that won't overflow on comparison.
|
||||||
|
int bestScore = Integer.MAX_VALUE - MISMATCH_PENALTY;
|
||||||
|
int bestDiff = MAXIMUM_EDIT_DISTANCE+1;
|
||||||
|
int maxDiff = MAXIMUM_EDIT_DISTANCE;
|
||||||
|
|
||||||
|
PriorityQueue<BWAAlignment> alignments = new PriorityQueue<BWAAlignment>();
|
||||||
|
|
||||||
|
// Create a fictional initial alignment, with the position just off the end of the read, and the limits
|
||||||
|
// set as the entire BWT.
|
||||||
|
alignments.add(createSeedAlignment(reverseBWT));
|
||||||
|
alignments.add(createSeedAlignment(forwardBWT));
|
||||||
|
|
||||||
|
while(!alignments.isEmpty()) {
|
||||||
|
BWAAlignment alignment = alignments.remove();
|
||||||
|
|
||||||
|
// From bwtgap.c in the original BWT; if the rank is worse than the best score + the mismatch PENALTY, move on.
|
||||||
|
if( alignment.getScore() > bestScore + MISMATCH_PENALTY )
|
||||||
|
break;
|
||||||
|
|
||||||
|
Byte[] bases = alignment.isNegativeStrand() ? complementedBases : uncomplementedBases;
|
||||||
|
BWT bwt = alignment.isNegativeStrand() ? forwardBWT : reverseBWT;
|
||||||
|
List<LowerBound> lowerBounds = alignment.isNegativeStrand() ? reverseLowerBounds : forwardLowerBounds;
|
||||||
|
|
||||||
|
// if z < D(i) then return {}
|
||||||
|
int mismatches = maxDiff - alignment.getMismatches() - alignment.getGapOpens() - alignment.getGapExtensions();
|
||||||
|
if( alignment.position < lowerBounds.size()-1 && mismatches < lowerBounds.get(alignment.position+1).value )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if(mismatches == 0) {
|
||||||
|
exactMatch(alignment,bases,bwt);
|
||||||
|
if(alignment.loBound > alignment.hiBound)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Found a valid alignment; store it and move on.
|
||||||
|
if(alignment.position >= read.getReadLength()-1) {
|
||||||
|
for(long bwtIndex = alignment.loBound; bwtIndex <= alignment.hiBound; bwtIndex++) {
|
||||||
|
BWAAlignment finalAlignment = alignment.clone();
|
||||||
|
|
||||||
|
if( finalAlignment.isNegativeStrand() )
|
||||||
|
finalAlignment.setAlignmentStart(forwardSuffixArray.get(bwtIndex) + 1);
|
||||||
|
else {
|
||||||
|
int sizeAlongReference = read.getReadLength() -
|
||||||
|
finalAlignment.getNumberOfBasesMatchingState(AlignmentState.INSERTION) +
|
||||||
|
finalAlignment.getNumberOfBasesMatchingState(AlignmentState.DELETION);
|
||||||
|
finalAlignment.setAlignmentStart(reverseBWT.length() - reverseSuffixArray.get(bwtIndex) - sizeAlongReference + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
successfulMatches.add(finalAlignment);
|
||||||
|
|
||||||
|
bestScore = Math.min(finalAlignment.getScore(),bestScore);
|
||||||
|
bestDiff = Math.min(finalAlignment.getMismatches()+finalAlignment.getGapOpens()+finalAlignment.getGapExtensions(),bestDiff);
|
||||||
|
maxDiff = bestDiff + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : "");
|
||||||
|
/*
|
||||||
|
System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(),
|
||||||
|
alignment.negativeStrand?1:0,
|
||||||
|
bases.length-alignment.position-1,
|
||||||
|
alignment.getCurrentState().toString().charAt(0),
|
||||||
|
alignment.getMismatches(),
|
||||||
|
alignment.getGapOpens(),
|
||||||
|
alignment.getGapExtensions(),
|
||||||
|
lowerBounds.get(alignment.position+1).value,
|
||||||
|
lowerBounds.get(alignment.position+1).width,
|
||||||
|
alignment.loBound,
|
||||||
|
alignment.hiBound);
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Temporary -- look ahead to see if the next alignment is bounded.
|
||||||
|
boolean allowDifferences = mismatches > 0;
|
||||||
|
boolean allowMismatches = mismatches > 0;
|
||||||
|
|
||||||
|
if( allowDifferences &&
|
||||||
|
alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() &&
|
||||||
|
read.getReadLength()-1-(alignment.position+1) >= INDEL_END_SKIP+alignment.getGapOpens()+alignment.getGapExtensions() ) {
|
||||||
|
if( alignment.getCurrentState() == AlignmentState.MATCH_MISMATCH ) {
|
||||||
|
if( alignment.getGapOpens() < MAXIMUM_GAP_OPENS ) {
|
||||||
|
// Add a potential insertion extension.
|
||||||
|
BWAAlignment insertionAlignment = createInsertionAlignment(alignment);
|
||||||
|
insertionAlignment.incrementGapOpens();
|
||||||
|
alignments.add(insertionAlignment);
|
||||||
|
|
||||||
|
// Add a potential deletion by marking a deletion and augmenting the position.
|
||||||
|
List<BWAAlignment> deletionAlignments = createDeletionAlignments(bwt,alignment);
|
||||||
|
for( BWAAlignment deletionAlignment: deletionAlignments )
|
||||||
|
deletionAlignment.incrementGapOpens();
|
||||||
|
alignments.addAll(deletionAlignments);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if( alignment.getCurrentState() == AlignmentState.INSERTION ) {
|
||||||
|
if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) {
|
||||||
|
// Add a potential insertion extension.
|
||||||
|
BWAAlignment insertionAlignment = createInsertionAlignment(alignment);
|
||||||
|
insertionAlignment.incrementGapExtensions();
|
||||||
|
alignments.add(insertionAlignment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if( alignment.getCurrentState() == AlignmentState.DELETION ) {
|
||||||
|
if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) {
|
||||||
|
// Add a potential deletion by marking a deletion and augmenting the position.
|
||||||
|
List<BWAAlignment> deletionAlignments = createDeletionAlignments(bwt,alignment);
|
||||||
|
for( BWAAlignment deletionAlignment: deletionAlignments )
|
||||||
|
deletionAlignment.incrementGapExtensions();
|
||||||
|
alignments.addAll(deletionAlignments);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mismatches
|
||||||
|
alignments.addAll(createMatchedAlignments(bwt,alignment,bases,allowDifferences&&allowMismatches));
|
||||||
|
}
|
||||||
|
|
||||||
|
return successfulMatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an seeding alignment to use as a starting point when traversing.
|
||||||
|
* @param bwt source BWT.
|
||||||
|
* @return Seed alignment.
|
||||||
|
*/
|
||||||
|
private BWAAlignment createSeedAlignment(BWT bwt) {
|
||||||
|
BWAAlignment seed = new BWAAlignment(this);
|
||||||
|
seed.setNegativeStrand(bwt == forwardBWT);
|
||||||
|
seed.position = -1;
|
||||||
|
seed.loBound = 0;
|
||||||
|
seed.hiBound = bwt.length();
|
||||||
|
return seed;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new alignments representing direct matches / mismatches.
|
||||||
|
* @param bwt Source BWT with which to work.
|
||||||
|
* @param alignment Alignment for the previous position.
|
||||||
|
* @param bases The bases in the read.
|
||||||
|
* @param allowMismatch Should mismatching bases be allowed?
|
||||||
|
* @return New alignment representing this position if valid; null otherwise.
|
||||||
|
*/
|
||||||
|
private List<BWAAlignment> createMatchedAlignments( BWT bwt, BWAAlignment alignment, Byte[] bases, boolean allowMismatch ) {
|
||||||
|
List<BWAAlignment> newAlignments = new ArrayList<BWAAlignment>();
|
||||||
|
|
||||||
|
List<Byte> baseChoices = new ArrayList<Byte>();
|
||||||
|
Byte thisBase = bases[alignment.position+1];
|
||||||
|
|
||||||
|
if( allowMismatch )
|
||||||
|
baseChoices.addAll(Bases.allOf());
|
||||||
|
else
|
||||||
|
baseChoices.add(thisBase);
|
||||||
|
|
||||||
|
if( thisBase != null ) {
|
||||||
|
// Keep rotating the current base to the last position until we've hit the current base.
|
||||||
|
for( ;; ) {
|
||||||
|
baseChoices.add(baseChoices.remove(0));
|
||||||
|
if( thisBase.equals(baseChoices.get(baseChoices.size()-1)) )
|
||||||
|
break;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for(byte base: baseChoices) {
|
||||||
|
BWAAlignment newAlignment = alignment.clone();
|
||||||
|
|
||||||
|
newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
|
||||||
|
newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
|
||||||
|
|
||||||
|
// If this alignment is valid, skip it.
|
||||||
|
if( newAlignment.loBound > newAlignment.hiBound )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
newAlignment.position++;
|
||||||
|
newAlignment.addState(AlignmentState.MATCH_MISMATCH);
|
||||||
|
if( bases[newAlignment.position] == null || base != bases[newAlignment.position] )
|
||||||
|
newAlignment.incrementMismatches();
|
||||||
|
|
||||||
|
newAlignments.add(newAlignment);
|
||||||
|
}
|
||||||
|
|
||||||
|
return newAlignments;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new alignment representing an insertion at this point in the read.
|
||||||
|
* @param alignment Alignment from which to derive the insertion.
|
||||||
|
* @return New alignment reflecting the insertion.
|
||||||
|
*/
|
||||||
|
private BWAAlignment createInsertionAlignment( BWAAlignment alignment ) {
|
||||||
|
// Add a potential insertion extension.
|
||||||
|
BWAAlignment newAlignment = alignment.clone();
|
||||||
|
newAlignment.position++;
|
||||||
|
newAlignment.addState(AlignmentState.INSERTION);
|
||||||
|
return newAlignment;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create new alignments representing a deletion at this point in the read.
|
||||||
|
* @param bwt source BWT for inferring deletion info.
|
||||||
|
* @param alignment Alignment from which to derive the deletion.
|
||||||
|
* @return New alignments reflecting all possible deletions.
|
||||||
|
*/
|
||||||
|
private List<BWAAlignment> createDeletionAlignments( BWT bwt, BWAAlignment alignment) {
|
||||||
|
List<BWAAlignment> newAlignments = new ArrayList<BWAAlignment>();
|
||||||
|
for(byte base: Bases.instance) {
|
||||||
|
BWAAlignment newAlignment = alignment.clone();
|
||||||
|
|
||||||
|
newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
|
||||||
|
newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
|
||||||
|
|
||||||
|
// If this alignment is valid, skip it.
|
||||||
|
if( newAlignment.loBound > newAlignment.hiBound )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
newAlignment.addState(AlignmentState.DELETION);
|
||||||
|
|
||||||
|
newAlignments.add(newAlignment);
|
||||||
|
}
|
||||||
|
|
||||||
|
return newAlignments;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Exactly match the given alignment against the given BWT.
|
||||||
|
* @param alignment Alignment to match.
|
||||||
|
* @param bases Bases to use.
|
||||||
|
* @param bwt BWT to use.
|
||||||
|
*/
|
||||||
|
private void exactMatch( BWAAlignment alignment, Byte[] bases, BWT bwt ) {
|
||||||
|
while( ++alignment.position < bases.length ) {
|
||||||
|
byte base = bases[alignment.position];
|
||||||
|
alignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
|
||||||
|
alignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
|
||||||
|
if( alignment.loBound > alignment.hiBound )
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Make each base into A/C/G/T or null if unknown.
|
||||||
|
* @param bases Base string to normalize.
|
||||||
|
* @return Array of normalized bases.
|
||||||
|
*/
|
||||||
|
private Byte[] normalizeBases( byte[] bases ) {
|
||||||
|
Byte[] normalBases = new Byte[bases.length];
|
||||||
|
for(int i = 0; i < bases.length; i++)
|
||||||
|
normalBases[i] = Bases.fromASCII(bases[i]);
|
||||||
|
return normalBases;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,88 @@
|
||||||
|
package org.broadinstitute.sting.alignment.bwa.java;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.alignment.reference.bwt.BWT;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* At any point along the given read, what is a good lower bound for the
|
||||||
|
* total number of differences?
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class LowerBound {
|
||||||
|
/**
|
||||||
|
* Lower bound of the suffix array.
|
||||||
|
*/
|
||||||
|
public final long loIndex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Upper bound of the suffix array.
|
||||||
|
*/
|
||||||
|
public final long hiIndex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Width of the bwt from loIndex -> hiIndex, inclusive.
|
||||||
|
*/
|
||||||
|
public final long width;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The lower bound at the given point.
|
||||||
|
*/
|
||||||
|
public final int value;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new lower bound with the given value.
|
||||||
|
* @param loIndex The lower bound of the BWT.
|
||||||
|
* @param hiIndex The upper bound of the BWT.
|
||||||
|
* @param value Value for the lower bound at this site.
|
||||||
|
*/
|
||||||
|
private LowerBound(long loIndex, long hiIndex, int value) {
|
||||||
|
this.loIndex = loIndex;
|
||||||
|
this.hiIndex = hiIndex;
|
||||||
|
this.width = hiIndex - loIndex + 1;
|
||||||
|
this.value = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a non-optimal bound according to the algorithm specified in Figure 3 of the BWA paper.
|
||||||
|
* @param bases Bases of the read to use when creating a new BWT.
|
||||||
|
* @param bwt BWT to check against.
|
||||||
|
* @return A list of lower bounds at every point in the reference.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public static List<LowerBound> create(Byte[] bases, BWT bwt) {
|
||||||
|
List<LowerBound> bounds = new ArrayList<LowerBound>();
|
||||||
|
|
||||||
|
long loIndex = 0, hiIndex = bwt.length();
|
||||||
|
int mismatches = 0;
|
||||||
|
for( int i = bases.length-1; i >= 0; i-- ) {
|
||||||
|
Byte base = bases[i];
|
||||||
|
|
||||||
|
// Ignore non-ACGT bases.
|
||||||
|
if( base != null ) {
|
||||||
|
loIndex = bwt.counts(base) + bwt.occurrences(base,loIndex-1) + 1;
|
||||||
|
hiIndex = bwt.counts(base) + bwt.occurrences(base,hiIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
if( base == null || loIndex > hiIndex ) {
|
||||||
|
loIndex = 0;
|
||||||
|
hiIndex = bwt.length();
|
||||||
|
mismatches++;
|
||||||
|
}
|
||||||
|
bounds.add(0,new LowerBound(loIndex,hiIndex,mismatches));
|
||||||
|
}
|
||||||
|
|
||||||
|
return bounds;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a string representation of this bound.
|
||||||
|
* @return String version of this bound.
|
||||||
|
*/
|
||||||
|
public String toString() {
|
||||||
|
return String.format("LowerBound: w = %d, value = %d",width,value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
/**
|
||||||
|
* Analyses used to validate the correctness and performance the BWA Java bindings.
|
||||||
|
*/
|
||||||
|
package org.broadinstitute.sting.alignment;
|
||||||
|
|
@ -0,0 +1,68 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes .amb files - a file indicating where 'holes' (indeterminant bases)
|
||||||
|
* exist in the contig. Currently, only empty, placeholder AMBs are supported.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class AMBWriter {
|
||||||
|
/**
|
||||||
|
* Number of holes is fixed at zero.
|
||||||
|
*/
|
||||||
|
private static final int NUM_HOLES = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Input stream from which to read BWT data.
|
||||||
|
*/
|
||||||
|
private final PrintStream out;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new ANNWriter targeting the given file.
|
||||||
|
* @param file file into which ANN data should be written.
|
||||||
|
* @throws java.io.IOException if there is a problem opening the output file.
|
||||||
|
*/
|
||||||
|
public AMBWriter(File file) throws IOException {
|
||||||
|
out = new PrintStream(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new ANNWriter targeting the given OutputStream.
|
||||||
|
* @param stream Stream into which ANN data should be written.
|
||||||
|
*/
|
||||||
|
public AMBWriter(OutputStream stream) {
|
||||||
|
out = new PrintStream(stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write the contents of the given dictionary into the AMB file.
|
||||||
|
* Assumes that there are no holes in the dictionary.
|
||||||
|
* @param dictionary Dictionary to write.
|
||||||
|
*/
|
||||||
|
public void writeEmpty(SAMSequenceDictionary dictionary) {
|
||||||
|
long genomeLength = 0L;
|
||||||
|
for(SAMSequenceRecord sequence: dictionary.getSequences())
|
||||||
|
genomeLength += sequence.getSequenceLength();
|
||||||
|
|
||||||
|
int sequences = dictionary.getSequences().size();
|
||||||
|
|
||||||
|
// Write the header
|
||||||
|
out.printf("%d %d %d%n",genomeLength,sequences,NUM_HOLES);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close the given output stream.
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,95 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import net.sf.samtools.SAMSequenceRecord;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes .ann files - an alternate sequence dictionary format
|
||||||
|
* used by BWA/C. For best results, the input sequence dictionary
|
||||||
|
* should be created with Picard's CreateSequenceDictionary.jar,
|
||||||
|
* TRUNCATE_NAMES_AT_WHITESPACE=false.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class ANNWriter {
|
||||||
|
/**
|
||||||
|
* BWA uses a fixed seed of 11, written into every file.
|
||||||
|
*/
|
||||||
|
private static final int BNS_SEED = 11;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A seemingly unused value that appears in every contig in the ANN.
|
||||||
|
*/
|
||||||
|
private static final int GI = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Input stream from which to read BWT data.
|
||||||
|
*/
|
||||||
|
private final PrintStream out;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new ANNWriter targeting the given file.
|
||||||
|
* @param file file into which ANN data should be written.
|
||||||
|
* @throws IOException if there is a problem opening the output file.
|
||||||
|
*/
|
||||||
|
public ANNWriter(File file) throws IOException {
|
||||||
|
out = new PrintStream(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new ANNWriter targeting the given OutputStream.
|
||||||
|
* @param stream Stream into which ANN data should be written.
|
||||||
|
*/
|
||||||
|
public ANNWriter(OutputStream stream) {
|
||||||
|
out = new PrintStream(stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write the contents of the given dictionary into the ANN file.
|
||||||
|
* Assumes that no ambs (blocks of indeterminate base) are present in the dictionary.
|
||||||
|
* @param dictionary Dictionary to write.
|
||||||
|
*/
|
||||||
|
public void write(SAMSequenceDictionary dictionary) {
|
||||||
|
long genomeLength = 0L;
|
||||||
|
for(SAMSequenceRecord sequence: dictionary.getSequences())
|
||||||
|
genomeLength += sequence.getSequenceLength();
|
||||||
|
|
||||||
|
int sequences = dictionary.getSequences().size();
|
||||||
|
|
||||||
|
// Write the header
|
||||||
|
out.printf("%d %d %d%n",genomeLength,sequences,BNS_SEED);
|
||||||
|
|
||||||
|
for(SAMSequenceRecord sequence: dictionary.getSequences()) {
|
||||||
|
String fullSequenceName = sequence.getSequenceName();
|
||||||
|
String trimmedSequenceName = fullSequenceName;
|
||||||
|
String sequenceComment = "(null)";
|
||||||
|
|
||||||
|
long offset = 0;
|
||||||
|
|
||||||
|
// Separate the sequence name from the sequence comment, based on BWA's definition.
|
||||||
|
// BWA's definition appears to accept a zero-length contig name, so mimic that behavior.
|
||||||
|
if(fullSequenceName.indexOf(' ') >= 0) {
|
||||||
|
trimmedSequenceName = fullSequenceName.substring(0,fullSequenceName.indexOf(' '));
|
||||||
|
sequenceComment = fullSequenceName.substring(fullSequenceName.indexOf(' ')+1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write the sequence GI (?), name, and comment.
|
||||||
|
out.printf("%d %s %s%n",GI,trimmedSequenceName,sequenceComment);
|
||||||
|
// Write the sequence offset, length, and ambs (currently fixed at 0).
|
||||||
|
out.printf("%d %d %d%n",offset,sequence.getSequenceLength(),0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close the given output stream.
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,172 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents the Burrows-Wheeler Transform of a reference sequence.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWT {
|
||||||
|
/**
|
||||||
|
* Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
|
||||||
|
* For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
|
||||||
|
*/
|
||||||
|
public static final int SEQUENCE_BLOCK_SIZE = 128;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The inverse SA, used as a placeholder for determining where the special EOL character sits.
|
||||||
|
*/
|
||||||
|
protected final long inverseSA0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cumulative counts for the entire BWT.
|
||||||
|
*/
|
||||||
|
protected final Counts counts;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The individual sequence blocks, modelling how they appear on disk.
|
||||||
|
*/
|
||||||
|
protected final SequenceBlock[] sequenceBlocks;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
|
||||||
|
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||||
|
* @param counts Cumulative count of bases, in A,C,G,T order.
|
||||||
|
* @param sequenceBlocks The full BWT sequence, sans the '$'.
|
||||||
|
*/
|
||||||
|
public BWT( long inverseSA0, Counts counts, SequenceBlock[] sequenceBlocks ) {
|
||||||
|
this.inverseSA0 = inverseSA0;
|
||||||
|
this.counts = counts;
|
||||||
|
this.sequenceBlocks = sequenceBlocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
|
||||||
|
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||||
|
* @param counts Count of bases, in A,C,G,T order.
|
||||||
|
* @param sequence The full BWT sequence, sans the '$'.
|
||||||
|
*/
|
||||||
|
public BWT( long inverseSA0, Counts counts, byte[] sequence ) {
|
||||||
|
this(inverseSA0,counts,generateSequenceBlocks(sequence));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract the full sequence from the list of block.
|
||||||
|
* @return The full BWT string as a byte array.
|
||||||
|
*/
|
||||||
|
public byte[] getSequence() {
|
||||||
|
byte[] sequence = new byte[(int)counts.getTotal()];
|
||||||
|
for( SequenceBlock block: sequenceBlocks )
|
||||||
|
System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength);
|
||||||
|
return sequence;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
|
||||||
|
* @param base The base.
|
||||||
|
* @return Total counts for all bases lexicographically smaller than this base.
|
||||||
|
*/
|
||||||
|
public long counts(byte base) {
|
||||||
|
return counts.getCumulative(base);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
|
||||||
|
* @param base The base.
|
||||||
|
* @param index The position to search within the BWT.
|
||||||
|
* @return Total counts for all bases lexicographically smaller than this base.
|
||||||
|
*/
|
||||||
|
public long occurrences(byte base,long index) {
|
||||||
|
SequenceBlock block = getSequenceBlock(index);
|
||||||
|
int position = getSequencePosition(index);
|
||||||
|
long accumulator = block.occurrences.get(base);
|
||||||
|
for(int i = 0; i <= position; i++) {
|
||||||
|
if(base == block.sequence[i])
|
||||||
|
accumulator++;
|
||||||
|
}
|
||||||
|
return accumulator;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of bases in the BWT as a whole.
|
||||||
|
* @return Number of bases.
|
||||||
|
*/
|
||||||
|
public long length() {
|
||||||
|
return counts.getTotal();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new BWT from the given reference sequence.
|
||||||
|
* @param referenceSequence Sequence from which to derive the BWT.
|
||||||
|
* @return reference sequence-derived BWT.
|
||||||
|
*/
|
||||||
|
public static BWT createFromReferenceSequence(byte[] referenceSequence) {
|
||||||
|
SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
|
||||||
|
|
||||||
|
byte[] bwt = new byte[(int)suffixArray.length()-1];
|
||||||
|
int bwtIndex = 0;
|
||||||
|
for(long suffixArrayIndex = 0; suffixArrayIndex < suffixArray.length(); suffixArrayIndex++) {
|
||||||
|
if(suffixArray.get(suffixArrayIndex) == 0)
|
||||||
|
continue;
|
||||||
|
bwt[bwtIndex++] = referenceSequence[(int)suffixArray.get(suffixArrayIndex)-1];
|
||||||
|
}
|
||||||
|
|
||||||
|
return new BWT(suffixArray.inverseSA0,suffixArray.occurrences,bwt);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the base at a given position in the BWT.
|
||||||
|
* @param index The index to use.
|
||||||
|
* @return The base at that location.
|
||||||
|
*/
|
||||||
|
protected byte getBase(long index) {
|
||||||
|
if(index == inverseSA0)
|
||||||
|
throw new ReviewedStingException(String.format("Base at index %d does not have a text representation",index));
|
||||||
|
|
||||||
|
SequenceBlock block = getSequenceBlock(index);
|
||||||
|
int position = getSequencePosition(index);
|
||||||
|
return block.sequence[position];
|
||||||
|
}
|
||||||
|
|
||||||
|
private SequenceBlock getSequenceBlock(long index) {
|
||||||
|
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
|
||||||
|
if(index > inverseSA0) index--;
|
||||||
|
return sequenceBlocks[(int)(index/SEQUENCE_BLOCK_SIZE)];
|
||||||
|
}
|
||||||
|
|
||||||
|
private int getSequencePosition(long index) {
|
||||||
|
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
|
||||||
|
if(index > inverseSA0) index--;
|
||||||
|
return (int)(index%SEQUENCE_BLOCK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a set of sequence blocks from one long sequence.
|
||||||
|
* @param sequence Sequence from which to derive blocks.
|
||||||
|
* @return Array of sequence blocks containing data from the sequence.
|
||||||
|
*/
|
||||||
|
private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) {
|
||||||
|
Counts occurrences = new Counts();
|
||||||
|
|
||||||
|
int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE);
|
||||||
|
SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks];
|
||||||
|
|
||||||
|
for( int block = 0; block < numSequenceBlocks; block++ ) {
|
||||||
|
int blockStart = block*SEQUENCE_BLOCK_SIZE;
|
||||||
|
int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart);
|
||||||
|
byte[] subsequence = new byte[blockLength];
|
||||||
|
|
||||||
|
System.arraycopy(sequence,blockStart,subsequence,0,blockLength);
|
||||||
|
|
||||||
|
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence);
|
||||||
|
|
||||||
|
for( byte base: subsequence )
|
||||||
|
occurrences.increment(base);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequenceBlocks;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,89 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.BasePackedInputStream;
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
/**
|
||||||
|
* Reads a BWT from a given file.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWTReader {
|
||||||
|
/**
|
||||||
|
* Input stream from which to read BWT data.
|
||||||
|
*/
|
||||||
|
private FileInputStream inputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new BWT reader.
|
||||||
|
* @param inputFile File in which the BWT is stored.
|
||||||
|
*/
|
||||||
|
public BWTReader( File inputFile ) {
|
||||||
|
try {
|
||||||
|
this.inputStream = new FileInputStream(inputFile);
|
||||||
|
}
|
||||||
|
catch( FileNotFoundException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to open input file", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read a BWT from the input stream.
|
||||||
|
* @return The BWT stored in the input stream.
|
||||||
|
*/
|
||||||
|
public BWT read() {
|
||||||
|
UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||||
|
BasePackedInputStream basePackedInputStream = new BasePackedInputStream<Integer>(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||||
|
|
||||||
|
long inverseSA0;
|
||||||
|
long[] count;
|
||||||
|
SequenceBlock[] sequenceBlocks;
|
||||||
|
|
||||||
|
try {
|
||||||
|
inverseSA0 = uintPackedInputStream.read();
|
||||||
|
count = new long[PackUtils.ALPHABET_SIZE];
|
||||||
|
uintPackedInputStream.read(count);
|
||||||
|
|
||||||
|
long bwtSize = count[PackUtils.ALPHABET_SIZE-1];
|
||||||
|
sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)];
|
||||||
|
|
||||||
|
for( int block = 0; block < sequenceBlocks.length; block++ ) {
|
||||||
|
int sequenceStart = block* BWT.SEQUENCE_BLOCK_SIZE;
|
||||||
|
int sequenceLength = (int)Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart);
|
||||||
|
|
||||||
|
long[] occurrences = new long[PackUtils.ALPHABET_SIZE];
|
||||||
|
byte[] bwt = new byte[sequenceLength];
|
||||||
|
|
||||||
|
uintPackedInputStream.read(occurrences);
|
||||||
|
basePackedInputStream.read(bwt);
|
||||||
|
|
||||||
|
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,new Counts(occurrences,false),bwt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch( IOException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new BWT(inverseSA0, new Counts(count,true), sequenceBlocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close the input stream.
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
try {
|
||||||
|
inputStream.close();
|
||||||
|
}
|
||||||
|
catch( IOException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to close input file", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate BWA supplementary files (.ann, .amb) from the command line.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWTSupplementaryFileGenerator {
|
||||||
|
enum SupplementaryFileType { ANN, AMB }
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException {
|
||||||
|
if(args.length < 3)
|
||||||
|
usage("Incorrect number of arguments supplied");
|
||||||
|
|
||||||
|
File fastaFile = new File(args[0]);
|
||||||
|
File outputFile = new File(args[1]);
|
||||||
|
SupplementaryFileType outputType = null;
|
||||||
|
try {
|
||||||
|
outputType = Enum.valueOf(SupplementaryFileType.class,args[2]);
|
||||||
|
}
|
||||||
|
catch(IllegalArgumentException ex) {
|
||||||
|
usage("Invalid output type: " + args[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
ReferenceSequenceFile sequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fastaFile);
|
||||||
|
SAMSequenceDictionary dictionary = sequenceFile.getSequenceDictionary();
|
||||||
|
|
||||||
|
switch(outputType) {
|
||||||
|
case ANN:
|
||||||
|
ANNWriter annWriter = new ANNWriter(outputFile);
|
||||||
|
annWriter.write(dictionary);
|
||||||
|
annWriter.close();
|
||||||
|
break;
|
||||||
|
case AMB:
|
||||||
|
AMBWriter ambWriter = new AMBWriter(outputFile);
|
||||||
|
ambWriter.writeEmpty(dictionary);
|
||||||
|
ambWriter.close();
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
usage("Unsupported output type: " + outputType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Print usage information and exit.
|
||||||
|
*/
|
||||||
|
private static void usage(String message) {
|
||||||
|
System.err.println(message);
|
||||||
|
System.err.println("Usage: BWTSupplementaryFileGenerator <fasta> <output file> <output type>");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,71 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.BasePackedOutputStream;
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes an in-memory BWT to an outputstream.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BWTWriter {
|
||||||
|
/**
|
||||||
|
* Input stream from which to read BWT data.
|
||||||
|
*/
|
||||||
|
private final OutputStream outputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new BWT writer.
|
||||||
|
* @param outputFile File in which the BWT is stored.
|
||||||
|
*/
|
||||||
|
public BWTWriter( File outputFile ) {
|
||||||
|
try {
|
||||||
|
this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
|
||||||
|
}
|
||||||
|
catch( FileNotFoundException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to open output file", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write a BWT to the output stream.
|
||||||
|
* @param bwt Transform to be written to the output stream.
|
||||||
|
*/
|
||||||
|
public void write( BWT bwt ) {
|
||||||
|
UnsignedIntPackedOutputStream intPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
|
||||||
|
BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream<Integer>(Integer.class, outputStream, ByteOrder.LITTLE_ENDIAN);
|
||||||
|
|
||||||
|
try {
|
||||||
|
intPackedOutputStream.write(bwt.inverseSA0);
|
||||||
|
intPackedOutputStream.write(bwt.counts.toArray(true));
|
||||||
|
|
||||||
|
for( SequenceBlock block: bwt.sequenceBlocks ) {
|
||||||
|
intPackedOutputStream.write(block.occurrences.toArray(false));
|
||||||
|
basePackedOutputStream.write(block.sequence);
|
||||||
|
}
|
||||||
|
|
||||||
|
// The last block is the last set of counts in the structure.
|
||||||
|
intPackedOutputStream.write(bwt.counts.toArray(false));
|
||||||
|
}
|
||||||
|
catch( IOException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close the input stream.
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
try {
|
||||||
|
outputStream.close();
|
||||||
|
}
|
||||||
|
catch( IOException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to close input file", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,108 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enhanced enum representation of a base.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class Bases implements Iterable<Byte>
|
||||||
|
{
|
||||||
|
public static byte A = 'A';
|
||||||
|
public static byte C = 'C';
|
||||||
|
public static byte G = 'G';
|
||||||
|
public static byte T = 'T';
|
||||||
|
|
||||||
|
public static final Bases instance = new Bases();
|
||||||
|
|
||||||
|
private static final List<Byte> allBases;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Representation of the base broken down by packed value.
|
||||||
|
*/
|
||||||
|
private static final Map<Integer,Byte> basesByPack = new HashMap<Integer,Byte>();
|
||||||
|
|
||||||
|
static {
|
||||||
|
List<Byte> bases = new ArrayList<Byte>();
|
||||||
|
bases.add(A);
|
||||||
|
bases.add(C);
|
||||||
|
bases.add(G);
|
||||||
|
bases.add(T);
|
||||||
|
allBases = Collections.unmodifiableList(bases);
|
||||||
|
|
||||||
|
for(int i = 0; i < allBases.size(); i++)
|
||||||
|
basesByPack.put(i,allBases.get(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new base with the given ascii representation and
|
||||||
|
* pack value.
|
||||||
|
*/
|
||||||
|
private Bases() {
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return all possible bases.
|
||||||
|
* @return Byte representation of all bases.
|
||||||
|
*/
|
||||||
|
public static Collection<Byte> allOf() {
|
||||||
|
return allBases;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the number of known bases.
|
||||||
|
* @return The number of known bases.
|
||||||
|
*/
|
||||||
|
public static int size() {
|
||||||
|
return allBases.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets an iterator over the total number of known base types.
|
||||||
|
* @return Iterator over all known bases.
|
||||||
|
*/
|
||||||
|
public Iterator<Byte> iterator() {
|
||||||
|
return basesByPack.values().iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the given base from the packed representation.
|
||||||
|
* @param pack Packed representation.
|
||||||
|
* @return base.
|
||||||
|
*/
|
||||||
|
public static byte fromPack( int pack ) { return basesByPack.get(pack); }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert the given base to its packed value.
|
||||||
|
* @param ascii ASCII representation of the base.
|
||||||
|
* @return Packed value.
|
||||||
|
*/
|
||||||
|
public static int toPack( byte ascii )
|
||||||
|
{
|
||||||
|
for( Map.Entry<Integer,Byte> entry: basesByPack.entrySet() ) {
|
||||||
|
if( entry.getValue().equals(ascii) )
|
||||||
|
return entry.getKey();
|
||||||
|
}
|
||||||
|
throw new ReviewedStingException(String.format("Base %c is an invalid base to pack", (char)ascii));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert the ASCII representation of a base to its 'normalized' representation.
|
||||||
|
* @param base The base itself.
|
||||||
|
* @return The byte, if present. Null if unknown.
|
||||||
|
*/
|
||||||
|
public static Byte fromASCII( byte base ) {
|
||||||
|
Byte found = null;
|
||||||
|
for( Byte normalized: allBases ) {
|
||||||
|
if( normalized.equals(base) ) {
|
||||||
|
found = normalized;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return found;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,151 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts of how many bases of each type have been seen.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class Counts implements Cloneable {
|
||||||
|
/**
|
||||||
|
* Internal representation of counts, broken down by ASCII value.
|
||||||
|
*/
|
||||||
|
private Map<Byte,Long> counts = new HashMap<Byte,Long>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal representation of cumulative counts, broken down by ASCII value.
|
||||||
|
*/
|
||||||
|
private Map<Byte,Long> cumulativeCounts = new HashMap<Byte,Long>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an empty Counts object with values A=0,C=0,G=0,T=0.
|
||||||
|
*/
|
||||||
|
public Counts()
|
||||||
|
{
|
||||||
|
for(byte base: Bases.instance) {
|
||||||
|
counts.put(base,0L);
|
||||||
|
cumulativeCounts.put(base,0L);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a counts data structure with the given initial values.
|
||||||
|
* @param data Count data, broken down by base.
|
||||||
|
* @param cumulative Whether the counts are cumulative, (count_G=numA+numC+numG,for example).
|
||||||
|
*/
|
||||||
|
public Counts( long[] data, boolean cumulative ) {
|
||||||
|
if(cumulative) {
|
||||||
|
long priorCount = 0;
|
||||||
|
for(byte base: Bases.instance) {
|
||||||
|
long count = data[Bases.toPack(base)];
|
||||||
|
counts.put(base,count-priorCount);
|
||||||
|
cumulativeCounts.put(base,priorCount);
|
||||||
|
priorCount = count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
long priorCount = 0;
|
||||||
|
for(byte base: Bases.instance) {
|
||||||
|
long count = data[Bases.toPack(base)];
|
||||||
|
counts.put(base,count);
|
||||||
|
cumulativeCounts.put(base,priorCount);
|
||||||
|
priorCount += count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert to an array for persistence.
|
||||||
|
* @param cumulative Use a cumulative representation.
|
||||||
|
* @return Array of count values.
|
||||||
|
*/
|
||||||
|
public long[] toArray(boolean cumulative) {
|
||||||
|
long[] countArray = new long[counts.size()];
|
||||||
|
if(cumulative) {
|
||||||
|
int index = 0;
|
||||||
|
boolean first = true;
|
||||||
|
for(byte base: Bases.instance) {
|
||||||
|
if(first) {
|
||||||
|
first = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
countArray[index++] = getCumulative(base);
|
||||||
|
}
|
||||||
|
countArray[countArray.length-1] = getTotal();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int index = 0;
|
||||||
|
for(byte base: Bases.instance)
|
||||||
|
countArray[index++] = counts.get(base);
|
||||||
|
}
|
||||||
|
return countArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a unique copy of the current object.
|
||||||
|
* @return A duplicate of this object.
|
||||||
|
*/
|
||||||
|
public Counts clone() {
|
||||||
|
Counts other;
|
||||||
|
try {
|
||||||
|
other = (Counts)super.clone();
|
||||||
|
}
|
||||||
|
catch(CloneNotSupportedException ex) {
|
||||||
|
throw new ReviewedStingException("Unable to clone counts object", ex);
|
||||||
|
}
|
||||||
|
other.counts = new HashMap<Byte,Long>(counts);
|
||||||
|
other.cumulativeCounts = new HashMap<Byte,Long>(cumulativeCounts);
|
||||||
|
return other;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Increment the number of bases seen at the given location.
|
||||||
|
* @param base Base to increment.
|
||||||
|
*/
|
||||||
|
public void increment(byte base) {
|
||||||
|
counts.put(base,counts.get(base)+1);
|
||||||
|
boolean increment = false;
|
||||||
|
for(byte cumulative: Bases.instance) {
|
||||||
|
if(increment) cumulativeCounts.put(cumulative,cumulativeCounts.get(cumulative)+1);
|
||||||
|
increment |= (cumulative == base);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a count of the number of bases seen at a given location.
|
||||||
|
* Note that counts in this case are not cumulative (counts for A,C,G,T
|
||||||
|
* are independent).
|
||||||
|
* @param base Base for which to query counts.
|
||||||
|
* @return Number of bases of this type seen.
|
||||||
|
*/
|
||||||
|
public long get(byte base) {
|
||||||
|
return counts.get(base);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a count of the number of bases seen before this base.
|
||||||
|
* Note that counts in this case are cumulative.
|
||||||
|
* @param base Base for which to query counts.
|
||||||
|
* @return Number of bases of this type seen.
|
||||||
|
*/
|
||||||
|
public long getCumulative(byte base) {
|
||||||
|
return cumulativeCounts.get(base);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many total bases are represented by this count structure?
|
||||||
|
* @return Total bases represented.
|
||||||
|
*/
|
||||||
|
public long getTotal() {
|
||||||
|
int accumulator = 0;
|
||||||
|
for(byte base: Bases.instance) {
|
||||||
|
accumulator += get(base);
|
||||||
|
}
|
||||||
|
return accumulator;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,200 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITHoc THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import net.sf.picard.reference.ReferenceSequence;
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a suffix array data structure.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class CreateBWTFromReference {
|
||||||
|
private byte[] loadReference( File inputFile ) {
|
||||||
|
// Read in the first sequence in the input file
|
||||||
|
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
|
||||||
|
ReferenceSequence sequence = reference.nextSequence();
|
||||||
|
return sequence.getBases();
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] loadReverseReference( File inputFile ) {
|
||||||
|
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
|
||||||
|
ReferenceSequence sequence = reference.nextSequence();
|
||||||
|
PackUtils.reverse(sequence.getBases());
|
||||||
|
return sequence.getBases();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Counts countOccurrences( byte[] sequence ) {
|
||||||
|
Counts occurrences = new Counts();
|
||||||
|
for( byte base: sequence )
|
||||||
|
occurrences.increment(base);
|
||||||
|
return occurrences;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long[] createSuffixArray( byte[] sequence ) {
|
||||||
|
return SuffixArray.createFromReferenceSequence(sequence).sequence;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long[] invertSuffixArray( long[] suffixArray ) {
|
||||||
|
long[] inverseSuffixArray = new long[suffixArray.length];
|
||||||
|
for( int i = 0; i < suffixArray.length; i++ )
|
||||||
|
inverseSuffixArray[(int)suffixArray[i]] = i;
|
||||||
|
return inverseSuffixArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long[] createCompressedSuffixArray( int[] suffixArray, int[] inverseSuffixArray ) {
|
||||||
|
long[] compressedSuffixArray = new long[suffixArray.length];
|
||||||
|
compressedSuffixArray[0] = inverseSuffixArray[0];
|
||||||
|
for( int i = 1; i < suffixArray.length; i++ )
|
||||||
|
compressedSuffixArray[i] = inverseSuffixArray[suffixArray[i]+1];
|
||||||
|
return compressedSuffixArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long[] createInversedCompressedSuffixArray( int[] compressedSuffixArray ) {
|
||||||
|
long[] inverseCompressedSuffixArray = new long[compressedSuffixArray.length];
|
||||||
|
for( int i = 0; i < compressedSuffixArray.length; i++ )
|
||||||
|
inverseCompressedSuffixArray[compressedSuffixArray[i]] = i;
|
||||||
|
return inverseCompressedSuffixArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main( String argv[] ) throws IOException {
|
||||||
|
if( argv.length != 5 ) {
|
||||||
|
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output rbwt> <output sa> <output rsa>");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String inputFileName = argv[0];
|
||||||
|
File inputFile = new File(inputFileName);
|
||||||
|
|
||||||
|
String bwtFileName = argv[1];
|
||||||
|
File bwtFile = new File(bwtFileName);
|
||||||
|
|
||||||
|
String rbwtFileName = argv[2];
|
||||||
|
File rbwtFile = new File(rbwtFileName);
|
||||||
|
|
||||||
|
String saFileName = argv[3];
|
||||||
|
File saFile = new File(saFileName);
|
||||||
|
|
||||||
|
String rsaFileName = argv[4];
|
||||||
|
File rsaFile = new File(rsaFileName);
|
||||||
|
|
||||||
|
CreateBWTFromReference creator = new CreateBWTFromReference();
|
||||||
|
|
||||||
|
byte[] sequence = creator.loadReference(inputFile);
|
||||||
|
byte[] reverseSequence = creator.loadReverseReference(inputFile);
|
||||||
|
|
||||||
|
// Count the occurences of each given base.
|
||||||
|
Counts occurrences = creator.countOccurrences(sequence);
|
||||||
|
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Bases.A),
|
||||||
|
occurrences.getCumulative(Bases.C),
|
||||||
|
occurrences.getCumulative(Bases.G),
|
||||||
|
occurrences.getCumulative(Bases.T));
|
||||||
|
|
||||||
|
// Generate the suffix array and print diagnostics.
|
||||||
|
long[] suffixArrayData = creator.createSuffixArray(sequence);
|
||||||
|
long[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence);
|
||||||
|
|
||||||
|
// Invert the suffix array and print diagnostics.
|
||||||
|
long[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
|
||||||
|
long[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData);
|
||||||
|
|
||||||
|
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
|
||||||
|
SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData );
|
||||||
|
|
||||||
|
/*
|
||||||
|
// Create the data structure for the compressed suffix array and print diagnostics.
|
||||||
|
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
|
||||||
|
int reconstructedInverseSA = compressedSuffixArray[0];
|
||||||
|
for( int i = 0; i < 8; i++ ) {
|
||||||
|
System.out.printf("compressedSuffixArray[%d] = %d (SA-1[%d] = %d)%n", i, compressedSuffixArray[i], i, reconstructedInverseSA);
|
||||||
|
reconstructedInverseSA = compressedSuffixArray[reconstructedInverseSA];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the data structure for the inverse compressed suffix array and print diagnostics.
|
||||||
|
int[] inverseCompressedSuffixArray = creator.createInversedCompressedSuffixArray(compressedSuffixArray);
|
||||||
|
for( int i = 0; i < 8; i++ ) {
|
||||||
|
System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Create the BWT.
|
||||||
|
BWT bwt = BWT.createFromReferenceSequence(sequence);
|
||||||
|
BWT reverseBWT = BWT.createFromReferenceSequence(reverseSequence);
|
||||||
|
|
||||||
|
byte[] bwtSequence = bwt.getSequence();
|
||||||
|
System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length());
|
||||||
|
|
||||||
|
BWTWriter bwtWriter = new BWTWriter(bwtFile);
|
||||||
|
bwtWriter.write(bwt);
|
||||||
|
bwtWriter.close();
|
||||||
|
|
||||||
|
BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile);
|
||||||
|
reverseBWTWriter.write(reverseBWT);
|
||||||
|
reverseBWTWriter.close();
|
||||||
|
|
||||||
|
/*
|
||||||
|
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
|
||||||
|
saWriter.write(suffixArray);
|
||||||
|
saWriter.close();
|
||||||
|
|
||||||
|
SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
|
||||||
|
reverseSAWriter.write(reverseSuffixArray);
|
||||||
|
reverseSAWriter.close();
|
||||||
|
*/
|
||||||
|
|
||||||
|
File existingBWTFile = new File(inputFileName+".bwt");
|
||||||
|
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
|
||||||
|
BWT existingBWT = existingBWTReader.read();
|
||||||
|
|
||||||
|
byte[] existingBWTSequence = existingBWT.getSequence();
|
||||||
|
System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,80),existingBWT.length());
|
||||||
|
|
||||||
|
for( int i = 0; i < bwt.length(); i++ ) {
|
||||||
|
if( bwtSequence[i] != existingBWTSequence[i] )
|
||||||
|
throw new ReviewedStingException("BWT mismatch at " + i);
|
||||||
|
}
|
||||||
|
|
||||||
|
File existingSAFile = new File(inputFileName+".sa");
|
||||||
|
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT);
|
||||||
|
SuffixArray existingSuffixArray = existingSuffixArrayReader.read();
|
||||||
|
|
||||||
|
for(int i = 0; i < suffixArray.length(); i++) {
|
||||||
|
if( i % 10000 == 0 )
|
||||||
|
System.out.printf("Validating suffix array entry %d%n", i);
|
||||||
|
if( suffixArray.get(i) != existingSuffixArray.get(i) )
|
||||||
|
throw new ReviewedStingException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Models a block of bases within the BWT.
|
||||||
|
*/
|
||||||
|
public class SequenceBlock {
|
||||||
|
/**
|
||||||
|
* Start position of this sequence within the BWT.
|
||||||
|
*/
|
||||||
|
public final int sequenceStart;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Length of this sequence within the BWT.
|
||||||
|
*/
|
||||||
|
public final int sequenceLength;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Occurrences of each letter up to this sequence block.
|
||||||
|
*/
|
||||||
|
public final Counts occurrences;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sequence for this segment.
|
||||||
|
*/
|
||||||
|
public final byte[] sequence;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new block within this BWT.
|
||||||
|
* @param sequenceStart Starting position of this sequence within the BWT.
|
||||||
|
* @param sequenceLength Length of this sequence.
|
||||||
|
* @param occurrences How many of each base has been seen before this sequence began.
|
||||||
|
* @param sequence The actual sequence from the BWT.
|
||||||
|
*/
|
||||||
|
public SequenceBlock( int sequenceStart, int sequenceLength, Counts occurrences, byte[] sequence ) {
|
||||||
|
this.sequenceStart = sequenceStart;
|
||||||
|
this.sequenceLength = sequenceLength;
|
||||||
|
this.occurrences = occurrences;
|
||||||
|
this.sequence = sequence;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,158 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import net.sf.samtools.util.StringUtil;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An in-memory representation of a suffix array.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class SuffixArray {
|
||||||
|
public final long inverseSA0;
|
||||||
|
public final Counts occurrences;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The elements of the sequence actually stored in memory.
|
||||||
|
*/
|
||||||
|
protected final long[] sequence;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How often are individual elements in the sequence actually stored
|
||||||
|
* in memory, as opposed to being calculated on the fly?
|
||||||
|
*/
|
||||||
|
protected final int sequenceInterval;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The BWT used to calculate missing portions of the sequence.
|
||||||
|
*/
|
||||||
|
protected final BWT bwt;
|
||||||
|
|
||||||
|
public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence) {
|
||||||
|
this(inverseSA0,occurrences,sequence,1,null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new sequence array with the given inverse SA, occurrences, and values.
|
||||||
|
* @param inverseSA0 Inverse SA entry for the first element.
|
||||||
|
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
|
||||||
|
* @param sequence The full suffix array.
|
||||||
|
* @param sequenceInterval How frequently is the sequence interval stored.
|
||||||
|
* @param bwt bwt used to infer the remaining entries in the BWT.
|
||||||
|
*/
|
||||||
|
public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence, int sequenceInterval, BWT bwt) {
|
||||||
|
this.inverseSA0 = inverseSA0;
|
||||||
|
this.occurrences = occurrences;
|
||||||
|
this.sequence = sequence;
|
||||||
|
this.sequenceInterval = sequenceInterval;
|
||||||
|
this.bwt = bwt;
|
||||||
|
|
||||||
|
if(sequenceInterval != 1 && bwt == null)
|
||||||
|
throw new ReviewedStingException("A BWT must be provided if the sequence interval is not 1");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the length of the sequence array.
|
||||||
|
* @return Length of the suffix array.
|
||||||
|
*/
|
||||||
|
public long length() {
|
||||||
|
if( bwt != null )
|
||||||
|
return bwt.length()+1;
|
||||||
|
else
|
||||||
|
return sequence.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the suffix array value at a given sequence.
|
||||||
|
* @param index Index at which to retrieve the suffix array vaule.
|
||||||
|
* @return The suffix array value at that entry.
|
||||||
|
*/
|
||||||
|
public long get(long index) {
|
||||||
|
int iterations = 0;
|
||||||
|
while(index%sequenceInterval != 0) {
|
||||||
|
// The inverseSA0 ('$') doesn't have a usable ASCII representation; it must be treated as a special case.
|
||||||
|
if(index == inverseSA0)
|
||||||
|
index = 0;
|
||||||
|
else {
|
||||||
|
byte base = bwt.getBase(index);
|
||||||
|
index = bwt.counts(base) + bwt.occurrences(base,index);
|
||||||
|
}
|
||||||
|
iterations++;
|
||||||
|
}
|
||||||
|
return (sequence[(int)(index/sequenceInterval)]+iterations) % length();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a suffix array from a given reference sequence.
|
||||||
|
* @param sequence The reference sequence to use when building the suffix array.
|
||||||
|
* @return a constructed suffix array.
|
||||||
|
*/
|
||||||
|
public static SuffixArray createFromReferenceSequence(byte[] sequence) {
|
||||||
|
// The builder for the suffix array. Use an integer in this case because
|
||||||
|
// Java arrays can only hold an integer.
|
||||||
|
TreeSet<Integer> suffixArrayBuilder = new TreeSet<Integer>(new SuffixArrayComparator(sequence));
|
||||||
|
|
||||||
|
Counts occurrences = new Counts();
|
||||||
|
for( byte base: sequence )
|
||||||
|
occurrences.increment(base);
|
||||||
|
|
||||||
|
// Build out the suffix array using a custom comparator.
|
||||||
|
for( int i = 0; i <= sequence.length; i++ )
|
||||||
|
suffixArrayBuilder.add(i);
|
||||||
|
|
||||||
|
// Copy the suffix array into an array.
|
||||||
|
long[] suffixArray = new long[suffixArrayBuilder.size()];
|
||||||
|
int i = 0;
|
||||||
|
for( Integer element: suffixArrayBuilder )
|
||||||
|
suffixArray[i++] = element;
|
||||||
|
|
||||||
|
// Find the first element in the inverse suffix array.
|
||||||
|
long inverseSA0 = -1;
|
||||||
|
for(i = 0; i < suffixArray.length; i++) {
|
||||||
|
if(suffixArray[i] == 0)
|
||||||
|
inverseSA0 = i;
|
||||||
|
}
|
||||||
|
if(inverseSA0 < 0)
|
||||||
|
throw new ReviewedStingException("Unable to find first inverse SA entry in generated suffix array.");
|
||||||
|
|
||||||
|
return new SuffixArray(inverseSA0,occurrences,suffixArray);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares two suffix arrays of the given sequence. Will return whichever string appears
|
||||||
|
* first in lexicographic order.
|
||||||
|
*/
|
||||||
|
private static class SuffixArrayComparator implements Comparator<Integer> {
|
||||||
|
/**
|
||||||
|
* The data source for all suffix arrays.
|
||||||
|
*/
|
||||||
|
private final String sequence;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new comparator.
|
||||||
|
* @param sequence Reference sequence to use as basis for comparison.
|
||||||
|
*/
|
||||||
|
public SuffixArrayComparator( byte[] sequence ) {
|
||||||
|
// Processing the suffix array tends to be easier as a string.
|
||||||
|
this.sequence = StringUtil.bytesToString(sequence);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare the two given suffix arrays. Criteria for comparison is the lexicographic order of
|
||||||
|
* the two substrings sequence[lhs:], sequence[rhs:].
|
||||||
|
* @param lhs Left-hand side of comparison.
|
||||||
|
* @param rhs Right-hand side of comparison.
|
||||||
|
* @return How the suffix arrays represented by lhs, rhs compare.
|
||||||
|
*/
|
||||||
|
public int compare( Integer lhs, Integer rhs ) {
|
||||||
|
String lhsSuffixArray = sequence.substring(lhs);
|
||||||
|
String rhsSuffixArray = sequence.substring(rhs);
|
||||||
|
return lhsSuffixArray.compareTo(rhsSuffixArray);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A reader for suffix arrays in permanent storage.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class SuffixArrayReader {
|
||||||
|
/**
|
||||||
|
* Input stream from which to read suffix array data.
|
||||||
|
*/
|
||||||
|
private FileInputStream inputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* BWT to use to fill in missing data.
|
||||||
|
*/
|
||||||
|
private BWT bwt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new suffix array reader.
|
||||||
|
* @param inputFile File in which the suffix array is stored.
|
||||||
|
* @param bwt BWT to use when filling in missing data.
|
||||||
|
*/
|
||||||
|
public SuffixArrayReader(File inputFile, BWT bwt) {
|
||||||
|
try {
|
||||||
|
this.inputStream = new FileInputStream(inputFile);
|
||||||
|
this.bwt = bwt;
|
||||||
|
}
|
||||||
|
catch( FileNotFoundException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to open input file", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read a suffix array from the input stream.
|
||||||
|
* @return The suffix array stored in the input stream.
|
||||||
|
*/
|
||||||
|
public SuffixArray read() {
|
||||||
|
UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||||
|
|
||||||
|
long inverseSA0;
|
||||||
|
long[] occurrences;
|
||||||
|
long[] suffixArray;
|
||||||
|
int suffixArrayInterval;
|
||||||
|
|
||||||
|
try {
|
||||||
|
inverseSA0 = uintPackedInputStream.read();
|
||||||
|
occurrences = new long[PackUtils.ALPHABET_SIZE];
|
||||||
|
uintPackedInputStream.read(occurrences);
|
||||||
|
// Throw away the suffix array size in bytes and use the occurrences table directly.
|
||||||
|
suffixArrayInterval = (int)uintPackedInputStream.read();
|
||||||
|
suffixArray = new long[(int)((occurrences[occurrences.length-1]+suffixArrayInterval-1)/suffixArrayInterval)];
|
||||||
|
uintPackedInputStream.read(suffixArray);
|
||||||
|
}
|
||||||
|
catch( IOException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray, suffixArrayInterval, bwt);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close the input stream.
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
try {
|
||||||
|
inputStream.close();
|
||||||
|
}
|
||||||
|
catch( IOException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to close input file", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.bwt;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Javadoc goes here.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class SuffixArrayWriter {
|
||||||
|
/**
|
||||||
|
* Input stream from which to read suffix array data.
|
||||||
|
*/
|
||||||
|
private OutputStream outputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new suffix array reader.
|
||||||
|
* @param outputFile File in which the suffix array is stored.
|
||||||
|
*/
|
||||||
|
public SuffixArrayWriter( File outputFile ) {
|
||||||
|
try {
|
||||||
|
this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
|
||||||
|
}
|
||||||
|
catch( FileNotFoundException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to open input file", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write a suffix array to the output stream.
|
||||||
|
* @param suffixArray suffix array to write.
|
||||||
|
*/
|
||||||
|
public void write(SuffixArray suffixArray) {
|
||||||
|
UnsignedIntPackedOutputStream uintPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
|
||||||
|
|
||||||
|
try {
|
||||||
|
uintPackedOutputStream.write(suffixArray.inverseSA0);
|
||||||
|
uintPackedOutputStream.write(suffixArray.occurrences.toArray(true));
|
||||||
|
// How frequently the suffix array entry is placed.
|
||||||
|
uintPackedOutputStream.write(1);
|
||||||
|
// Length of the suffix array.
|
||||||
|
uintPackedOutputStream.write(suffixArray.length()-1);
|
||||||
|
uintPackedOutputStream.write(suffixArray.sequence,1,suffixArray.sequence.length-1);
|
||||||
|
}
|
||||||
|
catch( IOException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close the input stream.
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
try {
|
||||||
|
outputStream.close();
|
||||||
|
}
|
||||||
|
catch( IOException ex ) {
|
||||||
|
throw new ReviewedStingException("Unable to close input file", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,95 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.packing;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a packed version of the input stream.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BasePackedInputStream<T> {
|
||||||
|
/**
|
||||||
|
* Type of object to unpack.
|
||||||
|
*/
|
||||||
|
private final Class<T> type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ultimate source for packed bases.
|
||||||
|
*/
|
||||||
|
private final FileInputStream targetInputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Channel source for packed bases.
|
||||||
|
*/
|
||||||
|
private final FileChannel targetInputChannel;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A fixed-size buffer for word-packed data.
|
||||||
|
*/
|
||||||
|
private final ByteOrder byteOrder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bases are in a given packed word.
|
||||||
|
*/
|
||||||
|
private final int basesPerPackedWord = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bytes in an integer?
|
||||||
|
*/
|
||||||
|
private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE;
|
||||||
|
|
||||||
|
|
||||||
|
public BasePackedInputStream( Class<T> type, File inputFile, ByteOrder byteOrder ) throws FileNotFoundException {
|
||||||
|
this(type,new FileInputStream(inputFile),byteOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
public BasePackedInputStream( Class<T> type, FileInputStream inputStream, ByteOrder byteOrder ) {
|
||||||
|
if( type != Integer.class )
|
||||||
|
throw new ReviewedStingException("Only bases packed into 32-bit words are currently supported by this input stream. Type specified: " + type.getName());
|
||||||
|
this.type = type;
|
||||||
|
this.targetInputStream = inputStream;
|
||||||
|
this.targetInputChannel = inputStream.getChannel();
|
||||||
|
this.byteOrder = byteOrder;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the entire contents of the input stream.
|
||||||
|
* @param bwt array into which bases should be read.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void read(byte[] bwt) throws IOException {
|
||||||
|
read(bwt,0,bwt.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the next <code>length</code> bases into the bwt array, starting at the given offset.
|
||||||
|
* @param bwt array holding the given data.
|
||||||
|
* @param offset target position in the bases array into which bytes should be written.
|
||||||
|
* @param length number of bases to read from the stream.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void read(byte[] bwt, int offset, int length) throws IOException {
|
||||||
|
int bufferWidth = ((bwt.length+basesPerPackedWord-1)/basesPerPackedWord)*bytesPerInteger;
|
||||||
|
ByteBuffer buffer = ByteBuffer.allocate(bufferWidth).order(byteOrder);
|
||||||
|
targetInputChannel.read(buffer);
|
||||||
|
targetInputChannel.position(targetInputChannel.position()+buffer.remaining());
|
||||||
|
buffer.flip();
|
||||||
|
|
||||||
|
int packedWord = 0;
|
||||||
|
int i = 0;
|
||||||
|
while(i < length) {
|
||||||
|
if(i % basesPerPackedWord == 0) packedWord = buffer.getInt();
|
||||||
|
int position = basesPerPackedWord - i%basesPerPackedWord - 1;
|
||||||
|
bwt[offset+i++] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,140 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.packing;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A general-purpose stream for writing packed bases.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class BasePackedOutputStream<T> {
|
||||||
|
/**
|
||||||
|
* Type of object to pack.
|
||||||
|
*/
|
||||||
|
private final Class<T> type;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bases can be stored in the given data structure?
|
||||||
|
*/
|
||||||
|
private final int basesPerType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ultimate target for the packed bases.
|
||||||
|
*/
|
||||||
|
private final OutputStream targetOutputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A fixed-size buffer for word-packed data.
|
||||||
|
*/
|
||||||
|
private final ByteBuffer buffer;
|
||||||
|
|
||||||
|
public BasePackedOutputStream( Class<T> type, File outputFile, ByteOrder byteOrder ) throws FileNotFoundException {
|
||||||
|
this(type,new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write packed bases to the given output stream.
|
||||||
|
* @param type Type of data to pack bases into.
|
||||||
|
* @param outputStream Output stream to which to write packed bases.
|
||||||
|
* @param byteOrder Switch between big endian / little endian when reading / writing files.
|
||||||
|
*/
|
||||||
|
public BasePackedOutputStream( Class<T> type, OutputStream outputStream, ByteOrder byteOrder) {
|
||||||
|
this.targetOutputStream = outputStream;
|
||||||
|
this.type = type;
|
||||||
|
basesPerType = PackUtils.bitsInType(type)/PackUtils.BITS_PER_BASE;
|
||||||
|
this.buffer = ByteBuffer.allocate(basesPerType/PackUtils.ALPHABET_SIZE).order(byteOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes the given base to the output stream. Will write only this base; no packing will be performed.
|
||||||
|
* @param base List of bases to write.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void write( int base ) throws IOException {
|
||||||
|
write( new byte[] { (byte)base } );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes an array of bases to the target output stream.
|
||||||
|
* @param bases List of bases to write.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void write( byte[] bases ) throws IOException {
|
||||||
|
write(bases,0,bases.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes a subset of the array of bases to the output stream.
|
||||||
|
* @param bases List of bases to write.
|
||||||
|
* @param offset site at which to start writing.
|
||||||
|
* @param length number of bases to write.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void write( byte[] bases, int offset, int length ) throws IOException {
|
||||||
|
int packedBases = 0;
|
||||||
|
int positionInPack = 0;
|
||||||
|
|
||||||
|
for( int base = offset; base < offset+length; base++ ) {
|
||||||
|
packedBases = packBase(bases[base], packedBases, positionInPack);
|
||||||
|
|
||||||
|
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
|
||||||
|
positionInPack = ++positionInPack % basesPerType;
|
||||||
|
if( positionInPack == 0 ) {
|
||||||
|
writePackedBases(packedBases);
|
||||||
|
packedBases = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( positionInPack > 0 )
|
||||||
|
writePackedBases(packedBases);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Flush the contents of the OutputStream to disk.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void flush() throws IOException {
|
||||||
|
targetOutputStream.flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes the given output stream.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void close() throws IOException {
|
||||||
|
targetOutputStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pack the given base into the basepack.
|
||||||
|
* @param base The base to pack.
|
||||||
|
* @param basePack Target for the pack operation.
|
||||||
|
* @param position Position within the pack to which to add the base.
|
||||||
|
* @return The packed integer.
|
||||||
|
*/
|
||||||
|
private int packBase( byte base, int basePack, int position ) {
|
||||||
|
basePack |= (PackUtils.packBase(base) << 2*(basesPerType-position-1));
|
||||||
|
return basePack;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write the given packed base structure to the output file.
|
||||||
|
* @param packedBases Packed bases to write.
|
||||||
|
* @throws IOException on error writing to the file.
|
||||||
|
*/
|
||||||
|
private void writePackedBases(int packedBases) throws IOException {
|
||||||
|
buffer.rewind();
|
||||||
|
if( type == Integer.class )
|
||||||
|
buffer.putInt(packedBases);
|
||||||
|
else if( type == Byte.class )
|
||||||
|
buffer.put((byte)packedBases);
|
||||||
|
else
|
||||||
|
throw new ReviewedStingException("Cannot pack bases into type " + type.getName());
|
||||||
|
targetOutputStream.write(buffer.array());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,64 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.alignment.reference.packing;
|
||||||
|
|
||||||
|
import net.sf.picard.reference.ReferenceSequence;
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a .PAC file from a given reference.
|
||||||
|
*
|
||||||
|
* @author hanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class CreatePACFromReference {
|
||||||
|
public static void main( String argv[] ) throws IOException {
|
||||||
|
if( argv.length != 3 ) {
|
||||||
|
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output pac> <output rpac>");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read in the first sequence in the input file
|
||||||
|
String inputFileName = argv[0];
|
||||||
|
File inputFile = new File(inputFileName);
|
||||||
|
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
|
||||||
|
ReferenceSequence sequence = reference.nextSequence();
|
||||||
|
|
||||||
|
// Target file for output
|
||||||
|
PackUtils.writeReferenceSequence( new File(argv[1]), sequence.getBases() );
|
||||||
|
|
||||||
|
// Reverse the bases in the reference
|
||||||
|
PackUtils.reverse(sequence.getBases());
|
||||||
|
|
||||||
|
// Target file for output
|
||||||
|
PackUtils.writeReferenceSequence( new File(argv[2]), sequence.getBases() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,135 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.packing;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utilities designed for packing / unpacking bases.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class PackUtils {
|
||||||
|
/**
|
||||||
|
* How many possible bases can be encoded?
|
||||||
|
*/
|
||||||
|
public static final int ALPHABET_SIZE = 4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bits does it take to store a single base?
|
||||||
|
*/
|
||||||
|
public static final int BITS_PER_BASE = (int)(Math.log(ALPHABET_SIZE)/Math.log(2));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bits fit into a single byte?
|
||||||
|
*/
|
||||||
|
public static final int BITS_PER_BYTE = 8;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes a reference sequence to a PAC file.
|
||||||
|
* @param outputFile Filename for the PAC file.
|
||||||
|
* @param referenceSequence Reference sequence to write.
|
||||||
|
* @throws IOException If there's a problem writing to the output file.
|
||||||
|
*/
|
||||||
|
public static void writeReferenceSequence( File outputFile, byte[] referenceSequence ) throws IOException {
|
||||||
|
OutputStream outputStream = new FileOutputStream(outputFile);
|
||||||
|
|
||||||
|
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
|
||||||
|
basePackedOutputStream.write(referenceSequence);
|
||||||
|
|
||||||
|
outputStream.write(referenceSequence.length%PackUtils.ALPHABET_SIZE);
|
||||||
|
|
||||||
|
outputStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bits can a given type hold?
|
||||||
|
* @param type Type to test.
|
||||||
|
* @return Number of bits that the given type can hold.
|
||||||
|
*/
|
||||||
|
public static int bitsInType( Class<?> type ) {
|
||||||
|
try {
|
||||||
|
long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1;
|
||||||
|
long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1;
|
||||||
|
if( typeSize > intTypeSize )
|
||||||
|
throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName());
|
||||||
|
return (int)(Math.log(typeSize)/Math.log(2));
|
||||||
|
}
|
||||||
|
catch( NoSuchFieldException ex ) {
|
||||||
|
throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex);
|
||||||
|
}
|
||||||
|
catch( IllegalAccessException ex ) {
|
||||||
|
throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b.
|
||||||
|
* @param base ASCII value for the base to pack.
|
||||||
|
* @return A byte from 0-3 indicating the base's packed value.
|
||||||
|
*/
|
||||||
|
public static byte packBase(byte base) {
|
||||||
|
switch( base ) {
|
||||||
|
case 'A':
|
||||||
|
return 0;
|
||||||
|
case 'C':
|
||||||
|
return 1;
|
||||||
|
case 'G':
|
||||||
|
return 2;
|
||||||
|
case 'T':
|
||||||
|
return 3;
|
||||||
|
default:
|
||||||
|
throw new ReviewedStingException("Unknown base type: " + base);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a two-bit representation of a base into an ASCII representation of a base.
|
||||||
|
* @param pack Byte from 0-3 indicating which base is represented.
|
||||||
|
* @return An ASCII value representing the packed base.
|
||||||
|
*/
|
||||||
|
public static byte unpackBase(byte pack) {
|
||||||
|
switch( pack ) {
|
||||||
|
case 0:
|
||||||
|
return 'A';
|
||||||
|
case 1:
|
||||||
|
return 'C';
|
||||||
|
case 2:
|
||||||
|
return 'G';
|
||||||
|
case 3:
|
||||||
|
return 'T';
|
||||||
|
default:
|
||||||
|
throw new ReviewedStingException("Unknown pack type: " + pack);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverses an unpacked sequence of bases.
|
||||||
|
* @param bases bases to reverse.
|
||||||
|
*/
|
||||||
|
public static void reverse( byte[] bases ) {
|
||||||
|
for( int i = 0, j = bases.length-1; i < j; i++, j-- ) {
|
||||||
|
byte temp = bases[j];
|
||||||
|
bases[j] = bases[i];
|
||||||
|
bases[i] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a structure of size <code>size</code> that should be split
|
||||||
|
* into <code>partitionSize</code> partitions, how many partitions should
|
||||||
|
* be created? Size of last partition will be <= partitionSize.
|
||||||
|
* @param size Total size of the data structure.
|
||||||
|
* @param partitionSize Size of an individual partition.
|
||||||
|
* @return Number of partitions that would be created.
|
||||||
|
*/
|
||||||
|
public static int numberOfPartitions( long size, long partitionSize ) {
|
||||||
|
return (int)((size+partitionSize-1) / partitionSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,104 @@
|
||||||
|
package org.broadinstitute.sting.alignment.reference.packing;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read a set of integers packed into
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class UnsignedIntPackedInputStream {
|
||||||
|
/**
|
||||||
|
* Ultimate target for the occurrence array.
|
||||||
|
*/
|
||||||
|
private final FileInputStream targetInputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Target channel from which to pull file data.
|
||||||
|
*/
|
||||||
|
private final FileChannel targetInputChannel;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The byte order in which integer input data appears.
|
||||||
|
*/
|
||||||
|
private final ByteOrder byteOrder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many bytes are required to store an integer?
|
||||||
|
*/
|
||||||
|
private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new PackedIntInputStream, writing to the given target file.
|
||||||
|
* @param inputFile target input file.
|
||||||
|
* @param byteOrder Endianness to use when writing a list of integers.
|
||||||
|
* @throws java.io.IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public UnsignedIntPackedInputStream(File inputFile, ByteOrder byteOrder) throws IOException {
|
||||||
|
this(new FileInputStream(inputFile),byteOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read ints from the given InputStream.
|
||||||
|
* @param inputStream Input stream from which to read ints.
|
||||||
|
* @param byteOrder Endianness to use when writing a list of integers.
|
||||||
|
*/
|
||||||
|
public UnsignedIntPackedInputStream(FileInputStream inputStream, ByteOrder byteOrder) {
|
||||||
|
this.targetInputStream = inputStream;
|
||||||
|
this.targetInputChannel = inputStream.getChannel();
|
||||||
|
this.byteOrder = byteOrder;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read a datum from the input stream.
|
||||||
|
* @return The next input datum in the stream.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public long read() throws IOException {
|
||||||
|
long[] data = new long[1];
|
||||||
|
read(data);
|
||||||
|
return data[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the data from the input stream.
|
||||||
|
* @param data placeholder for input data.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void read( long[] data ) throws IOException {
|
||||||
|
read( data, 0, data.length );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the data from the input stream, starting at the given offset.
|
||||||
|
* @param data placeholder for input data.
|
||||||
|
* @param offset place in the array to start reading in data.
|
||||||
|
* @param length number of ints to read in.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void read( long[] data, int offset, int length ) throws IOException {
|
||||||
|
ByteBuffer readBuffer = ByteBuffer.allocate(bytesPerInteger*length).order(byteOrder);
|
||||||
|
|
||||||
|
targetInputChannel.read(readBuffer,targetInputChannel.position());
|
||||||
|
readBuffer.flip();
|
||||||
|
targetInputChannel.position(targetInputChannel.position()+readBuffer.remaining());
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
while(i < length)
|
||||||
|
data[offset+i++] = readBuffer.getInt() & 0xFFFFFFFFL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes the given output stream.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void close() throws IOException {
|
||||||
|
targetInputStream.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,121 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.alignment.reference.packing;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes an list of integers to the output file.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class UnsignedIntPackedOutputStream {
|
||||||
|
/**
|
||||||
|
* Ultimate target for the occurrence array.
|
||||||
|
*/
|
||||||
|
private final OutputStream targetOutputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A fixed-size buffer for int-packed data.
|
||||||
|
*/
|
||||||
|
private final ByteBuffer buffer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new PackedIntOutputStream, writing to the given target file.
|
||||||
|
* @param outputFile target output file.
|
||||||
|
* @param byteOrder Endianness to use when writing a list of integers.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public UnsignedIntPackedOutputStream(File outputFile, ByteOrder byteOrder) throws IOException {
|
||||||
|
this(new FileOutputStream(outputFile),byteOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write packed ints to the given OutputStream.
|
||||||
|
* @param outputStream Output stream to which to write packed ints.
|
||||||
|
* @param byteOrder Endianness to use when writing a list of integers.
|
||||||
|
*/
|
||||||
|
public UnsignedIntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) {
|
||||||
|
this.targetOutputStream = outputStream;
|
||||||
|
buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write the data to the output stream.
|
||||||
|
* @param datum datum to write.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void write( long datum ) throws IOException {
|
||||||
|
buffer.rewind();
|
||||||
|
buffer.putInt((int)datum);
|
||||||
|
targetOutputStream.write(buffer.array());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write the data to the output stream.
|
||||||
|
* @param data data to write. occurrences.length must match alphabet size.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void write( long[] data ) throws IOException {
|
||||||
|
for(long datum: data)
|
||||||
|
write(datum);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write the given chunk of data to the input stream.
|
||||||
|
* @param data data to write.
|
||||||
|
* @param offset position at which to start.
|
||||||
|
* @param length number of ints to write.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void write( long[] data, int offset, int length ) throws IOException {
|
||||||
|
for( int i = offset; i < offset+length; i++ )
|
||||||
|
write(data[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Flush the contents of the OutputStream to disk.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void flush() throws IOException {
|
||||||
|
targetOutputStream.flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes the given output stream.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void close() throws IOException {
|
||||||
|
targetOutputStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -30,10 +30,16 @@ import net.sf.samtools.SAMReadGroupRecord;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
/**
|
/**
|
||||||
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear
|
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear
|
||||||
* in the input file. It can dynamically merge the contents of multiple input BAM files, resulting
|
* in the input file. It can dynamically merge the contents of multiple input BAM files, resulting
|
||||||
|
|
@ -52,6 +58,13 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||||
String platform = null; // E.g. ILLUMINA, 454
|
String platform = null; // E.g. ILLUMINA, 454
|
||||||
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
|
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
|
||||||
int nReadsToPrint = -1;
|
int nReadsToPrint = -1;
|
||||||
|
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
|
||||||
|
public Set<File> sampleFile = new TreeSet<File>();
|
||||||
|
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
|
||||||
|
public Set<String> sampleNames = new TreeSet<String>();
|
||||||
|
|
||||||
|
private TreeSet<String> samplesToChoose = new TreeSet<String>();
|
||||||
|
private boolean SAMPLES_SPECIFIED = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The initialize function.
|
* The initialize function.
|
||||||
|
|
@ -59,6 +72,20 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
if ( platform != null )
|
if ( platform != null )
|
||||||
platform = platform.toUpperCase();
|
platform = platform.toUpperCase();
|
||||||
|
|
||||||
|
Collection<String> samplesFromFile;
|
||||||
|
if (!sampleFile.isEmpty()) {
|
||||||
|
samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile);
|
||||||
|
samplesToChoose.addAll(samplesFromFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!sampleNames.isEmpty())
|
||||||
|
samplesToChoose.addAll(sampleNames);
|
||||||
|
|
||||||
|
if(!samplesToChoose.isEmpty()) {
|
||||||
|
SAMPLES_SPECIFIED = true;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -85,6 +112,14 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||||
if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform))
|
if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (SAMPLES_SPECIFIED ) {
|
||||||
|
// user specified samples to select
|
||||||
|
// todo - should be case-agnostic but for simplicity and speed this is ignored.
|
||||||
|
// todo - can check at initialization intersection of requested samples and samples in BAM header to further speedup.
|
||||||
|
if (!samplesToChoose.contains(read.getReadGroup().getSample()))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// check if we've reached the output limit
|
// check if we've reached the output limit
|
||||||
if ( nReadsToPrint == 0 ) {
|
if ( nReadsToPrint == 0 ) {
|
||||||
|
|
|
||||||
|
|
@ -24,11 +24,27 @@ public class IndelType implements InfoFieldAnnotation, ExperimentalAnnotation {
|
||||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||||
|
|
||||||
int run;
|
int run;
|
||||||
if ( vc.isIndel() && vc.isBiallelic() ) {
|
if (vc.isMixed()) {
|
||||||
|
Map<String, Object> map = new HashMap<String, Object>();
|
||||||
|
map.put(getKeyNames().get(0), String.format("%s", "MIXED"));
|
||||||
|
return map;
|
||||||
|
|
||||||
|
}
|
||||||
|
else if ( vc.isIndel() ) {
|
||||||
String type="";
|
String type="";
|
||||||
ArrayList<Integer> inds = IndelUtils.findEventClassificationIndex(vc, ref);
|
if (!vc.isBiallelic())
|
||||||
for (int k : inds) {
|
type = "MULTIALLELIC_INDEL";
|
||||||
type = type+ IndelUtils.getIndelClassificationName(k)+".";
|
else {
|
||||||
|
if (vc.isInsertion())
|
||||||
|
type = "INS.";
|
||||||
|
else if (vc.isDeletion())
|
||||||
|
type = "DEL.";
|
||||||
|
else
|
||||||
|
type = "OTHER.";
|
||||||
|
ArrayList<Integer> inds = IndelUtils.findEventClassificationIndex(vc, ref);
|
||||||
|
for (int k : inds) {
|
||||||
|
type = type+ IndelUtils.getIndelClassificationName(k)+".";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Map<String, Object> map = new HashMap<String, Object>();
|
Map<String, Object> map = new HashMap<String, Object>();
|
||||||
map.put(getKeyNames().get(0), String.format("%s", type));
|
map.put(getKeyNames().get(0), String.format("%s", type));
|
||||||
|
|
|
||||||
|
|
@ -391,7 +391,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
||||||
if (DEBUG)
|
if (DEBUG)
|
||||||
System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
|
System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
|
||||||
(int)ref.getWindow().size(), loc.getStart(), numPrefBases);
|
(int)ref.getWindow().size(), loc.getStart(), numPrefBases);
|
||||||
|
//System.out.println(eventLength);
|
||||||
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
|
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
|
||||||
ref, hsize, numPrefBases);
|
ref, hsize, numPrefBases);
|
||||||
|
|
||||||
|
|
@ -418,8 +418,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is lexically ordered, for example
|
// which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example
|
||||||
// for 3 alleles it's 00 01 02 11 12 22
|
// for 3 alleles it's 00 01 11 02 12 22
|
||||||
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
|
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
|
||||||
alleleList,
|
alleleList,
|
||||||
genotypeLikelihoods,
|
genotypeLikelihoods,
|
||||||
|
|
|
||||||
|
|
@ -1042,8 +1042,8 @@ public class PairHMMIndelErrorModel {
|
||||||
|
|
||||||
int k=0;
|
int k=0;
|
||||||
double maxElement = Double.NEGATIVE_INFINITY;
|
double maxElement = Double.NEGATIVE_INFINITY;
|
||||||
for (int i=0; i < hSize; i++) {
|
for (int j=0; j < hSize; j++) {
|
||||||
for (int j=i; j < hSize; j++){
|
for (int i=0; i <= j; i++){
|
||||||
genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j];
|
genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j];
|
||||||
if (haplotypeLikehoodMatrix[i][j] > maxElement)
|
if (haplotypeLikehoodMatrix[i][j] > maxElement)
|
||||||
maxElement = haplotypeLikehoodMatrix[i][j];
|
maxElement = haplotypeLikehoodMatrix[i][j];
|
||||||
|
|
|
||||||
|
|
@ -1,50 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.sequenom;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
|
|
||||||
import java.io.PrintStream;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a mask for use with the PickSequenomProbes walker.
|
|
||||||
*/
|
|
||||||
public class CreateSequenomMask extends RodWalker<Integer, Integer> {
|
|
||||||
@Output
|
|
||||||
PrintStream out;
|
|
||||||
|
|
||||||
public void initialize() {}
|
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
|
||||||
if ( tracker == null )
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
int result = 0;
|
|
||||||
for ( VariantContext vc : tracker.getAllVariantContexts(ref) ) {
|
|
||||||
if ( vc.isSNP() ) {
|
|
||||||
GenomeLoc loc = context.getLocation();
|
|
||||||
out.println(loc.getContig() + "\t" + (loc.getStart()-1) + "\t" + loc.getStop());
|
|
||||||
result = 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduceInit() {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer reduce(Integer value, Integer sum) {
|
|
||||||
return value + sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void onTraversalDone(Integer sum) {
|
|
||||||
logger.info("Found " + sum + " masking sites.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,332 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2010 The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.sequenom;
|
|
||||||
|
|
||||||
import org.broad.tribble.bed.BEDCodec;
|
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.PrintStream;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Generates Sequenom probe information given a single variant track. Emitted is the variant
|
|
||||||
* along with the 200 reference bases on each side of the variant.
|
|
||||||
*/
|
|
||||||
@WalkerName("PickSequenomProbes")
|
|
||||||
@Requires(value={DataSource.REFERENCE})
|
|
||||||
@Reference(window=@Window(start=-200,stop=200))
|
|
||||||
public class PickSequenomProbes extends RodWalker<String, String> {
|
|
||||||
@Output
|
|
||||||
PrintStream out;
|
|
||||||
|
|
||||||
@Argument(required=false, shortName="snp_mask", doc="positions to be masked with N's")
|
|
||||||
protected String SNP_MASK = null;
|
|
||||||
@Argument(required=false, shortName="project_id",doc="If specified, all probenames will be prepended with 'project_id|'")
|
|
||||||
String project_id = null;
|
|
||||||
@Argument(required = false, shortName="omitWindow", doc = "If specified, the window appender will be omitted from the design files (e.g. \"_chr:start-stop\")")
|
|
||||||
boolean omitWindow = false;
|
|
||||||
@Argument(required = false, fullName="usePlinkRODNamingConvention", shortName="nameConvention",doc="Use the naming convention defined in PLINKROD")
|
|
||||||
boolean useNamingConvention = false;
|
|
||||||
@Argument(required = false, fullName="noMaskWindow",shortName="nmw",doc="Do not mask bases within X bases of an event when designing probes")
|
|
||||||
int noMaskWindow = 0;
|
|
||||||
@Argument(required = false, shortName="counter", doc = "If specified, unique count id (ordinal number) is added to the end of each assay name")
|
|
||||||
boolean addCounter = false;
|
|
||||||
|
|
||||||
private byte [] maskFlags = new byte[401];
|
|
||||||
|
|
||||||
private LocationAwareSeekableRODIterator snpMaskIterator=null;
|
|
||||||
|
|
||||||
private GenomeLoc positionOfLastVariant = null;
|
|
||||||
|
|
||||||
private int cnt = 0;
|
|
||||||
private int discarded = 0;
|
|
||||||
|
|
||||||
VariantCollection VCs ; // will keep a set of distinct variants at a given site
|
|
||||||
private List<GenomeLoc> processedVariantsInScope = new LinkedList<GenomeLoc>();
|
|
||||||
|
|
||||||
public void initialize() {
|
|
||||||
if ( SNP_MASK != null ) {
|
|
||||||
logger.info("Loading SNP mask... ");
|
|
||||||
ReferenceOrderedData snp_mask;
|
|
||||||
//if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) {
|
|
||||||
RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe);
|
|
||||||
RMDTrack track = builder.createInstanceOfTrack(BEDCodec.class, new File(SNP_MASK));
|
|
||||||
snpMaskIterator = new SeekableRODIterator(track.getHeader(),
|
|
||||||
track.getSequenceDictionary(),
|
|
||||||
getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),
|
|
||||||
getToolkit().getGenomeLocParser(),
|
|
||||||
track.getIterator());
|
|
||||||
//} else {
|
|
||||||
// // TODO: fix me when Plink is back
|
|
||||||
// throw new IllegalArgumentException("We currently do not support other snp_mask tracks (like Plink)");
|
|
||||||
//}
|
|
||||||
|
|
||||||
}
|
|
||||||
VCs = new VariantCollection();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
|
||||||
if ( tracker == null )
|
|
||||||
return "";
|
|
||||||
|
|
||||||
logger.debug("Probing " + ref.getLocus() + " " + ref.getWindow());
|
|
||||||
|
|
||||||
VCs.clear();
|
|
||||||
VCs.addAll( tracker.getAllVariantContexts(ref), ref.getLocus() );
|
|
||||||
|
|
||||||
discarded += VCs.discarded();
|
|
||||||
|
|
||||||
if ( VCs.size() == 0 ) {
|
|
||||||
logger.debug(" Context empty");
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( VCs.size() > 1 ) {
|
|
||||||
logger.debug(" "+VCs.size()+ " variants at the locus");
|
|
||||||
}
|
|
||||||
|
|
||||||
// System.out.print("At locus "+ref.getLocus()+": ");
|
|
||||||
// for ( VariantContext vc : VCs ) {
|
|
||||||
// System.out.println(vc.toString());
|
|
||||||
// }
|
|
||||||
|
|
||||||
// little optimization: since we may have few events at the current site on the reference,
|
|
||||||
// we are going to make sure we compute the mask and ref bases only once for each location and only if we need to
|
|
||||||
boolean haveMaskForWindow = false;
|
|
||||||
boolean haveBasesForWindow = false;
|
|
||||||
String leading_bases = null;
|
|
||||||
String trailing_bases = null;
|
|
||||||
|
|
||||||
StringBuilder assaysForLocus = new StringBuilder(""); // all assays for current locus will be collected here (will be multi-line if multiple events are assayed)
|
|
||||||
|
|
||||||
// get all variant contexts!!!!
|
|
||||||
for ( VariantContext vc : VCs ) {
|
|
||||||
|
|
||||||
// we can only deal with biallelic sites for now
|
|
||||||
if ( !vc.isBiallelic() ) {
|
|
||||||
logger.debug(" Not biallelic; skipped");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// we don't want to see the same multi-base event (deletion, DNP etc) multiple times.
|
|
||||||
// All the vcs we are currently seeing are clearly on the same contig as the current reference
|
|
||||||
// poisiton (or we would not see them at all!). All we need to check is if the vc starts at the
|
|
||||||
// current reference position (i.e. it is the first time we see it) or not (i.e. we saw it already).
|
|
||||||
if ( ref.getLocus().getStart() != vc.getStart() )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if ( ! haveMaskForWindow ) {
|
|
||||||
String contig = context.getLocation().getContig();
|
|
||||||
int offset = context.getLocation().getStart();
|
|
||||||
int true_offset = offset - 200;
|
|
||||||
|
|
||||||
// we have variant; let's load all the snps falling into the current window and prepare the mask array.
|
|
||||||
// we need to do it only once per window, regardless of how many vcs we may have at this location!
|
|
||||||
if ( snpMaskIterator != null ) {
|
|
||||||
// clear the mask
|
|
||||||
for ( int i = 0 ; i < 401; i++ )
|
|
||||||
maskFlags[i] = 0;
|
|
||||||
|
|
||||||
RODRecordList snpList = snpMaskIterator.seekForward(getToolkit().getGenomeLocParser().createGenomeLoc(contig,offset-200,offset+200));
|
|
||||||
if ( snpList != null && snpList.size() != 0 ) {
|
|
||||||
Iterator<GATKFeature> snpsInWindow = snpList.iterator();
|
|
||||||
int i = 0;
|
|
||||||
while ( snpsInWindow.hasNext() ) {
|
|
||||||
GenomeLoc snp = snpsInWindow.next().getLocation();
|
|
||||||
// we don't really want to mask out multi-base indels
|
|
||||||
if ( snp.size() > 1 )
|
|
||||||
continue;
|
|
||||||
logger.debug(" SNP at "+snp.getStart());
|
|
||||||
int offsetInWindow = (int)(snp.getStart() - true_offset);
|
|
||||||
maskFlags[offsetInWindow] = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
haveMaskForWindow = true; // if we use masking, we will probably need to recompute the window...
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( ! haveBasesForWindow ) {
|
|
||||||
byte[] context_bases = ref.getBases();
|
|
||||||
for (int i = 0; i < 401; i++) {
|
|
||||||
if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) {
|
|
||||||
context_bases[i] = 'N';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200));
|
|
||||||
trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401));
|
|
||||||
// masked bases are not gonna change for the current window, unless we use windowed masking;
|
|
||||||
// in the latter case the bases (N's) will depend on the event we are currently looking at,
|
|
||||||
// so we better recompute..
|
|
||||||
if ( noMaskWindow == 0 ) haveBasesForWindow = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// below, build single assay line for the current VC:
|
|
||||||
|
|
||||||
String assay_sequence;
|
|
||||||
if ( vc.isSNP() )
|
|
||||||
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
|
|
||||||
else if ( vc.isMNP() )
|
|
||||||
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1);
|
|
||||||
else if ( vc.isInsertion() )
|
|
||||||
assay_sequence = leading_bases + (char)ref.getBase() + "[-/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
|
|
||||||
else if ( vc.isDeletion() )
|
|
||||||
assay_sequence = leading_bases + (char)ref.getBase() + "[" + new String(vc.getReference().getBases()) + "/-]" + trailing_bases.substring(vc.getReference().length());
|
|
||||||
else
|
|
||||||
continue;
|
|
||||||
|
|
||||||
StringBuilder assay_id = new StringBuilder();
|
|
||||||
if ( project_id != null ) {
|
|
||||||
assay_id.append(project_id);
|
|
||||||
assay_id.append('|');
|
|
||||||
}
|
|
||||||
if ( useNamingConvention ) {
|
|
||||||
assay_id.append('c');
|
|
||||||
assay_id.append(context.getLocation().toString().replace(":","_p"));
|
|
||||||
} else {
|
|
||||||
assay_id.append(context.getLocation().toString().replace(':','_'));
|
|
||||||
}
|
|
||||||
if ( vc.isInsertion() ) assay_id.append("_gI");
|
|
||||||
else if ( vc.isDeletion()) assay_id.append("_gD");
|
|
||||||
|
|
||||||
if ( ! omitWindow ) {
|
|
||||||
assay_id.append("_");
|
|
||||||
assay_id.append(ref.getWindow().toString().replace(':', '_'));
|
|
||||||
}
|
|
||||||
++cnt;
|
|
||||||
if ( addCounter ) assay_id.append("_"+cnt);
|
|
||||||
|
|
||||||
assaysForLocus.append(assay_id);
|
|
||||||
assaysForLocus.append('\t');
|
|
||||||
assaysForLocus.append(assay_sequence);
|
|
||||||
assaysForLocus.append('\n');
|
|
||||||
}
|
|
||||||
return assaysForLocus.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
public String reduceInit() {
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
public String reduce(String data, String sum) {
|
|
||||||
out.print(data);
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
private int getNoMaskWindowRightEnd(VariantContext vc, int window) {
|
|
||||||
if ( window == 0 ) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( vc.isInsertion() ) {
|
|
||||||
return window-1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int max = 0;
|
|
||||||
for (Allele a : vc.getAlleles() ) {
|
|
||||||
if ( vc.isInsertion() ) {
|
|
||||||
logger.debug("Getting length of allele "+a.toString()+" it is "+a.getBases().length+" (ref allele is "+vc.getReference().toString()+")");
|
|
||||||
}
|
|
||||||
if ( a.getBases().length > max ) {
|
|
||||||
max = a.getBases().length;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return max+window-1;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void onTraversalDone(String sum) {
|
|
||||||
logger.info(cnt+" assay seqences generated");
|
|
||||||
logger.info(discarded+" events were found to be duplicates and discarded (no redundant assays generated)");
|
|
||||||
}
|
|
||||||
|
|
||||||
static class EventComparator implements Comparator<VariantContext> {
|
|
||||||
|
|
||||||
public int compare(VariantContext o1, VariantContext o2) {
|
|
||||||
// if variants start at different positions, they are different. All we actually
|
|
||||||
// care about is detecting the variants that are strictly the same; the actual ordering of distinct variants
|
|
||||||
// (which one we deem less and which one greater) is utterly unimportant. We just need to be consistent.
|
|
||||||
if ( o1.getStart() < o2.getStart() ) return -1;
|
|
||||||
if ( o1.getStart() > o2.getStart() ) return 1;
|
|
||||||
|
|
||||||
if ( o1.getType() != o2.getType() ) return o1.getType().compareTo(o2.getType());
|
|
||||||
|
|
||||||
int refComp = o1.getReference().compareTo(o2.getReference());
|
|
||||||
if ( refComp != 0 ) return refComp;
|
|
||||||
|
|
||||||
return o1.getAlternateAllele(0).compareTo(o2.getAlternateAllele(0));
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static class VariantCollection implements Iterable<VariantContext> {
|
|
||||||
TreeSet<VariantContext> variants = new TreeSet(new EventComparator());
|
|
||||||
int discarded = 0;
|
|
||||||
|
|
||||||
public void add(VariantContext vc, GenomeLoc current) {
|
|
||||||
if ( vc.getStart() != current.getStart() ) return; // we add only variants that start at current locus
|
|
||||||
// note that we do not check chr here, since the way this class is used, the mathod is always called with
|
|
||||||
// VCs coming from the same metadata tracker, so they simply can not be on different chrs!
|
|
||||||
if ( !vc.isBiallelic() ) {
|
|
||||||
logger.info(" Non-biallelic variant encountered; skipped");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if ( variants.add(vc) == false ) discarded++;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addAll(Collection<VariantContext> c, GenomeLoc current) {
|
|
||||||
for ( VariantContext vc : c ) add(vc,current);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void clear() {
|
|
||||||
variants.clear();
|
|
||||||
discarded = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int discarded() { return discarded; }
|
|
||||||
|
|
||||||
public int size() { return variants.size(); }
|
|
||||||
|
|
||||||
public Iterator<VariantContext> iterator() { return variants.iterator(); }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -0,0 +1,413 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation;
|
||||||
|
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMSequenceDictionary;
|
||||||
|
import org.broadinstitute.sting.alignment.Alignment;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||||
|
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 6/13/11
|
||||||
|
* Time: 2:12 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
@Requires(value={DataSource.REFERENCE}, referenceMetaData={@RMD(name="ProbeIntervals",type=TableFeature.class),
|
||||||
|
@RMD(name="ValidateAlleles",type=VariantContext.class),@RMD(name="MaskAlleles",type=VariantContext.class)})
|
||||||
|
public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
||||||
|
|
||||||
|
@Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false)
|
||||||
|
boolean lowerCaseSNPs = false;
|
||||||
|
|
||||||
|
@Argument(doc="Size of the virtual primer to use for lower-casing regions with low specificity",fullName="virtualPrimerSize",required=false)
|
||||||
|
int virtualPrimerSize = 20;
|
||||||
|
|
||||||
|
@Argument(doc="Monomorphic sites in the mask file will be treated as filtered",fullName="filterMonomorphic",required=false)
|
||||||
|
boolean filterMonomorphic = false;
|
||||||
|
|
||||||
|
@Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false)
|
||||||
|
boolean doNotUseBWA = false;
|
||||||
|
|
||||||
|
GenomeLoc prevInterval;
|
||||||
|
GenomeLoc allelePos;
|
||||||
|
String probeName;
|
||||||
|
StringBuilder sequence;
|
||||||
|
StringBuilder rawSequence;
|
||||||
|
boolean sequenceInvalid;
|
||||||
|
List<String> invReason;
|
||||||
|
int indelCounter;
|
||||||
|
|
||||||
|
@Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
|
||||||
|
"generated by bwa index -d bwtsw. If unspecified, will default " +
|
||||||
|
"to the reference specified via the -R argument.",required=false)
|
||||||
|
private File targetReferenceFile = null;
|
||||||
|
|
||||||
|
@Output
|
||||||
|
PrintStream out;
|
||||||
|
|
||||||
|
BWACAligner aligner = null;
|
||||||
|
|
||||||
|
private SAMFileHeader header = null;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
if ( ! doNotUseBWA ) {
|
||||||
|
if(targetReferenceFile == null)
|
||||||
|
targetReferenceFile = getToolkit().getArguments().referenceFile;
|
||||||
|
BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
|
||||||
|
BWAConfiguration configuration = new BWAConfiguration();
|
||||||
|
aligner = new BWACAligner(bwtFiles,configuration);
|
||||||
|
header = new SAMFileHeader();
|
||||||
|
SAMSequenceDictionary referenceDictionary =
|
||||||
|
ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
|
||||||
|
header.setSequenceDictionary(referenceDictionary);
|
||||||
|
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduceInit() {
|
||||||
|
prevInterval = null;
|
||||||
|
sequence = null;
|
||||||
|
rawSequence = null;
|
||||||
|
sequenceInvalid = false;
|
||||||
|
probeName = null;
|
||||||
|
invReason = null;
|
||||||
|
indelCounter = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null || ! tracker.hasROD("ProbeIntervals")) { return null; }
|
||||||
|
|
||||||
|
GenomeLoc interval = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getLocation();
|
||||||
|
//logger.debug(interval);
|
||||||
|
if ( prevInterval == null || ! interval.equals(prevInterval) ) {
|
||||||
|
// we're in a new interval, we should:
|
||||||
|
// 1) print out previous data
|
||||||
|
// 2) reset internal data
|
||||||
|
// 3) instantiate traversal of this interval
|
||||||
|
|
||||||
|
// step 1:
|
||||||
|
if ( prevInterval != null ) {
|
||||||
|
// there was a previous interval
|
||||||
|
validateSequence(); // ensure the sequence in the region is valid
|
||||||
|
// next line removed in favor of the one after
|
||||||
|
if ( doNotUseBWA ) {
|
||||||
|
lowerRepeats(); // change repeats in sequence to lower case
|
||||||
|
} else {
|
||||||
|
lowerNonUniqueSegments();
|
||||||
|
}
|
||||||
|
print(); // print out the fasta sequence
|
||||||
|
}
|
||||||
|
|
||||||
|
// step 2:
|
||||||
|
prevInterval = interval;
|
||||||
|
allelePos = null;
|
||||||
|
sequence = new StringBuilder();
|
||||||
|
rawSequence = new StringBuilder();
|
||||||
|
sequenceInvalid = false;
|
||||||
|
invReason = new LinkedList<String>();
|
||||||
|
logger.debug(Utils.join("\t",((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues()));
|
||||||
|
probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getValue(1);
|
||||||
|
indelCounter = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// step 3 (or 1 if not new):
|
||||||
|
// build up the sequence
|
||||||
|
|
||||||
|
VariantContext mask = tracker.getVariantContext(ref,"MaskAlleles",ref.getLocus());
|
||||||
|
VariantContext validate = tracker.getVariantContext(ref,"ValidateAlleles",ref.getLocus());
|
||||||
|
|
||||||
|
if ( mask == null && validate == null ) {
|
||||||
|
if ( indelCounter > 0 ) {
|
||||||
|
sequence.append('N');
|
||||||
|
indelCounter--;
|
||||||
|
} else {
|
||||||
|
sequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||||
|
}
|
||||||
|
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||||
|
} else if ( validate != null ) {
|
||||||
|
// doesn't matter if there's a mask here too -- this is what we want to validate
|
||||||
|
if ( validate.isFiltered() ) {
|
||||||
|
logger.warn("You are attempting to validate a filtered site. Why are you attempting to validate a filtered site? You should not be attempting to validate a filtered site.");
|
||||||
|
sequenceInvalid = true;
|
||||||
|
invReason.add("SITE_IS_FILTERED");
|
||||||
|
}
|
||||||
|
if ( validate.isIndel() ) {
|
||||||
|
sequence.append(Character.toUpperCase((char)ref.getBase()));
|
||||||
|
rawSequence.append(Character.toUpperCase((char)ref.getBase()));
|
||||||
|
}
|
||||||
|
sequence.append('[');
|
||||||
|
sequence.append(validate.getAlternateAllele(0).toString());
|
||||||
|
sequence.append('/');
|
||||||
|
sequence.append(validate.getReference().toString());
|
||||||
|
sequence.append(']');
|
||||||
|
// do this to the raw sequence to -- the indeces will line up that way
|
||||||
|
rawSequence.append('[');
|
||||||
|
rawSequence.append(validate.getAlternateAllele(0).getBaseString());
|
||||||
|
rawSequence.append('/');
|
||||||
|
rawSequence.append(validate.getReference().getBaseString());
|
||||||
|
rawSequence.append(']');
|
||||||
|
allelePos = ref.getLocus();
|
||||||
|
if ( indelCounter > 0 ) {
|
||||||
|
logger.warn("An indel event overlaps the event to be validated. This completely invalidates the probe.");
|
||||||
|
sequenceInvalid = true;
|
||||||
|
invReason.add("INDEL_OVERLAPS_VALIDATION_SITE");
|
||||||
|
if ( validate.isSNP() ) {
|
||||||
|
indelCounter--;
|
||||||
|
} else {
|
||||||
|
indelCounter -= validate.getEnd()-validate.getStart();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else /* (mask != null && validate == null ) */ {
|
||||||
|
if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) {
|
||||||
|
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
|
||||||
|
logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
|
||||||
|
sequenceInvalid = true;
|
||||||
|
invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION");
|
||||||
|
// note: indelCounter could be > 0 (could have small deletion within larger one). This always selects
|
||||||
|
// the larger event.
|
||||||
|
int indelCounterNew = mask.isInsertion() ? 2 : mask.getEnd()-mask.getStart();
|
||||||
|
if ( indelCounterNew > indelCounter ) {
|
||||||
|
indelCounter = indelCounterNew;
|
||||||
|
}
|
||||||
|
//sequence.append((char) ref.getBase());
|
||||||
|
//sequence.append(mask.isInsertion() ? 'I' : 'D');
|
||||||
|
sequence.append("N");
|
||||||
|
indelCounter--;
|
||||||
|
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||||
|
} else if ( indelCounter > 0 ) {
|
||||||
|
// previous section resets the indel counter. Doesn't matter if there's a SNP underlying this, we just want to append an 'N' and move on.
|
||||||
|
sequence.append('N');
|
||||||
|
indelCounter--;
|
||||||
|
rawSequence.append(Character.toUpperCase((char)ref.getBase()));
|
||||||
|
} else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )){
|
||||||
|
logger.debug("SNP in mask found at " + ref.getLocus().toString());
|
||||||
|
|
||||||
|
if ( lowerCaseSNPs ) {
|
||||||
|
sequence.append(Character.toLowerCase((char) ref.getBase()));
|
||||||
|
} else {
|
||||||
|
sequence.append((char) BaseUtils.N);
|
||||||
|
}
|
||||||
|
|
||||||
|
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||||
|
} else if ( mask.isSNP() ) {
|
||||||
|
logger.debug("SNP in mask found at "+ref.getLocus().toString()+" but was either filtered or monomorphic");
|
||||||
|
sequence.append((Character.toUpperCase((char) ref.getBase())));
|
||||||
|
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduce(Integer i, Integer j) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(Integer fin ) {
|
||||||
|
validateSequence();
|
||||||
|
if ( doNotUseBWA ) {
|
||||||
|
lowerRepeats();
|
||||||
|
} else {
|
||||||
|
lowerNonUniqueSegments();
|
||||||
|
}
|
||||||
|
print();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void validateSequence() {
|
||||||
|
// code for ensuring primer sequence is valid goes here
|
||||||
|
|
||||||
|
// validate that there are no masked sites near to the variant site
|
||||||
|
String seq = sequence.toString();
|
||||||
|
int start = seq.indexOf('[') - 4;
|
||||||
|
int end = seq.indexOf(']') + 5;
|
||||||
|
|
||||||
|
if ( start < 50 ) {
|
||||||
|
logger.warn("There is not enough sequence before the start position of the probed allele for adequate probe design. This site will not be designed.");
|
||||||
|
sequenceInvalid = true;
|
||||||
|
invReason.add("START_TOO_CLOSE");
|
||||||
|
} else if ( end > seq.length() - 50 ) {
|
||||||
|
logger.warn("There is not enough sequence after the end position of the probed allele fore adequate probe design. This site will not be desinged. ");
|
||||||
|
sequenceInvalid = true;
|
||||||
|
invReason.add("END_TOO_CLOSE");
|
||||||
|
} else {
|
||||||
|
boolean maskNearVariantSite = false;
|
||||||
|
for ( int i = start; i < end; i++ ) {
|
||||||
|
maskNearVariantSite |= (seq.charAt(i) == 'N' || Character.isLowerCase(seq.charAt(i)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( maskNearVariantSite ) {
|
||||||
|
logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed.");
|
||||||
|
sequenceInvalid = true;
|
||||||
|
invReason.add("VARIANT_TOO_NEAR_PROBE");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( seq.indexOf("[") != seq.lastIndexOf("[") ) {
|
||||||
|
logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap.");
|
||||||
|
sequenceInvalid = true;
|
||||||
|
invReason.add("MULTIPLE_PROBES");
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( seq.indexOf("[") < 0 ) {
|
||||||
|
logger.warn("No variants in region were found. This site will not be designed.");
|
||||||
|
sequenceInvalid = true;
|
||||||
|
invReason.add("NO_VARIANTS_FOUND");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void lowerNonUniqueSegments() {
|
||||||
|
if ( ! invReason.contains("MULTIPLE_PROBES") && !invReason.contains("NO_VARIANTS_FOUND") ) {
|
||||||
|
String leftFlank = rawSequence.toString().split("\\[")[0];
|
||||||
|
String rightFlank = rawSequence.toString().split("\\]")[1];
|
||||||
|
List<Integer> badLeft = getBadIndeces(leftFlank);
|
||||||
|
List<Integer> badRight = getBadIndeces(rightFlank);
|
||||||
|
// propagate lowercases into the printed sequence
|
||||||
|
for ( int idx = 0; idx < leftFlank.length(); idx++ ) {
|
||||||
|
while ( badLeft.size() > 0 && idx > badLeft.get(0) + virtualPrimerSize ) {
|
||||||
|
badLeft.remove(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( badLeft.size() > 0 && badLeft.get(0) <= idx && idx <= badLeft.get(0) + virtualPrimerSize ) {
|
||||||
|
sequence.setCharAt(idx,Character.toLowerCase(sequence.charAt(idx)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int offset = 1 + rawSequence.indexOf("]");
|
||||||
|
for ( int i= 0; i < rightFlank.length(); i++ ) {
|
||||||
|
int idx = i + offset;
|
||||||
|
while ( badRight.size() > 0 && i > badRight.get(0) + virtualPrimerSize ) {
|
||||||
|
//logger.debug("Removing "+badRight.get(0)+" because "+(badRight.get(0)+virtualPrimerSize)+" < "+i);
|
||||||
|
badRight.remove(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( badRight.size() > 0 && badRight.get(0) <= i && i <= badRight.get(0) + virtualPrimerSize ) {
|
||||||
|
//logger.debug("Resetting character on right flank: "+idx+" "+i+" offset="+offset);
|
||||||
|
//logger.debug(sequence);
|
||||||
|
sequence.setCharAt(idx,Character.toLowerCase(sequence.charAt(idx)));
|
||||||
|
//logger.debug(sequence);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Integer> getBadIndeces(String sequence) {
|
||||||
|
|
||||||
|
List<Integer> badLeftIndeces = new ArrayList<Integer>(sequence.length()-virtualPrimerSize);
|
||||||
|
for ( int i = 0; i < sequence.length()-virtualPrimerSize ; i++ ) {
|
||||||
|
String toAlign = sequence.substring(i,i+virtualPrimerSize);
|
||||||
|
Iterable<Alignment[]> allAlignments = aligner.getAllAlignments(toAlign.getBytes());
|
||||||
|
for ( Alignment[] alignments : allAlignments ) {
|
||||||
|
if ( alignments.length > 1 ) {
|
||||||
|
if ( alignments[0].getMappingQuality() == 0 ) {
|
||||||
|
// this region is bad -- multiple MQ alignments
|
||||||
|
badLeftIndeces.add(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return badLeftIndeces;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note- this is an old function - a proxy for identifying regions with low specificity to genome. Saved in case the alignment-based version
|
||||||
|
* turns out to be worse than just doing a simple repeat-lowering method.
|
||||||
|
*/
|
||||||
|
public void lowerRepeats() {
|
||||||
|
// convert to lower case low-complexity repeats, e.g. tandem k-mers
|
||||||
|
final int K_LIM = 8;
|
||||||
|
String seq = sequence.toString();
|
||||||
|
StringBuilder newSequence = new StringBuilder();
|
||||||
|
int start_pos = 0;
|
||||||
|
while( start_pos < seq.length() ) {
|
||||||
|
boolean broke = false;
|
||||||
|
for ( int length = K_LIM; length > 1; length -- ) {
|
||||||
|
//logger.debug(String.format("start1: %d end1: %d start2: %d end2: %d str: %d",start_pos,start_pos+length,start_pos+length,start_pos+2*length,seq.length()));
|
||||||
|
if ( start_pos + 2*length> seq.length() ) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) {
|
||||||
|
newSequence.append(seq.substring(start_pos,start_pos+length).toLowerCase());
|
||||||
|
newSequence.append(seq.substring(start_pos+length,start_pos+2*length).toLowerCase());
|
||||||
|
start_pos += 2*length;
|
||||||
|
broke = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ! broke ) {
|
||||||
|
newSequence.append(seq.substring(start_pos,start_pos+1));
|
||||||
|
start_pos++;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( seq.indexOf("[") != seq.lastIndexOf("[") ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
sequence = newSequence;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equalsIgnoreNs(String one, String two) {
|
||||||
|
if ( one.length() != two.length() ) { return false; }
|
||||||
|
for ( int idx = 0; idx < one.length(); idx++ ) {
|
||||||
|
if ( Character.toUpperCase(one.charAt(idx)) != Character.toUpperCase(two.charAt(idx)) ) {
|
||||||
|
if ( Character.toUpperCase(one.charAt(idx)) != 'N' && Character.toUpperCase(two.charAt(idx)) != 'N' ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//logger.debug(String.format("one: %s two: %s",one,two));
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void print() {
|
||||||
|
String valid;
|
||||||
|
if ( sequenceInvalid ) {
|
||||||
|
valid = "";
|
||||||
|
while ( invReason.size() > 0 ) {
|
||||||
|
String reason = invReason.get(0);
|
||||||
|
invReason.remove(reason);
|
||||||
|
int num = 1;
|
||||||
|
while ( invReason.contains(reason) ) {
|
||||||
|
num++;
|
||||||
|
invReason.remove(reason);
|
||||||
|
}
|
||||||
|
valid += String.format("%s=%d,",reason,num);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
valid = "Valid";
|
||||||
|
}
|
||||||
|
|
||||||
|
String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
|
||||||
|
out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -24,6 +24,16 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.Hidden;
|
||||||
|
import org.broadinstitute.sting.commandline.Input;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
import org.broadinstitute.sting.commandline.Hidden;
|
import org.broadinstitute.sting.commandline.Hidden;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
|
@ -44,6 +54,9 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.lang.annotation.AnnotationFormatError;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -91,6 +104,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
@Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
|
@Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
|
||||||
private boolean KEEP_AF_SPECTRUM = false;
|
private boolean KEEP_AF_SPECTRUM = false;
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="afFile", shortName="afFile", doc="The output recal file used by ApplyRecalibration", required=false)
|
||||||
|
private File AF_FILE = new File("");
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||||
|
private File FAMILY_STRUCTURE_FILE = null;
|
||||||
|
|
||||||
@Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
@Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||||
private String FAMILY_STRUCTURE = "";
|
private String FAMILY_STRUCTURE = "";
|
||||||
|
|
@ -113,6 +133,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
@Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false)
|
@Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false)
|
||||||
private boolean SELECT_INDELS = false;
|
private boolean SELECT_INDELS = false;
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||||
|
private String outMVFile = null;
|
||||||
|
|
||||||
/* Private class used to store the intermediate variants in the integer random selection process */
|
/* Private class used to store the intermediate variants in the integer random selection process */
|
||||||
private class RandomVariantStructure {
|
private class RandomVariantStructure {
|
||||||
|
|
@ -140,7 +163,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
private boolean DISCORDANCE_ONLY = false;
|
private boolean DISCORDANCE_ONLY = false;
|
||||||
private boolean CONCORDANCE_ONLY = false;
|
private boolean CONCORDANCE_ONLY = false;
|
||||||
|
|
||||||
private MendelianViolation mv;
|
private Set<MendelianViolation> mvSet = new HashSet<MendelianViolation>();
|
||||||
|
|
||||||
/* default name for the variant dataset (VCF) */
|
/* default name for the variant dataset (VCF) */
|
||||||
private final String variantRodName = "variant";
|
private final String variantRodName = "variant";
|
||||||
|
|
@ -155,8 +178,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
private RandomVariantStructure [] variantArray;
|
private RandomVariantStructure [] variantArray;
|
||||||
|
|
||||||
|
|
||||||
|
/* Variables used for random selection with AF boosting */
|
||||||
|
private ArrayList<Double> afBreakpoints = null;
|
||||||
|
private ArrayList<Double> afBoosts = null;
|
||||||
|
double bkDelta = 0.0;
|
||||||
|
|
||||||
|
|
||||||
|
private PrintStream outMVFileStream = null;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
|
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
|
||||||
|
|
@ -212,10 +241,29 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
CONCORDANCE_ONLY = concordanceRodName.length() > 0;
|
CONCORDANCE_ONLY = concordanceRodName.length() > 0;
|
||||||
if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceRodName);
|
if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceRodName);
|
||||||
|
|
||||||
if (MENDELIAN_VIOLATIONS)
|
if (MENDELIAN_VIOLATIONS) {
|
||||||
mv = new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD);
|
if ( FAMILY_STRUCTURE_FILE != null) {
|
||||||
|
try {
|
||||||
|
for ( final String line : new XReadLines( FAMILY_STRUCTURE_FILE ) ) {
|
||||||
|
MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
|
||||||
|
if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom()))
|
||||||
|
mvSet.add(mv);
|
||||||
|
}
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new UserException.CouldNotReadInputFile(AF_FILE, e);
|
||||||
|
}
|
||||||
|
if (outMVFile != null)
|
||||||
|
try {
|
||||||
|
outMVFileStream = new PrintStream(outMVFile);
|
||||||
|
}
|
||||||
|
catch (FileNotFoundException e) {
|
||||||
|
throw new UserException.CouldNotCreateOutputFile(outMVFile, "Can't open output file", e); }
|
||||||
|
}
|
||||||
|
else
|
||||||
|
mvSet.add(new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD));
|
||||||
|
}
|
||||||
else if (!FAMILY_STRUCTURE.isEmpty()) {
|
else if (!FAMILY_STRUCTURE.isEmpty()) {
|
||||||
mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
|
mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
|
||||||
MENDELIAN_VIOLATIONS = true;
|
MENDELIAN_VIOLATIONS = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -227,6 +275,33 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
||||||
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + fractionRandom + "% of the variants at random from the variant track");
|
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + fractionRandom + "% of the variants at random from the variant track");
|
||||||
|
|
||||||
|
|
||||||
|
if (KEEP_AF_SPECTRUM) {
|
||||||
|
try {
|
||||||
|
afBreakpoints = new ArrayList<Double>();
|
||||||
|
afBoosts = new ArrayList<Double>();
|
||||||
|
logger.info("Reading in AF boost table...");
|
||||||
|
boolean firstLine = false;
|
||||||
|
for ( final String line : new XReadLines( AF_FILE ) ) {
|
||||||
|
if (!firstLine) {
|
||||||
|
firstLine = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
final String[] vals = line.split(" ");
|
||||||
|
|
||||||
|
double bkp = Double.valueOf(vals[0]);
|
||||||
|
double afb = Double.valueOf(vals[1]);
|
||||||
|
afBreakpoints.add(bkp);
|
||||||
|
afBoosts.add(afb);
|
||||||
|
|
||||||
|
}
|
||||||
|
bkDelta = afBreakpoints.get(0);
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new UserException.CouldNotReadInputFile(AF_FILE, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -250,9 +325,24 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
for (VariantContext vc : vcs) {
|
for (VariantContext vc : vcs) {
|
||||||
if (MENDELIAN_VIOLATIONS) {
|
if (MENDELIAN_VIOLATIONS) {
|
||||||
if (!mv.isViolation(vc)) {
|
boolean foundMV = false;
|
||||||
break;
|
for (MendelianViolation mv : mvSet) {
|
||||||
|
if (mv.isViolation(vc)) {
|
||||||
|
foundMV = true;
|
||||||
|
//System.out.println(vc.toString());
|
||||||
|
if (outMVFile != null)
|
||||||
|
outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " +
|
||||||
|
"childG=%s childGL=%s\n",vc.getChr(), vc.getStart(),
|
||||||
|
vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getChromosomeCount(vc.getAlternateAllele(0)),
|
||||||
|
mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(),
|
||||||
|
vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
|
||||||
|
vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
|
||||||
|
vc.getGenotype(mv.getSampleChild()).toBriefString(),vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString() );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!foundMV)
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
if (DISCORDANCE_ONLY) {
|
if (DISCORDANCE_ONLY) {
|
||||||
Collection<VariantContext> compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false);
|
Collection<VariantContext> compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false);
|
||||||
|
|
@ -283,46 +373,59 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
if (SELECT_RANDOM_NUMBER) {
|
if (SELECT_RANDOM_NUMBER) {
|
||||||
randomlyAddVariant(++variantNumber, sub, ref.getBase());
|
randomlyAddVariant(++variantNumber, sub, ref.getBase());
|
||||||
}
|
}
|
||||||
else if (!SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom) {
|
else if (!SELECT_RANDOM_FRACTION || (!KEEP_AF_SPECTRUM && GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
|
||||||
vcfWriter.add(sub, ref.getBase());
|
vcfWriter.add(sub, ref.getBase());
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) {
|
if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) {
|
||||||
Collection<VariantContext> compVCs = tracker.getVariantContexts(ref, inputAFRodName, null, context.getLocation(), true, false);
|
|
||||||
if (compVCs.isEmpty())
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
// ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
|
// ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
|
||||||
// We then pick a variant with probablity AF*desiredFraction
|
// We then pick a variant with probablity AF*desiredFraction
|
||||||
for (VariantContext compVC : compVCs) {
|
if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
|
||||||
if ( compVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
|
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
|
||||||
String afo = compVC.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
|
|
||||||
|
|
||||||
double af;
|
double af;
|
||||||
if (afo.contains(",")) {
|
double afBoost = 1.0;
|
||||||
String[] afs = afo.split(",");
|
if (afo.contains(",")) {
|
||||||
afs[0] = afs[0].substring(1,afs[0].length());
|
String[] afs = afo.split(",");
|
||||||
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
|
afs[0] = afs[0].substring(1,afs[0].length());
|
||||||
|
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
|
||||||
|
|
||||||
double[] afd = new double[afs.length];
|
double[] afd = new double[afs.length];
|
||||||
|
|
||||||
for (int k=0; k < afd.length; k++)
|
for (int k=0; k < afd.length; k++)
|
||||||
afd[k] = Double.valueOf(afs[k]);
|
afd[k] = Double.valueOf(afs[k]);
|
||||||
|
|
||||||
af = MathUtils.arrayMax(afd);
|
af = MathUtils.arrayMax(afd);
|
||||||
//af = Double.valueOf(afs[0]);
|
//af = Double.valueOf(afs[0]);
|
||||||
|
|
||||||
}
|
|
||||||
else
|
|
||||||
af = Double.valueOf(afo);
|
|
||||||
|
|
||||||
//System.out.format("%s .. %4.4f\n",afo.toString(), af);
|
|
||||||
if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * af)
|
|
||||||
vcfWriter.add(sub, ref.getBase());
|
|
||||||
}
|
}
|
||||||
break; // do only one vc
|
else
|
||||||
|
af = Double.valueOf(afo);
|
||||||
|
|
||||||
|
// now boost af by table read from file if desired
|
||||||
|
//double bkpt = 0.0;
|
||||||
|
int bkidx = 0;
|
||||||
|
if (!afBreakpoints.isEmpty()) {
|
||||||
|
for ( Double bkpt : afBreakpoints) {
|
||||||
|
if (af < bkpt + bkDelta)
|
||||||
|
break;
|
||||||
|
else bkidx++;
|
||||||
|
}
|
||||||
|
if (bkidx >=afBoosts.size())
|
||||||
|
bkidx = afBoosts.size()-1;
|
||||||
|
afBoost = afBoosts.get(bkidx);
|
||||||
|
//System.out.formatPrin("af:%f bkidx:%d afboost:%f\n",af,bkidx,afBoost);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.format("%s .. %4.4f\n",afo.toString(), af);
|
||||||
|
if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * afBoost * afBoost)
|
||||||
|
vcfWriter.add(sub, ref.getBase());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -406,8 +509,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
private boolean haveSameGenotypes(Genotype g1, Genotype g2) {
|
private boolean haveSameGenotypes(Genotype g1, Genotype g2) {
|
||||||
if ((g1.isCalled() && g2.isFiltered()) ||
|
if ((g1.isCalled() && g2.isFiltered()) ||
|
||||||
(g2.isCalled() && g1.isFiltered()) ||
|
(g2.isCalled() && g1.isFiltered()) ||
|
||||||
(g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))
|
(g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
List<Allele> a1s = g1.getAlleles();
|
List<Allele> a1s = g1.getAlleles();
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,8 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
|
@ -33,7 +35,6 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
|
@ -74,17 +75,29 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
|
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
|
||||||
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
|
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
|
||||||
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
|
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
|
||||||
getters.put("REF", new Getter() { public String get(VariantContext vc) { return vc.getReference().toString(); } });
|
getters.put("REF", new Getter() {
|
||||||
|
public String get(VariantContext vc) {
|
||||||
|
String x = "";
|
||||||
|
if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) {
|
||||||
|
Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY));
|
||||||
|
x=x+new String(new byte[]{refByte});
|
||||||
|
}
|
||||||
|
return x+vc.getReference().getDisplayString();
|
||||||
|
}
|
||||||
|
});
|
||||||
getters.put("ALT", new Getter() {
|
getters.put("ALT", new Getter() {
|
||||||
public String get(VariantContext vc) {
|
public String get(VariantContext vc) {
|
||||||
StringBuilder x = new StringBuilder();
|
StringBuilder x = new StringBuilder();
|
||||||
int n = vc.getAlternateAlleles().size();
|
int n = vc.getAlternateAlleles().size();
|
||||||
|
|
||||||
if ( n == 0 ) return ".";
|
if ( n == 0 ) return ".";
|
||||||
|
if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) {
|
||||||
|
Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY));
|
||||||
|
x.append(new String(new byte[]{refByte}));
|
||||||
|
}
|
||||||
|
|
||||||
for ( int i = 0; i < n; i++ ) {
|
for ( int i = 0; i < n; i++ ) {
|
||||||
if ( i != 0 ) x.append(",");
|
if ( i != 0 ) x.append(",");
|
||||||
x.append(vc.getAlternateAllele(i).toString());
|
x.append(vc.getAlternateAllele(i).getDisplayString());
|
||||||
}
|
}
|
||||||
return x.toString();
|
return x.toString();
|
||||||
}
|
}
|
||||||
|
|
@ -168,6 +181,31 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||||
throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc));
|
throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (field.equals("AF") || field.equals("AC")) {
|
||||||
|
String afo = val;
|
||||||
|
|
||||||
|
double af=0;
|
||||||
|
if (afo.contains(",")) {
|
||||||
|
String[] afs = afo.split(",");
|
||||||
|
afs[0] = afs[0].substring(1,afs[0].length());
|
||||||
|
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
|
||||||
|
|
||||||
|
double[] afd = new double[afs.length];
|
||||||
|
|
||||||
|
for (int k=0; k < afd.length; k++)
|
||||||
|
afd[k] = Double.valueOf(afs[k]);
|
||||||
|
|
||||||
|
af = MathUtils.arrayMax(afd);
|
||||||
|
//af = Double.valueOf(afs[0]);
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if (!afo.equals("NA"))
|
||||||
|
af = Double.valueOf(afo);
|
||||||
|
|
||||||
|
val = Double.toString(af);
|
||||||
|
|
||||||
|
}
|
||||||
vals.add(val);
|
vals.add(val);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
package org.broadinstitute.sting.alignment;
|
||||||
|
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
import org.broadinstitute.sting.WalkerTest;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Integration tests for the aligner.
|
||||||
|
*
|
||||||
|
* @author mhanna
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
public class AlignerIntegrationTest extends WalkerTest {
|
||||||
|
@Test
|
||||||
|
public void testBasicAlignment() {
|
||||||
|
String md5 = "34eb4323742999d6d250a0aaa803c6d5";
|
||||||
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
"-R " + GATKDataLocation + "bwa/human_b36_both.fasta" +
|
||||||
|
" -T Align" +
|
||||||
|
" -I " + validationDataLocation + "NA12878_Pilot1_20.trimmed.unmapped.bam" +
|
||||||
|
" -o %s",
|
||||||
|
1, // just one output file
|
||||||
|
Arrays.asList(md5));
|
||||||
|
executeTest("testBasicAlignment", spec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,34 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.sequenom;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.WalkerTest;
|
|
||||||
import org.testng.annotations.Test;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
|
|
||||||
public class PickSequenomProbesIntegrationTest extends WalkerTest {
|
|
||||||
@Test
|
|
||||||
public void testProbes() {
|
|
||||||
String testVCF = validationDataLocation + "complexExample.vcf4";
|
|
||||||
String testArgs = "-R " + b36KGReference + " -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B:input,VCF "+testVCF+" -o %s";
|
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
|
||||||
Arrays.asList("6b5409cc78960f1be855536ed89ea9dd"));
|
|
||||||
executeTest("Test probes", spec);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testProbesUsingDbSNPMask() {
|
|
||||||
|
|
||||||
String md5 = "46d53491af1d3aa0ee1f1e13d68b732d";
|
|
||||||
String testVCF = validationDataLocation + "pickSeqIntegrationTest.vcf";
|
|
||||||
|
|
||||||
String testArgs = "-snp_mask " + validationDataLocation + "pickSeqIntegrationTest.bed -R "
|
|
||||||
+ b36KGReference + " -omitWindow -nameConvention "
|
|
||||||
+ "-project_id 1kgp3_s4_lf -T PickSequenomProbes -B:input,VCF "+testVCF+" -o %s";
|
|
||||||
WalkerTestSpec spec1 = new WalkerTestSpec(testArgs, 1, Arrays.asList(md5));
|
|
||||||
executeTest("Test probes", spec1);
|
|
||||||
|
|
||||||
testArgs += " -nmw 1";
|
|
||||||
WalkerTestSpec spec2 = new WalkerTestSpec(testArgs, 1, Arrays.asList(md5));
|
|
||||||
executeTest("Test probes", spec2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.WalkerTest;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: Ghost
|
||||||
|
* Date: 7/19/11
|
||||||
|
* Time: 7:39 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class ValidationAmpliconsIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWikiExample() {
|
||||||
|
String siteVCF = validationDataLocation + "sites_to_validate.vcf";
|
||||||
|
String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf";
|
||||||
|
String intervalTable = validationDataLocation + "amplicon_interval_table1.table";
|
||||||
|
String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s";
|
||||||
|
testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF;
|
||||||
|
testArgs += " --virtualPrimerSize 30";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||||
|
Arrays.asList("27f9450afa132888a8994167f0035fd7"));
|
||||||
|
executeTest("Test probes", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWikiExampleNoBWA() {
|
||||||
|
String siteVCF = validationDataLocation + "sites_to_validate.vcf";
|
||||||
|
String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf";
|
||||||
|
String intervalTable = validationDataLocation + "amplicon_interval_table1.table";
|
||||||
|
String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s";
|
||||||
|
testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF;
|
||||||
|
testArgs += " --virtualPrimerSize 30 --doNotUseBWA";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||||
|
Arrays.asList("f2611ff1d9cd5bedaad003251fed8bc1"));
|
||||||
|
executeTest("Test probes", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWikiExampleMonoFilter() {
|
||||||
|
String siteVCF = validationDataLocation + "sites_to_validate.vcf";
|
||||||
|
String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf";
|
||||||
|
String intervalTable = validationDataLocation + "amplicon_interval_table1.table";
|
||||||
|
String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s";
|
||||||
|
testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF;
|
||||||
|
testArgs += " --virtualPrimerSize 30 --filterMonomorphic";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
|
||||||
|
Arrays.asList("77b3f30e38fedad812125bdf6cf3255f"));
|
||||||
|
executeTest("Test probes", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -44,7 +44,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testComplexVariantsToTable() {
|
public void testComplexVariantsToTable() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableCmd(" -AMD"),
|
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableCmd(" -AMD"),
|
||||||
Arrays.asList("b2a3712c1bfad8f1383ffada8b5017ba"));
|
Arrays.asList("e8f771995127b727fb433da91dd4ee98"));
|
||||||
executeTest("testComplexVariantsToTable", spec).getFirst();
|
executeTest("testComplexVariantsToTable", spec).getFirst();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue