Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Mauricio Carneiro 2011-07-20 11:38:16 -04:00
commit 4e00666632
72 changed files with 6173 additions and 509 deletions

View File

@ -0,0 +1,70 @@
#include "MainTools.h"
#include "Basevector.h"
#include "lookup/LookAlign.h"
#include "lookup/SerialQltout.h"
unsigned int MatchingEnd(look_align &la, vecbasevector &candidates, vecbasevector &ref) {
//la.PrintParseable(cout);
for (int i = 0; i < candidates.size(); i++) {
look_align newla = la;
if (newla.rc1) { candidates[i].ReverseComplement(); }
newla.ResetFromAlign(newla.a, candidates[i], ref[la.target_id]);
//newla.PrintParseable(cout, &candidates[i], &ref[newla.target_id]);
//cout << newla.Errors() << " " << la.Errors() << endl;
if (newla.Errors() == la.Errors()) {
return i;
}
}
//FatalErr("Query id " + ToString(la.query_id) + " had no matches.");
return candidates.size() + 1;
}
int main(int argc, char **argv) {
RunTime();
BeginCommandArguments;
CommandArgument_String(ALIGNS);
CommandArgument_String(FASTB_END_1);
CommandArgument_String(FASTB_END_2);
CommandArgument_String(REFERENCE);
CommandArgument_String(ALIGNS_END_1_OUT);
CommandArgument_String(ALIGNS_END_2_OUT);
EndCommandArguments;
vecbasevector ref(REFERENCE);
vecbasevector reads1(FASTB_END_1);
vecbasevector reads2(FASTB_END_2);
ofstream aligns1stream(ALIGNS_END_1_OUT.c_str());
ofstream aligns2stream(ALIGNS_END_2_OUT.c_str());
basevector bv;
SerialQltout sqltout(ALIGNS);
look_align la;
while (sqltout.Next(la)) {
vecbasevector candidates(2);
candidates[0] = reads1[la.query_id];
candidates[1] = reads2[la.query_id];
unsigned int matchingend = MatchingEnd(la, candidates, ref);
if (matchingend < 2) {
bv = (matchingend == 0) ? reads1[la.query_id] : reads2[la.query_id];
//la.PrintParseable(cout, &bv, &ref[la.target_id]);
la.PrintParseable(((matchingend == 0) ? aligns1stream : aligns2stream), &bv, &ref[la.target_id]);
}
}
aligns1stream.close();
aligns2stream.close();
return 0;
}

View File

@ -0,0 +1,21 @@
CXX=g++
CXXFLAGS=-g -Wall -O2 -m64 -fPIC
.cpp.o:
$(CXX) -c $(CXXFLAGS) -I$(BWA_HOME) -I$(JAVA_INCLUDE) $< -o $@
all: init lib
init:
@echo Please make sure the following platforms are set correctly on your machine.
@echo BWA_HOME=$(BWA_HOME)
@echo JAVA_INCLUDE=$(JAVA_INCLUDE)
@echo TARGET_LIB=$(TARGET_LIB)
@echo EXTRA_LIBS=$(EXTRA_LIBS)
@echo LIBTOOL_COMMAND=$(LIBTOOL_COMMAND)
lib: org_broadinstitute_sting_alignment_bwa_c_BWACAligner.o bwa_gateway.o
$(LIBTOOL_COMMAND) $? -o $(TARGET_LIB) -L$(BWA_HOME) -lbwacore $(EXTRA_LIBS)
clean:
rm *.o libbwa.*

View File

@ -0,0 +1,7 @@
#!/bin/sh
export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa"
export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux"
export TARGET_LIB="libbwa.so"
export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread"
export LIBTOOL_COMMAND="g++ -shared -Wl,-soname,libbwa.so"
make

View File

@ -0,0 +1,7 @@
#!/bin/sh
export BWA_HOME="/Users/mhanna/src/bwa"
export JAVA_INCLUDE="/System/Library/Frameworks/JavaVM.framework/Headers"
export TARGET_LIB="libbwa.dylib"
export EXTRA_LIBS="-lc -lz -lsupc++"
export LIBTOOL_COMMAND="libtool -dynamic"
make

View File

@ -0,0 +1,268 @@
#include <cstdio>
#include <cstring>
#include "bwase.h"
#include "bwa_gateway.h"
BWA::BWA(const char* ann_filename,
const char* amb_filename,
const char* pac_filename,
const char* forward_bwt_filename,
const char* forward_sa_filename,
const char* reverse_bwt_filename,
const char* reverse_sa_filename)
{
// Load the bns (?) and reference
bns = bns_restore_core(ann_filename,amb_filename,pac_filename);
reference = new ubyte_t[bns->l_pac/4+1];
rewind(bns->fp_pac);
fread(reference, 1, bns->l_pac/4+1, bns->fp_pac);
fclose(bns->fp_pac);
bns->fp_pac = NULL;
// Load the BWTs (both directions) and suffix arrays (both directions)
bwts[0] = bwt_restore_bwt(forward_bwt_filename);
bwt_restore_sa(forward_sa_filename, bwts[0]);
bwts[1] = bwt_restore_bwt(reverse_bwt_filename);
bwt_restore_sa(reverse_sa_filename, bwts[1]);
load_default_options();
// initialize the bwase subsystem
bwase_initialize();
}
BWA::~BWA() {
delete[] reference;
bns_destroy(bns);
bwt_destroy(bwts[0]);
bwt_destroy(bwts[1]);
}
void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count)
{
bwa_seq_t* sequence = create_sequence(bases, read_length);
// Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln).
// This method will destroy the contents of seq and rseq.
bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options);
paths = new bwt_aln1_t[sequence->n_aln];
memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t));
num_paths = sequence->n_aln;
// Call aln2seq to initialize the type of match present.
bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
best_path_count = sequence->c1;
second_best_path_count = sequence->c2;
bwa_free_read_seq(1,sequence);
}
Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) {
bwa_seq_t* sequence = create_sequence(bases,read_length);
// Calculate paths.
bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options);
// Check for no alignments found and return null.
if(sequence->n_aln == 0) {
bwa_free_read_seq(1,sequence);
return NULL;
}
// bwa_cal_sa_reg_gap destroys the bases / read length. Copy them back in.
copy_bases_into_sequence(sequence,bases,read_length);
// Pick best alignment and propagate its information into the sequence.
bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
// Generate the best alignment from the sequence.
Alignment* alignment = new Alignment;
*alignment = generate_final_alignment_from_sequence(sequence);
bwa_free_read_seq(1,sequence);
return alignment;
}
void BWA::generate_alignments_from_paths(const char* bases,
const unsigned read_length,
bwt_aln1_t* paths,
const unsigned num_paths,
const unsigned best_count,
const unsigned second_best_count,
Alignment*& alignments,
unsigned& num_alignments)
{
bwa_seq_t* sequence = create_sequence(bases,read_length);
sequence->aln = paths;
sequence->n_aln = num_paths;
// (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself.
bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
// But overwrite key parts of the sequence in case the user passed back only a smaller subset
// of the paths.
sequence->c1 = best_count;
sequence->c2 = second_best_count;
sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
num_alignments = 0;
for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++)
num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1;
alignments = new Alignment[num_alignments];
unsigned alignment_idx = 0;
for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
// Stub in a 'working' path, so that only the desired alignment is local-aligned.
const bwt_aln1_t* path = paths + path_idx;
bwt_aln1_t working_path = *path;
// Loop through all alignments, aligning each one individually.
for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) {
working_path.k = working_path.l = sa_idx;
sequence->aln = &working_path;
sequence->n_aln = 1;
sequence->sa = sa_idx;
sequence->strand = path->a;
sequence->score = path->score;
// Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse.
// TODO: Fix the interface to bwa_refine_gapped so its easier to work with.
if(alignment_idx > 0)
seq_reverse(sequence->len, sequence->seq, 0);
// Copy the local alignment data into the alignment object.
*(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence);
alignment_idx++;
}
}
sequence->aln = NULL;
sequence->n_aln = 0;
bwa_free_read_seq(1,sequence);
}
Alignment BWA::generate_final_alignment_from_sequence(bwa_seq_t* sequence) {
// Calculate the local coordinate and local alignment.
bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr);
bwa_refine_gapped(bns, 1, sequence, reference, NULL);
// Copy the local alignment data into the alignment object.
Alignment alignment;
// Populate basic path info
alignment.edit_distance = sequence->nm;
alignment.num_mismatches = sequence->n_mm;
alignment.num_gap_opens = sequence->n_gapo;
alignment.num_gap_extensions = sequence->n_gape;
alignment.num_best = sequence->c1;
alignment.num_second_best = sequence->c2;
// Final alignment position.
alignment.type = sequence->type;
bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig);
alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1;
alignment.negative_strand = sequence->strand;
alignment.mapping_quality = sequence->mapQ;
// Cigar step.
alignment.cigar = NULL;
if(sequence->cigar) {
alignment.cigar = new uint16_t[sequence->n_cigar];
memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t));
}
alignment.n_cigar = sequence->n_cigar;
// MD tag with a better breakdown of differences in the cigar
alignment.md = strdup(sequence->md);
delete[] sequence->md;
sequence->md = NULL;
return alignment;
}
void BWA::load_default_options()
{
options.s_mm = 3;
options.s_gapo = 11;
options.s_gape = 4;
options.mode = 3;
options.indel_end_skip = 5;
options.max_del_occ = 10;
options.max_entries = 2000000;
options.fnr = 0.04;
options.max_diff = -1;
options.max_gapo = 1;
options.max_gape = 6;
options.max_seed_diff = 2;
options.seed_len = 2147483647;
options.n_threads = 1;
options.max_top2 = 30;
options.trim_qual = 0;
}
void BWA::set_max_edit_distance(float edit_distance) {
if(edit_distance > 0 && edit_distance < 1) {
options.fnr = edit_distance;
options.max_diff = -1;
}
else {
options.fnr = -1.0;
options.max_diff = (int)edit_distance;
}
}
void BWA::set_max_gap_opens(int max_gap_opens) { options.max_gapo = max_gap_opens; }
void BWA::set_max_gap_extensions(int max_gap_extensions) { options.max_gape = max_gap_extensions; }
void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_skip = indel_range; }
void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; }
void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; }
void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; }
/**
* Create a sequence with a set of reasonable initial defaults.
* Will leave seq and rseq empty.
*/
bwa_seq_t* BWA::create_sequence(const char* bases, const unsigned read_length)
{
bwa_seq_t* sequence = new bwa_seq_t;
sequence->tid = -1;
sequence->name = 0;
copy_bases_into_sequence(sequence, bases, read_length);
sequence->qual = 0;
sequence->aln = 0;
sequence->md = 0;
sequence->cigar = NULL;
sequence->n_cigar = 0;
sequence->multi = NULL;
sequence->n_multi = 0;
return sequence;
}
void BWA::copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length)
{
// seq, rseq will ultimately be freed by bwa_cal_sa_reg_gap
sequence->seq = new ubyte_t[read_length];
sequence->rseq = new ubyte_t[read_length];
for(unsigned i = 0; i < read_length; i++) sequence->seq[i] = nst_nt4_table[(unsigned)bases[i]];
memcpy(sequence->rseq,sequence->seq,read_length);
// BWA expects the read bases to arrive reversed.
seq_reverse(read_length,sequence->seq,0);
seq_reverse(read_length,sequence->rseq,1);
sequence->full_len = sequence->len = read_length;
}

View File

@ -0,0 +1,82 @@
#ifndef BWA_GATEWAY
#define BWA_GATEWAY
#include <cstdio>
#include "bntseq.h"
#include "bwt.h"
#include "bwtaln.h"
class Alignment {
public:
uint32_t type;
int contig;
bwtint_t pos;
bool negative_strand;
uint32_t mapping_quality;
uint16_t *cigar;
int n_cigar;
uint8_t num_mismatches;
uint8_t num_gap_opens;
uint8_t num_gap_extensions;
uint16_t edit_distance;
uint32_t num_best;
uint32_t num_second_best;
char* md;
};
class BWA {
private:
bntseq_t *bns;
ubyte_t* reference;
bwt_t* bwts[2];
gap_opt_t options;
void load_default_options();
bwa_seq_t* create_sequence(const char* bases, const unsigned read_length);
void copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length);
Alignment generate_final_alignment_from_sequence(bwa_seq_t* sequence);
public:
BWA(const char* ann_filename,
const char* amb_filename,
const char* pac_filename,
const char* forward_bwt_filename,
const char* forward_sa_filename,
const char* reverse_bwt_filename,
const char* reverse_sa_filename);
~BWA();
// Parameterize the aligner.
void set_max_edit_distance(float edit_distance);
void set_max_gap_opens(int max_gap_opens);
void set_max_gap_extensions(int max_gap_extensions);
void set_disallow_indel_within_range(int indel_range);
void set_mismatch_penalty(int penalty);
void set_gap_open_penalty(int penalty);
void set_gap_extension_penalty(int penalty);
// Perform the alignment
Alignment* generate_single_alignment(const char* bases,
const unsigned read_length);
void find_paths(const char* bases,
const unsigned read_length,
bwt_aln1_t*& paths,
unsigned& num_paths,
unsigned& best_path_count,
unsigned& second_best_path_count);
void generate_alignments_from_paths(const char* bases,
const unsigned read_length,
bwt_aln1_t* paths,
const unsigned num_paths,
const unsigned best_count,
const unsigned second_best_count,
Alignment*& alignments,
unsigned& num_alignments);
};
#endif // BWA_GATEWAY

Binary file not shown.

View File

@ -0,0 +1,437 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "bntseq.h"
#include "bwt.h"
#include "bwtaln.h"
#include "bwa_gateway.h"
#include "org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h"
typedef void (BWA::*int_setter)(int value);
typedef void (BWA::*float_setter)(float value);
static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment);
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name);
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter);
static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter);
static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message);
JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create(JNIEnv* env, jobject instance, jobject bwtFiles, jobject configuration)
{
jstring java_ann = get_configuration_file(env,bwtFiles,"annFile");
if(java_ann == NULL) return 0L;
jstring java_amb = get_configuration_file(env,bwtFiles,"ambFile");
if(java_amb == NULL) return 0L;
jstring java_pac = get_configuration_file(env,bwtFiles,"pacFile");
if(java_pac == NULL) return 0L;
jstring java_forward_bwt = get_configuration_file(env,bwtFiles,"forwardBWTFile");
if(java_forward_bwt == NULL) return 0L;
jstring java_forward_sa = get_configuration_file(env,bwtFiles,"forwardSAFile");
if(java_forward_sa == NULL) return 0L;
jstring java_reverse_bwt = get_configuration_file(env,bwtFiles,"reverseBWTFile");
if(java_reverse_bwt == NULL) return 0L;
jstring java_reverse_sa = get_configuration_file(env,bwtFiles,"reverseSAFile");
if(java_reverse_sa == NULL) return 0L;
const char* ann_filename = env->GetStringUTFChars(java_ann,JNI_FALSE);
if(env->ExceptionCheck()) return 0L;
const char* amb_filename = env->GetStringUTFChars(java_amb,JNI_FALSE);
if(env->ExceptionCheck()) return 0L;
const char* pac_filename = env->GetStringUTFChars(java_pac,JNI_FALSE);
if(env->ExceptionCheck()) return 0L;
const char* forward_bwt_filename = env->GetStringUTFChars(java_forward_bwt,JNI_FALSE);
if(env->ExceptionCheck()) return 0L;
const char* forward_sa_filename = env->GetStringUTFChars(java_forward_sa,JNI_FALSE);
if(env->ExceptionCheck()) return 0L;
const char* reverse_bwt_filename = env->GetStringUTFChars(java_reverse_bwt,JNI_FALSE);
if(env->ExceptionCheck()) return 0L;
const char* reverse_sa_filename = env->GetStringUTFChars(java_reverse_sa,JNI_FALSE);
if(env->ExceptionCheck()) return 0L;
BWA* bwa = new BWA(ann_filename,
amb_filename,
pac_filename,
forward_bwt_filename,
forward_sa_filename,
reverse_bwt_filename,
reverse_sa_filename);
Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(env,instance,(jlong)bwa,configuration);
if(env->ExceptionCheck()) return 0L;
env->ReleaseStringUTFChars(java_ann,ann_filename);
if(env->ExceptionCheck()) return 0L;
env->ReleaseStringUTFChars(java_amb,amb_filename);
if(env->ExceptionCheck()) return 0L;
env->ReleaseStringUTFChars(java_pac,pac_filename);
if(env->ExceptionCheck()) return 0L;
env->ReleaseStringUTFChars(java_forward_bwt,forward_bwt_filename);
if(env->ExceptionCheck()) return 0L;
env->ReleaseStringUTFChars(java_forward_sa,forward_sa_filename);
if(env->ExceptionCheck()) return 0L;
env->ReleaseStringUTFChars(java_reverse_bwt,reverse_bwt_filename);
if(env->ExceptionCheck()) return 0L;
env->ReleaseStringUTFChars(java_reverse_sa,reverse_sa_filename);
if(env->ExceptionCheck()) return 0L;
return (jlong)bwa;
}
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy(JNIEnv* env, jobject instance, jlong java_bwa)
{
BWA* bwa = (BWA*)java_bwa;
delete bwa;
}
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(JNIEnv *env, jobject instance, jlong java_bwa, jobject configuration) {
BWA* bwa = (BWA*)java_bwa;
set_float_configuration_param(env, configuration, "maximumEditDistance", bwa, &BWA::set_max_edit_distance);
if(env->ExceptionCheck()) return;
set_int_configuration_param(env, configuration, "maximumGapOpens", bwa, &BWA::set_max_gap_opens);
if(env->ExceptionCheck()) return;
set_int_configuration_param(env, configuration, "maximumGapExtensions", bwa, &BWA::set_max_gap_extensions);
if(env->ExceptionCheck()) return;
set_int_configuration_param(env, configuration, "disallowIndelWithinRange", bwa, &BWA::set_disallow_indel_within_range);
if(env->ExceptionCheck()) return;
set_int_configuration_param(env, configuration, "mismatchPenalty", bwa, &BWA::set_mismatch_penalty);
if(env->ExceptionCheck()) return;
set_int_configuration_param(env, configuration, "gapOpenPenalty", bwa, &BWA::set_gap_open_penalty);
if(env->ExceptionCheck()) return;
set_int_configuration_param(env, configuration, "gapExtensionPenalty", bwa, &BWA::set_gap_extension_penalty);
if(env->ExceptionCheck()) return;
}
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases)
{
BWA* bwa = (BWA*)java_bwa;
const jsize read_length = env->GetArrayLength(java_bases);
if(env->ExceptionCheck()) return NULL;
jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
if(read_bases == NULL) return NULL;
bwt_aln1_t* paths = NULL;
unsigned num_paths = 0;
unsigned best_path_count, second_best_path_count;
bwa->find_paths((const char*)read_bases,read_length,paths,num_paths,best_path_count,second_best_path_count);
jobjectArray java_paths = env->NewObjectArray(num_paths, env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"), NULL);
if(java_paths == NULL) return NULL;
for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
bwt_aln1_t& path = *(paths + path_idx);
jclass java_path_class = env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath");
if(java_path_class == NULL) return NULL;
jmethodID java_path_constructor = env->GetMethodID(java_path_class, "<init>", "(IIIZJJIII)V");
if(java_path_constructor == NULL) return NULL;
// Note that k/l are being cast to long. Bad things will happen if JNI assumes that they're ints.
jobject java_path = env->NewObject(java_path_class,
java_path_constructor,
path.n_mm,
path.n_gapo,
path.n_gape,
path.a,
(jlong)path.k,
(jlong)path.l,
path.score,
best_path_count,
second_best_path_count);
if(java_path == NULL) return NULL;
env->SetObjectArrayElement(java_paths,path_idx,java_path);
if(env->ExceptionCheck()) return NULL;
env->DeleteLocalRef(java_path_class);
if(env->ExceptionCheck()) return NULL;
}
delete[] paths;
env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
return env->ExceptionCheck() ? NULL : java_paths;
}
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases, jobjectArray java_paths)
{
BWA* bwa = (BWA*)java_bwa;
const jsize read_length = env->GetArrayLength(java_bases);
if(env->ExceptionCheck()) return NULL;
jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
if(read_bases == NULL) return NULL;
const jsize num_paths = env->GetArrayLength(java_paths);
bwt_aln1_t* paths = new bwt_aln1_t[num_paths];
unsigned best_count = 0, second_best_count = 0;
for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
jobject java_path = env->GetObjectArrayElement(java_paths,path_idx);
jclass java_path_class = env->GetObjectClass(java_path);
if(java_path_class == NULL) return NULL;
bwt_aln1_t& path = *(paths + path_idx);
jfieldID mismatches_field = env->GetFieldID(java_path_class, "numMismatches", "I");
if(mismatches_field == NULL) return NULL;
path.n_mm = env->GetIntField(java_path,mismatches_field);
if(env->ExceptionCheck()) return NULL;
jfieldID gap_opens_field = env->GetFieldID(java_path_class, "numGapOpens", "I");
if(gap_opens_field == NULL) return NULL;
path.n_gapo = env->GetIntField(java_path,gap_opens_field);
if(env->ExceptionCheck()) return NULL;
jfieldID gap_extensions_field = env->GetFieldID(java_path_class, "numGapExtensions", "I");
if(gap_extensions_field == NULL) return NULL;
path.n_gape = env->GetIntField(java_path,gap_extensions_field);
if(env->ExceptionCheck()) return NULL;
jfieldID negative_strand_field = env->GetFieldID(java_path_class, "negativeStrand", "Z");
if(negative_strand_field == NULL) return NULL;
path.a = env->GetBooleanField(java_path,negative_strand_field);
if(env->ExceptionCheck()) return NULL;
jfieldID k_field = env->GetFieldID(java_path_class, "k", "J");
if(k_field == NULL) return NULL;
path.k = env->GetLongField(java_path,k_field);
if(env->ExceptionCheck()) return NULL;
jfieldID l_field = env->GetFieldID(java_path_class, "l", "J");
if(l_field == NULL) return NULL;
path.l = env->GetLongField(java_path,l_field);
if(env->ExceptionCheck()) return NULL;
jfieldID score_field = env->GetFieldID(java_path_class, "score", "I");
if(score_field == NULL) return NULL;
path.score = env->GetIntField(java_path,score_field);
if(env->ExceptionCheck()) return NULL;
jfieldID best_count_field = env->GetFieldID(java_path_class, "bestCount", "I");
if(best_count_field == NULL) return NULL;
best_count = env->GetIntField(java_path,best_count_field);
if(env->ExceptionCheck()) return NULL;
jfieldID second_best_count_field = env->GetFieldID(java_path_class, "secondBestCount", "I");
if(second_best_count_field == NULL) return NULL;
second_best_count = env->GetIntField(java_path,second_best_count_field);
if(env->ExceptionCheck()) return NULL;
}
Alignment* alignments = NULL;
unsigned num_alignments = 0;
bwa->generate_alignments_from_paths((const char*)read_bases,read_length,paths,num_paths,best_count,second_best_count,alignments,num_alignments);
jobjectArray java_alignments = env->NewObjectArray(num_alignments, env->FindClass("org/broadinstitute/sting/alignment/Alignment"), NULL);
if(java_alignments == NULL) return NULL;
for(unsigned alignment_idx = 0; alignment_idx < (unsigned)num_alignments; alignment_idx++) {
Alignment& alignment = *(alignments + alignment_idx);
jobject java_alignment = convert_to_java_alignment(env,read_bases,read_length,alignment);
if(java_alignment == NULL) return NULL;
env->SetObjectArrayElement(java_alignments,alignment_idx,java_alignment);
if(env->ExceptionCheck()) return NULL;
}
delete[] alignments;
delete[] paths;
env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
return env->ExceptionCheck() ? NULL : java_alignments;
}
JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases) {
BWA* bwa = (BWA*)java_bwa;
const jsize read_length = env->GetArrayLength(java_bases);
if(env->ExceptionCheck()) return NULL;
jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
if(read_bases == NULL) return NULL;
Alignment* best_alignment = bwa->generate_single_alignment((const char*)read_bases,read_length);
jobject java_best_alignment = (best_alignment != NULL) ? convert_to_java_alignment(env,read_bases,read_length,*best_alignment) : NULL;
delete best_alignment;
env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
return java_best_alignment;
}
static jobject convert_to_java_alignment(JNIEnv *env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment) {
unsigned cigar_length;
if(alignment.type == BWA_TYPE_NO_MATCH) cigar_length = 0;
else if(!alignment.cigar) cigar_length = 1;
else cigar_length = alignment.n_cigar;
jcharArray java_cigar_operators = env->NewCharArray(cigar_length);
if(java_cigar_operators == NULL) return NULL;
jintArray java_cigar_lengths = env->NewIntArray(cigar_length);
if(java_cigar_lengths == NULL) return NULL;
if(alignment.cigar) {
for(unsigned cigar_idx = 0; cigar_idx < (unsigned)alignment.n_cigar; ++cigar_idx) {
jchar cigar_operator = "MIDS"[alignment.cigar[cigar_idx]>>14];
jint cigar_length = alignment.cigar[cigar_idx]&0x3fff;
env->SetCharArrayRegion(java_cigar_operators,cigar_idx,1,&cigar_operator);
if(env->ExceptionCheck()) return NULL;
env->SetIntArrayRegion(java_cigar_lengths,cigar_idx,1,&cigar_length);
if(env->ExceptionCheck()) return NULL;
}
}
else {
if(alignment.type != BWA_TYPE_NO_MATCH) {
jchar cigar_operator = 'M';
env->SetCharArrayRegion(java_cigar_operators,0,1,&cigar_operator);
if(env->ExceptionCheck()) return NULL;
env->SetIntArrayRegion(java_cigar_lengths,0,1,&read_length);
if(env->ExceptionCheck()) return NULL;
}
}
delete[] alignment.cigar;
jclass java_alignment_class = env->FindClass("org/broadinstitute/sting/alignment/Alignment");
if(java_alignment_class == NULL) return NULL;
jmethodID java_alignment_constructor = env->GetMethodID(java_alignment_class, "<init>", "(IIZI[C[IILjava/lang/String;IIIII)V");
if(java_alignment_constructor == NULL) return NULL;
jstring java_md = env->NewStringUTF(alignment.md);
if(java_md == NULL) return NULL;
delete[] alignment.md;
jobject java_alignment = env->NewObject(java_alignment_class,
java_alignment_constructor,
alignment.contig,
alignment.pos,
alignment.negative_strand,
alignment.mapping_quality,
java_cigar_operators,
java_cigar_lengths,
alignment.edit_distance,
java_md,
alignment.num_mismatches,
alignment.num_gap_opens,
alignment.num_gap_extensions,
alignment.num_best,
alignment.num_second_best);
if(java_alignment == NULL) return NULL;
env->DeleteLocalRef(java_alignment_class);
if(env->ExceptionCheck()) return NULL;
return java_alignment;
}
static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name) {
jclass configuration_class = env->GetObjectClass(configuration);
if(configuration_class == NULL) return NULL;
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/io/File;");
if(configuration_field == NULL) return NULL;
jobject configuration_file = (jobject)env->GetObjectField(configuration,configuration_field);
jclass file_class = env->FindClass("java/io/File");
if(file_class == NULL) return NULL;
jmethodID path_extractor = env->GetMethodID(file_class,"getAbsolutePath", "()Ljava/lang/String;");
if(path_extractor == NULL) return NULL;
jstring path = (jstring)env->CallObjectMethod(configuration_file,path_extractor);
if(path == NULL) return NULL;
env->DeleteLocalRef(configuration_class);
env->DeleteLocalRef(file_class);
env->DeleteLocalRef(configuration_file);
return path;
}
static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) {
jclass configuration_class = env->GetObjectClass(configuration);
if(configuration_class == NULL) return;
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Integer;");
if(configuration_field == NULL) return;
jobject boxed_value = env->GetObjectField(configuration,configuration_field);
if(env->ExceptionCheck()) return;
if(boxed_value != NULL) {
jclass int_box_class = env->FindClass("java/lang/Integer");
if(int_box_class == NULL) return;
jmethodID int_extractor = env->GetMethodID(int_box_class,"intValue", "()I");
if(int_extractor == NULL) return;
jint value = env->CallIntMethod(boxed_value,int_extractor);
if(env->ExceptionCheck()) return;
if(value < 0)
{
throw_config_value_exception(env,field_name,"cannot be set to a negative value");
return;
}
(bwa->*setter)(value);
env->DeleteLocalRef(int_box_class);
}
env->DeleteLocalRef(boxed_value);
env->DeleteLocalRef(configuration_class);
}
static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter)
{
jclass configuration_class = env->GetObjectClass(configuration);
if(configuration_class == NULL) return;
jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Float;");
if(configuration_field == NULL) return;
jobject boxed_value = env->GetObjectField(configuration,configuration_field);
if(boxed_value != NULL) {
jclass float_box_class = env->FindClass("java/lang/Float");
if(float_box_class == NULL) return;
jmethodID float_extractor = env->GetMethodID(float_box_class,"floatValue", "()F");
if(float_extractor == NULL) return;
jfloat value = env->CallFloatMethod(boxed_value,float_extractor);
if(env->ExceptionCheck()) return;
if(value < 0)
{
throw_config_value_exception(env,field_name,"cannot be set to a negative value");
return;
}
(bwa->*setter)(value);
env->DeleteLocalRef(float_box_class);
}
env->DeleteLocalRef(boxed_value);
env->DeleteLocalRef(configuration_class);
}
static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message)
{
char* buffer = new char[strlen(field_name)+1+strlen(message)+1];
sprintf(buffer,"%s %s",field_name,message);
jclass sting_exception_class = env->FindClass("org/broadinstitute/sting/utils/StingException");
if(sting_exception_class == NULL) return;
env->ThrowNew(sting_exception_class, buffer);
delete[] buffer;
}

View File

@ -0,0 +1,61 @@
/* DO NOT EDIT THIS FILE - it is machine generated */
#include <jni.h>
/* Header for class org_broadinstitute_sting_alignment_bwa_c_BWACAligner */
#ifndef _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
#define _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
#ifdef __cplusplus
extern "C" {
#endif
/*
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
* Method: create
* Signature: (Lorg/broadinstitute/sting/alignment/bwa/BWTFiles;Lorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)J
*/
JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create
(JNIEnv *, jobject, jobject, jobject);
/*
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
* Method: updateConfiguration
* Signature: (JLorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)V
*/
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration
(JNIEnv *, jobject, jlong, jobject);
/*
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
* Method: destroy
* Signature: (J)V
*/
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy
(JNIEnv *, jobject, jlong);
/*
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
* Method: getPaths
* Signature: (J[B)[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;
*/
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths
(JNIEnv *, jobject, jlong, jbyteArray);
/*
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
* Method: convertPathsToAlignments
* Signature: (J[B[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;)[Lorg/broadinstitute/sting/alignment/Alignment;
*/
JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments
(JNIEnv *, jobject, jlong, jbyteArray, jobjectArray);
/*
* Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
* Method: getBestAlignment
* Signature: (J[B)Lorg/broadinstitute/sting/alignment/Alignment;
*/
JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment
(JNIEnv *, jobject, jlong, jbyteArray);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,10 @@
CC=gcc
CCFLAGS=-Wall -dynamiclib -arch i386 -arch x86_64
libenvironhack.dylib: libenvironhack.c
$(CC) $(CCFLAGS) -init _init_environ $< -o $@
all: libenvironhack.dylib
clean:
rm -f libenvironhack.dylib

View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
LSF 7.0.6 on the mac is missing the unsatisfied exported symbol for environ which was removed on MacOS X 10.5+.
nm $LSF_LIBDIR/liblsf.dylib | grep environ
See "man environ" for more info, along with http://lists.apple.com/archives/java-dev/2007/Dec/msg00096.html
*/
#include <crt_externs.h>
char **environ = (char **)0;
void init_environ(void) {
environ = (*_NSGetEnviron());
}

Binary file not shown.

View File

@ -0,0 +1,49 @@
package org.broadinstitute.sting.alignment;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
/**
* Create perfect alignments from the read to the genome represented by the given BWT / suffix array.
*
* @author mhanna
* @version 0.1
*/
public interface Aligner {
/**
* Close this instance of the BWA pointer and delete its resources.
*/
public void close();
/**
* Allow the aligner to choose one alignment randomly from the pile of best alignments.
* @param bases Bases to align.
* @return An align
*/
public Alignment getBestAlignment(final byte[] bases);
/**
* Align the read to the reference.
* @param read Read to align.
* @param header Optional header to drop in place.
* @return A list of the alignments.
*/
public SAMRecord align(final SAMRecord read, final SAMFileHeader header);
/**
* Get a iterator of alignments, batched by mapping quality.
* @param bases List of bases.
* @return Iterator to alignments.
*/
public Iterable<Alignment[]> getAllAlignments(final byte[] bases);
/**
* Get a iterator of aligned reads, batched by mapping quality.
* @param read Read to align.
* @param newHeader Optional new header to use when aligning the read. If present, it must be null.
* @return Iterator to alignments.
*/
public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader);
}

View File

@ -0,0 +1,221 @@
package org.broadinstitute.sting.alignment;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* Represents an alignment of a read to a site in the reference genome.
*
* @author mhanna
* @version 0.1
*/
public class Alignment {
protected int contigIndex;
protected long alignmentStart;
protected boolean negativeStrand;
protected int mappingQuality;
protected char[] cigarOperators;
protected int[] cigarLengths;
protected int editDistance;
protected String mismatchingPositions;
protected int numMismatches;
protected int numGapOpens;
protected int numGapExtensions;
protected int bestCount;
protected int secondBestCount;
/**
* Gets the index of the given contig.
* @return the inde
*/
public int getContigIndex() { return contigIndex; }
/**
* Gets the starting position for the given alignment.
* @return Starting position.
*/
public long getAlignmentStart() { return alignmentStart; }
/**
* Is the given alignment on the reverse strand?
* @return True if the alignment is on the reverse strand.
*/
public boolean isNegativeStrand() { return negativeStrand; }
/**
* Gets the score of this alignment.
* @return The score.
*/
public int getMappingQuality() { return mappingQuality; }
/**
* Gets the edit distance; will eventually end up in the NM SAM tag
* if this alignment makes it that far.
* @return The edit distance.
*/
public int getEditDistance() { return editDistance; }
/**
* A string representation of which positions mismatch; contents of MD tag.
* @return String representation of mismatching positions.
*/
public String getMismatchingPositions() { return mismatchingPositions; }
/**
* Gets the number of mismatches in the read.
* @return Number of mismatches.
*/
public int getNumMismatches() { return numMismatches; }
/**
* Get the number of gap opens.
* @return Number of gap opens.
*/
public int getNumGapOpens() { return numGapOpens; }
/**
* Get the number of gap extensions.
* @return Number of gap extensions.
*/
public int getNumGapExtensions() { return numGapExtensions; }
/**
* Get the number of best alignments.
* @return Number of top scoring alignments.
*/
public int getBestCount() { return bestCount; }
/**
* Get the number of second best alignments.
* @return Number of second best scoring alignments.
*/
public int getSecondBestCount() { return secondBestCount; }
/**
* Gets the cigar for this alignment.
* @return sam-jdk formatted alignment.
*/
public Cigar getCigar() {
Cigar cigar = new Cigar();
for(int i = 0; i < cigarOperators.length; i++) {
CigarOperator operator = CigarOperator.characterToEnum(cigarOperators[i]);
cigar.add(new CigarElement(cigarLengths[i],operator));
}
return cigar;
}
/**
* Temporarily implement getCigarString() for debugging; the TextCigarCodec is unfortunately
* package-protected.
* @return
*/
public String getCigarString() {
Cigar cigar = getCigar();
if(cigar.isEmpty()) return "*";
StringBuilder cigarString = new StringBuilder();
for(CigarElement element: cigar.getCigarElements()) {
cigarString.append(element.getLength());
cigarString.append(element.getOperator());
}
return cigarString.toString();
}
/**
* Stub for inheritance.
*/
public Alignment() {}
/**
* Create a new alignment object.
* @param contigIndex The contig to which this read aligned.
* @param alignmentStart The point within the contig to which this read aligned.
* @param negativeStrand Forward or reverse alignment of the given read.
* @param mappingQuality How good does BWA think this mapping is?
* @param cigarOperators The ordered operators in the cigar string.
* @param cigarLengths The lengths to which each operator applies.
* @param editDistance The edit distance (cumulative) of the read.
* @param mismatchingPositions String representation of which bases in the read mismatch.
* @param numMismatches Number of total mismatches in the read.
* @param numGapOpens Number of gap opens in the read.
* @param numGapExtensions Number of gap extensions in the read.
* @param bestCount Number of best alignments in the read.
* @param secondBestCount Number of second best alignments in the read.
*/
public Alignment(int contigIndex,
int alignmentStart,
boolean negativeStrand,
int mappingQuality,
char[] cigarOperators,
int[] cigarLengths,
int editDistance,
String mismatchingPositions,
int numMismatches,
int numGapOpens,
int numGapExtensions,
int bestCount,
int secondBestCount) {
this.contigIndex = contigIndex;
this.alignmentStart = alignmentStart;
this.negativeStrand = negativeStrand;
this.mappingQuality = mappingQuality;
this.cigarOperators = cigarOperators;
this.cigarLengths = cigarLengths;
this.editDistance = editDistance;
this.mismatchingPositions = mismatchingPositions;
this.numMismatches = numMismatches;
this.numGapOpens = numGapOpens;
this.numGapExtensions = numGapExtensions;
this.bestCount = bestCount;
this.secondBestCount = secondBestCount;
}
/**
* Creates a read directly from an alignment.
* @param alignment The alignment to convert to a read.
* @param unmappedRead Source of the unmapped read. Should have bases, quality scores, and flags.
* @param newSAMHeader The new SAM header to use in creating this read. Can be null, but if so, the sequence
* dictionary in the
* @return A mapped alignment.
*/
public static SAMRecord convertToRead(Alignment alignment, SAMRecord unmappedRead, SAMFileHeader newSAMHeader) {
SAMRecord read;
try {
read = (SAMRecord)unmappedRead.clone();
}
catch(CloneNotSupportedException ex) {
throw new ReviewedStingException("Unable to create aligned read from template.");
}
if(newSAMHeader != null)
read.setHeader(newSAMHeader);
// If we're realigning a previously aligned record, strip out the placement of the alignment.
read.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
read.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
read.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
read.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
if(alignment != null) {
read.setReadUnmappedFlag(false);
read.setReferenceIndex(alignment.getContigIndex());
read.setAlignmentStart((int)alignment.getAlignmentStart());
read.setReadNegativeStrandFlag(alignment.isNegativeStrand());
read.setMappingQuality(alignment.getMappingQuality());
read.setCigar(alignment.getCigar());
if(alignment.isNegativeStrand()) {
read.setReadBases(BaseUtils.simpleReverseComplement(read.getReadBases()));
read.setBaseQualities(Utils.reverse(read.getBaseQualities()));
}
read.setAttribute("NM",alignment.getEditDistance());
read.setAttribute("MD",alignment.getMismatchingPositions());
}
return read;
}
}

View File

@ -0,0 +1,157 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Iterator;
/**
* Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them
* of their alignment data, realigning them, and making sure one of the best resulting realignments matches the original
* alignment from the input file.
*
* @author mhanna
* @version 0.1
*/
public class AlignmentValidationWalker extends ReadWalker<Integer,Integer> {
/**
* The supporting BWT index generated using BWT.
*/
@Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
private String prefix = null;
/**
* The instance used to generate alignments.
*/
private BWACAligner aligner = null;
/**
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
*/
@Override
public void initialize() {
if(prefix == null)
prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
BWTFiles bwtFiles = new BWTFiles(prefix);
BWAConfiguration configuration = new BWAConfiguration();
aligner = new BWACAligner(bwtFiles,configuration);
}
/**
* Aligns a read to the given reference.
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
* @param read Read to align.
* @return Number of reads aligned by this map (aka 1).
*/
@Override
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
//logger.info(String.format("examining read %s", read.getReadName()));
byte[] bases = read.getReadBases();
if(read.getReadNegativeStrandFlag()) bases = BaseUtils.simpleReverseComplement(bases);
boolean matches = true;
Iterable<Alignment[]> alignments = aligner.getAllAlignments(bases);
Iterator<Alignment[]> alignmentIterator = alignments.iterator();
if(!alignmentIterator.hasNext()) {
matches = read.getReadUnmappedFlag();
}
else {
Alignment[] alignmentsOfBestQuality = alignmentIterator.next();
for(Alignment alignment: alignmentsOfBestQuality) {
matches = (alignment.getContigIndex() == read.getReferenceIndex());
matches &= (alignment.getAlignmentStart() == read.getAlignmentStart());
matches &= (alignment.isNegativeStrand() == read.getReadNegativeStrandFlag());
matches &= (alignment.getCigar().equals(read.getCigar()));
matches &= (alignment.getMappingQuality() == read.getMappingQuality());
if(matches) break;
}
}
if(!matches) {
logger.error("Found mismatch!");
logger.error(String.format("Read %s:",read.getReadName()));
logger.error(String.format(" Contig index: %d",read.getReferenceIndex()));
logger.error(String.format(" Alignment start: %d", read.getAlignmentStart()));
logger.error(String.format(" Negative strand: %b", read.getReadNegativeStrandFlag()));
logger.error(String.format(" Cigar: %s%n", read.getCigarString()));
logger.error(String.format(" Mapping quality: %s%n", read.getMappingQuality()));
for(Alignment[] alignmentsByScore: alignments) {
for(int i = 0; i < alignmentsByScore.length; i++) {
logger.error(String.format("Alignment %d:",i));
logger.error(String.format(" Contig index: %d",alignmentsByScore[i].getContigIndex()));
logger.error(String.format(" Alignment start: %d", alignmentsByScore[i].getAlignmentStart()));
logger.error(String.format(" Negative strand: %b", alignmentsByScore[i].isNegativeStrand()));
logger.error(String.format(" Cigar: %s", alignmentsByScore[i].getCigarString()));
logger.error(String.format(" Mapping quality: %s%n", alignmentsByScore[i].getMappingQuality()));
}
}
throw new ReviewedStingException(String.format("Read %s mismatches!", read.getReadName()));
}
return 1;
}
/**
* Initial value for reduce. In this case, validated reads will be counted.
* @return 0, indicating no reads yet validated.
*/
@Override
public Integer reduceInit() { return 0; }
/**
* Calculates the number of reads processed.
* @param value Number of reads processed by this map.
* @param sum Number of reads processed before this map.
* @return Number of reads processed up to and including this map.
*/
@Override
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
/**
* Cleanup.
* @param result Number of reads processed.
*/
@Override
public void onTraversalDone(Integer result) {
aligner.close();
super.onTraversalDone(result);
}
}

View File

@ -0,0 +1,134 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import java.io.File;
/**
* Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format.
* Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation.
*
* @author mhanna
* @version 0.1
*/
@WalkerName("Align")
public class AlignmentWalker extends ReadWalker<Integer,Integer> {
@Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
"generated by bwa index -d bwtsw. If unspecified, will default " +
"to the reference specified via the -R argument.",required=false)
private File targetReferenceFile = null;
@Output
private StingSAMFileWriter out = null;
/**
* The actual aligner.
*/
private BWACAligner aligner = null;
/**
* New header to use, if desired.
*/
private SAMFileHeader header;
/**
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
*/
@Override
public void initialize() {
if(targetReferenceFile == null)
targetReferenceFile = getToolkit().getArguments().referenceFile;
BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
BWAConfiguration configuration = new BWAConfiguration();
aligner = new BWACAligner(bwtFiles,configuration);
// Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted.
header = getToolkit().getSAMFileHeader().clone();
SAMSequenceDictionary referenceDictionary =
ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
header.setSequenceDictionary(referenceDictionary);
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
out.writeHeader(header);
}
/**
* Aligns a read to the given reference.
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
* @param read Read to align.
* @return Number of alignments found for this read.
*/
@Override
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
SAMRecord alignedRead = aligner.align(read,header);
out.addAlignment(alignedRead);
return 1;
}
/**
* Initial value for reduce. In this case, alignments will be counted.
* @return 0, indicating no alignments yet found.
*/
@Override
public Integer reduceInit() { return 0; }
/**
* Calculates the number of alignments found.
* @param value Number of alignments found by this map.
* @param sum Number of alignments found before this map.
* @return Number of alignments found up to and including this map.
*/
@Override
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
/**
* Cleanup.
* @param result Number of reads processed.
*/
@Override
public void onTraversalDone(Integer result) {
aligner.close();
super.onTraversalDone(result);
}
}

View File

@ -0,0 +1,128 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
/**
* Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the
* frequency of that number of placements.
*
* @author mhanna
* @version 0.1
*/
public class CountBestAlignmentsWalker extends ReadWalker<Integer,Integer> {
/**
* The supporting BWT index generated using BWT.
*/
@Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
private String prefix = null;
@Output
private PrintStream out = null;
/**
* The actual aligner.
*/
private Aligner aligner = null;
private SortedMap<Integer,Integer> alignmentFrequencies = new TreeMap<Integer,Integer>();
/**
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
*/
@Override
public void initialize() {
if(prefix == null)
prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
BWTFiles bwtFiles = new BWTFiles(prefix);
BWAConfiguration configuration = new BWAConfiguration();
aligner = new BWACAligner(bwtFiles,configuration);
}
/**
* Aligns a read to the given reference.
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
* @param read Read to align.
* @return Number of alignments found for this read.
*/
@Override
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
if(alignmentIterator.hasNext()) {
int numAlignments = alignmentIterator.next().length;
if(alignmentFrequencies.containsKey(numAlignments))
alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
else
alignmentFrequencies.put(numAlignments,1);
}
return 1;
}
/**
* Initial value for reduce. In this case, validated reads will be counted.
* @return 0, indicating no reads yet validated.
*/
@Override
public Integer reduceInit() { return 0; }
/**
* Calculates the number of reads processed.
* @param value Number of reads processed by this map.
* @param sum Number of reads processed before this map.
* @return Number of reads processed up to and including this map.
*/
@Override
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
/**
* Cleanup.
* @param result Number of reads processed.
*/
@Override
public void onTraversalDone(Integer result) {
aligner.close();
for(Map.Entry<Integer,Integer> alignmentFrequency: alignmentFrequencies.entrySet())
out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
super.onTraversalDone(result);
}
}

View File

@ -0,0 +1,38 @@
package org.broadinstitute.sting.alignment.bwa;
import org.broadinstitute.sting.alignment.Aligner;
/**
* Align reads using BWA.
*
* @author mhanna
* @version 0.1
*/
public abstract class BWAAligner implements Aligner {
/**
* The supporting files used by BWA.
*/
protected BWTFiles bwtFiles;
/**
* The current configuration for the BWA aligner.
*/
protected BWAConfiguration configuration;
/**
* Create a new BWAAligner. Purpose of this call is to ensure that all BWA constructors accept the correct
* parameters.
* @param bwtFiles The many files representing BWTs persisted to disk.
* @param configuration Configuration parameters for the alignment.
*/
public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
this.bwtFiles = bwtFiles;
this.configuration = configuration;
}
/**
* Update the configuration passed to the BWA aligner.
* @param configuration New configuration to set.
*/
public abstract void updateConfiguration(BWAConfiguration configuration);
}

View File

@ -0,0 +1,44 @@
package org.broadinstitute.sting.alignment.bwa;
/**
* Configuration for the BWA/C aligner.
*
* @author mhanna
* @version 0.1
*/
public class BWAConfiguration {
/**
* The maximum edit distance used by BWA.
*/
public Float maximumEditDistance = null;
/**
* How many gap opens are acceptable within this alignment?
*/
public Integer maximumGapOpens = null;
/**
* How many gap extensions are acceptable within this alignment?
*/
public Integer maximumGapExtensions = null;
/**
* Do we disallow indels within a certain range from the start / end?
*/
public Integer disallowIndelWithinRange = null;
/**
* What is the scoring penalty for a mismatch?
*/
public Integer mismatchPenalty = null;
/**
* What is the scoring penalty for a gap open?
*/
public Integer gapOpenPenalty = null;
/**
* What is the scoring penalty for a gap extension?
*/
public Integer gapExtensionPenalty = null;
}

View File

@ -0,0 +1,234 @@
package org.broadinstitute.sting.alignment.bwa;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.alignment.reference.bwt.*;
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.IOException;
/**
* Support files for BWT.
*
* @author mhanna
* @version 0.1
*/
public class BWTFiles {
/**
* ANN (?) file name.
*/
public final File annFile;
/**
* AMB (?) file name.
*/
public final File ambFile;
/**
* Packed reference sequence file.
*/
public final File pacFile;
/**
* Reverse of packed reference sequence file.
*/
public final File rpacFile;
/**
* Forward BWT file.
*/
public final File forwardBWTFile;
/**
* Forward suffix array file.
*/
public final File forwardSAFile;
/**
* Reverse BWT file.
*/
public final File reverseBWTFile;
/**
* Reverse suffix array file.
*/
public final File reverseSAFile;
/**
* Where these files autogenerated on the fly?
*/
public final boolean autogenerated;
/**
* Create a new BWA configuration file using the given prefix.
* @param prefix Prefix to use when creating the configuration. Must not be null.
*/
public BWTFiles(String prefix) {
if(prefix == null)
throw new ReviewedStingException("Prefix must not be null.");
annFile = new File(prefix + ".ann");
ambFile = new File(prefix + ".amb");
pacFile = new File(prefix + ".pac");
rpacFile = new File(prefix + ".rpac");
forwardBWTFile = new File(prefix + ".bwt");
forwardSAFile = new File(prefix + ".sa");
reverseBWTFile = new File(prefix + ".rbwt");
reverseSAFile = new File(prefix + ".rsa");
autogenerated = false;
}
/**
* Hand-create a new BWTFiles object, specifying a unique file object for each type.
* @param annFile ANN (alternate dictionary) file.
* @param ambFile AMB (holes) files.
* @param pacFile Packed representation of the forward reference sequence.
* @param forwardBWTFile BWT representation of the forward reference sequence.
* @param forwardSAFile SA representation of the forward reference sequence.
* @param rpacFile Packed representation of the reversed reference sequence.
* @param reverseBWTFile BWT representation of the reversed reference sequence.
* @param reverseSAFile SA representation of the reversed reference sequence.
*/
private BWTFiles(File annFile,
File ambFile,
File pacFile,
File forwardBWTFile,
File forwardSAFile,
File rpacFile,
File reverseBWTFile,
File reverseSAFile) {
this.annFile = annFile;
this.ambFile = ambFile;
this.pacFile = pacFile;
this.forwardBWTFile = forwardBWTFile;
this.forwardSAFile = forwardSAFile;
this.rpacFile = rpacFile;
this.reverseBWTFile = reverseBWTFile;
this.reverseSAFile = reverseSAFile;
autogenerated = true;
}
/**
* Close out this files object, in the process deleting any temporary filse
* that were created.
*/
public void close() {
if(autogenerated) {
boolean success = true;
success = annFile.delete();
success &= ambFile.delete();
success &= pacFile.delete();
success &= forwardBWTFile.delete();
success &= forwardSAFile.delete();
success &= rpacFile.delete();
success &= reverseBWTFile.delete();
success &= reverseSAFile.delete();
if(!success)
throw new ReviewedStingException("Unable to clean up autogenerated representation");
}
}
/**
* Create a new set of BWT files from the given reference sequence.
* @param referenceSequence Sequence from which to build metadata.
* @return A new object representing encoded representations of each sequence.
*/
public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) {
byte[] normalizedReferenceSequence = new byte[referenceSequence.length];
System.arraycopy(referenceSequence,0,normalizedReferenceSequence,0,referenceSequence.length);
normalizeReferenceSequence(normalizedReferenceSequence);
File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile;
try {
// Write the ann and amb for this reference sequence.
annFile = File.createTempFile("bwt",".ann");
ambFile = File.createTempFile("bwt",".amb");
SAMSequenceDictionary dictionary = new SAMSequenceDictionary();
dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length));
ANNWriter annWriter = new ANNWriter(annFile);
annWriter.write(dictionary);
annWriter.close();
AMBWriter ambWriter = new AMBWriter(ambFile);
ambWriter.writeEmpty(dictionary);
ambWriter.close();
// Write the encoded files for the forward version of this reference sequence.
pacFile = File.createTempFile("bwt",".pac");
bwtFile = File.createTempFile("bwt",".bwt");
saFile = File.createTempFile("bwt",".sa");
writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile);
// Write the encoded files for the reverse version of this reference sequence.
byte[] reverseReferenceSequence = Utils.reverse(normalizedReferenceSequence);
rpacFile = File.createTempFile("bwt",".rpac");
rbwtFile = File.createTempFile("bwt",".rbwt");
rsaFile = File.createTempFile("bwt",".rsa");
writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile);
}
catch(IOException ex) {
throw new ReviewedStingException("Unable to write autogenerated reference sequence to temporary files");
}
// Make sure that, at the very least, all temporary files are deleted on exit.
annFile.deleteOnExit();
ambFile.deleteOnExit();
pacFile.deleteOnExit();
bwtFile.deleteOnExit();
saFile.deleteOnExit();
rpacFile.deleteOnExit();
rbwtFile.deleteOnExit();
rsaFile.deleteOnExit();
return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile);
}
/**
* Write the encoded form of the reference sequence. In the case of BWA, the encoded reference
* sequence is the reference itself in PAC format, the BWT, and the suffix array.
* @param referenceSequence The reference sequence to encode.
* @param pacFile Target for the PAC-encoded reference.
* @param bwtFile Target for the BWT representation of the reference.
* @param suffixArrayFile Target for the suffix array encoding of the reference.
* @throws java.io.IOException In case of issues writing to the file.
*/
private static void writeEncodedReferenceSequence(byte[] referenceSequence,
File pacFile,
File bwtFile,
File suffixArrayFile) throws IOException {
PackUtils.writeReferenceSequence(pacFile,referenceSequence);
BWT bwt = BWT.createFromReferenceSequence(referenceSequence);
BWTWriter bwtWriter = new BWTWriter(bwtFile);
bwtWriter.write(bwt);
bwtWriter.close();
SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile);
suffixArrayWriter.write(suffixArray);
suffixArrayWriter.close();
}
/**
* Convert the given reference sequence into a form suitable for building into
* on-the-fly sequences.
* @param referenceSequence The reference sequence to normalize.
* @throws org.broadinstitute.sting.utils.exceptions.ReviewedStingException if normalized sequence cannot be generated.
*/
private static void normalizeReferenceSequence(byte[] referenceSequence) {
StringUtil.toUpperCase(referenceSequence);
for(byte base: referenceSequence) {
if(base != 'A' && base != 'C' && base != 'G' && base != 'T')
throw new ReviewedStingException(String.format("Base type %c is not supported when building references on-the-fly",(char)base));
}
}
}

View File

@ -0,0 +1,259 @@
package org.broadinstitute.sting.alignment.bwa.c;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.alignment.Alignment;
import org.broadinstitute.sting.alignment.bwa.BWAAligner;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
import java.util.Iterator;
/**
* An aligner using the BWA/C implementation.
*
* @author mhanna
* @version 0.1
*/
public class BWACAligner extends BWAAligner {
static {
System.loadLibrary("bwa");
}
/**
* A pointer to the C++ object representing the BWA engine.
*/
private long thunkPointer = 0;
public BWACAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
super(bwtFiles,configuration);
if(thunkPointer != 0)
throw new ReviewedStingException("BWA/C attempting to reinitialize.");
if(!bwtFiles.annFile.exists()) throw new ReviewedStingException("ANN file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.ambFile.exists()) throw new ReviewedStingException("AMB file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.pacFile.exists()) throw new ReviewedStingException("PAC file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.forwardBWTFile.exists()) throw new ReviewedStingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.forwardSAFile.exists()) throw new ReviewedStingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.reverseBWTFile.exists()) throw new ReviewedStingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it.");
if(!bwtFiles.reverseSAFile.exists()) throw new ReviewedStingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it.");
thunkPointer = create(bwtFiles,configuration);
}
/**
* Create an aligner object using an array of bytes as a reference.
* @param referenceSequence Reference sequence to encode ad-hoc.
* @param configuration Configuration for the given aligner.
*/
public BWACAligner(byte[] referenceSequence, BWAConfiguration configuration) {
this(BWTFiles.createFromReferenceSequence(referenceSequence),configuration);
// Now that the temporary files are created, the temporary files can be destroyed.
bwtFiles.close();
}
/**
* Update the configuration passed to the BWA aligner.
* @param configuration New configuration to set.
*/
@Override
public void updateConfiguration(BWAConfiguration configuration) {
if(thunkPointer == 0)
throw new ReviewedStingException("BWA/C: attempting to update configuration of uninitialized aligner.");
updateConfiguration(thunkPointer,configuration);
}
/**
* Close this instance of the BWA pointer and delete its resources.
*/
@Override
public void close() {
if(thunkPointer == 0)
throw new ReviewedStingException("BWA/C close attempted, but BWA/C is not properly initialized.");
destroy(thunkPointer);
}
/**
* Allow the aligner to choose one alignment randomly from the pile of best alignments.
* @param bases Bases to align.
* @return An align
*/
@Override
public Alignment getBestAlignment(final byte[] bases) {
if(thunkPointer == 0)
throw new ReviewedStingException("BWA/C getBestAlignment attempted, but BWA/C is not properly initialized.");
return getBestAlignment(thunkPointer,bases);
}
/**
* Get the best aligned read, chosen randomly from the pile of best alignments.
* @param read Read to align.
* @param newHeader New header to apply to this SAM file. Can be null, but if so, read header must be valid.
* @return Read with injected alignment data.
*/
@Override
public SAMRecord align(final SAMRecord read, final SAMFileHeader newHeader) {
if(bwtFiles.autogenerated)
throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable");
return Alignment.convertToRead(getBestAlignment(read.getReadBases()),read,newHeader);
}
/**
* Get a iterator of alignments, batched by mapping quality.
* @param bases List of bases.
* @return Iterator to alignments.
*/
@Override
public Iterable<Alignment[]> getAllAlignments(final byte[] bases) {
final BWAPath[] paths = getPaths(bases);
return new Iterable<Alignment[]>() {
public Iterator<Alignment[]> iterator() {
return new Iterator<Alignment[]>() {
/**
* The last position accessed.
*/
private int position = 0;
/**
* Whether all alignments have been seen based on the current position.
* @return True if any more alignments are pending. False otherwise.
*/
public boolean hasNext() { return position < paths.length; }
/**
* Return the next cross-section of alignments, based on mapping quality.
* @return Array of the next set of alignments of a given mapping quality.
*/
public Alignment[] next() {
if(position >= paths.length)
throw new UnsupportedOperationException("Out of alignments to return.");
int score = paths[position].score;
int startingPosition = position;
while(position < paths.length && paths[position].score == score) position++;
return convertPathsToAlignments(bases,Arrays.copyOfRange(paths,startingPosition,position));
}
/**
* Unsupported.
*/
public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); }
};
}
};
}
/**
* Get a iterator of aligned reads, batched by mapping quality.
* @param read Read to align.
* @param newHeader Optional new header to use when aligning the read. If present, it must be null.
* @return Iterator to alignments.
*/
@Override
public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader) {
if(bwtFiles.autogenerated)
throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable");
final Iterable<Alignment[]> alignments = getAllAlignments(read.getReadBases());
return new Iterable<SAMRecord[]>() {
public Iterator<SAMRecord[]> iterator() {
final Iterator<Alignment[]> alignmentIterator = alignments.iterator();
return new Iterator<SAMRecord[]>() {
/**
* Whether all alignments have been seen based on the current position.
* @return True if any more alignments are pending. False otherwise.
*/
public boolean hasNext() { return alignmentIterator.hasNext(); }
/**
* Return the next cross-section of alignments, based on mapping quality.
* @return Array of the next set of alignments of a given mapping quality.
*/
public SAMRecord[] next() {
Alignment[] alignmentsOfQuality = alignmentIterator.next();
SAMRecord[] reads = new SAMRecord[alignmentsOfQuality.length];
for(int i = 0; i < alignmentsOfQuality.length; i++) {
reads[i] = Alignment.convertToRead(alignmentsOfQuality[i],read,newHeader);
}
return reads;
}
/**
* Unsupported.
*/
public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); }
};
}
};
}
/**
* Get the paths associated with the given base string.
* @param bases List of bases.
* @return A set of paths through the BWA.
*/
public BWAPath[] getPaths(byte[] bases) {
if(thunkPointer == 0)
throw new ReviewedStingException("BWA/C getPaths attempted, but BWA/C is not properly initialized.");
return getPaths(thunkPointer,bases);
}
/**
* Create a pointer to the BWA/C thunk.
* @param files BWT source files.
* @param configuration Configuration of the aligner.
* @return Pointer to the BWA/C thunk.
*/
protected native long create(BWTFiles files, BWAConfiguration configuration);
/**
* Update the configuration passed to the BWA aligner. For internal use only.
* @param thunkPointer pointer to BWA object.
* @param configuration New configuration to set.
*/
protected native void updateConfiguration(long thunkPointer, BWAConfiguration configuration);
/**
* Destroy the BWA/C thunk.
* @param thunkPointer Pointer to the allocated thunk.
*/
protected native void destroy(long thunkPointer);
/**
* Do the extra steps involved in converting a local alignment to a global alignment.
* @param bases ASCII representation of byte array.
* @param paths Paths through the current BWT.
* @return A list of alignments.
*/
protected Alignment[] convertPathsToAlignments(byte[] bases, BWAPath[] paths) {
if(thunkPointer == 0)
throw new ReviewedStingException("BWA/C convertPathsToAlignments attempted, but BWA/C is not properly initialized.");
return convertPathsToAlignments(thunkPointer,bases,paths);
}
/**
* Caller to the path generation functionality within BWA/C. Call this method's getPaths() wrapper (above) instead.
* @param thunkPointer pointer to the C++ object managing BWA/C.
* @param bases ASCII representation of byte array.
* @return A list of paths through the specified BWT.
*/
protected native BWAPath[] getPaths(long thunkPointer, byte[] bases);
/**
* Do the extra steps involved in converting a local alignment to a global alignment.
* Call this method's convertPathsToAlignments() wrapper (above) instead.
* @param thunkPointer pointer to the C++ object managing BWA/C.
* @param bases ASCII representation of byte array.
* @param paths Paths through the current BWT.
* @return A list of alignments.
*/
protected native Alignment[] convertPathsToAlignments(long thunkPointer, byte[] bases, BWAPath[] paths);
/**
* Gets the best alignment from BWA/C, randomly selected from all best-aligned reads.
* @param thunkPointer Pointer to BWA thunk.
* @param bases bases to align.
* @return The best alignment from BWA/C.
*/
protected native Alignment getBestAlignment(long thunkPointer, byte[] bases);
}

View File

@ -0,0 +1,101 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment.bwa.c;
/**
* Models a BWA path.
*
* @author mhanna
* @version 0.1
*/
public class BWAPath {
/**
* Number of mismatches encountered along this path.
*/
public final int numMismatches;
/**
* Number of gap opens encountered along this path.
*/
public final int numGapOpens;
/**
* Number of gap extensions along this path.
*/
public final int numGapExtensions;
/**
* Whether this alignment was found on the positive or negative strand.
*/
public final boolean negativeStrand;
/**
* Starting coordinate in the BWT.
*/
public final long k;
/**
* Ending coordinate in the BWT.
*/
public final long l;
/**
* The score of this path.
*/
public final int score;
/**
* The number of best alignments seen along this path.
*/
public final int bestCount;
/**
* The number of second best alignments seen along this path.
*/
public final int secondBestCount;
/**
* Create a new path with the given attributes.
* @param numMismatches Number of mismatches along path.
* @param numGapOpens Number of gap opens along path.
* @param numGapExtensions Number of gap extensions along path.
* @param k Index to first coordinate within BWT.
* @param l Index to last coordinate within BWT.
* @param score Score of this alignment. Not the mapping quality.
*/
public BWAPath(int numMismatches, int numGapOpens, int numGapExtensions, boolean negativeStrand, long k, long l, int score, int bestCount, int secondBestCount) {
this.numMismatches = numMismatches;
this.numGapOpens = numGapOpens;
this.numGapExtensions = numGapExtensions;
this.negativeStrand = negativeStrand;
this.k = k;
this.l = l;
this.score = score;
this.bestCount = bestCount;
this.secondBestCount = secondBestCount;
}
}

View File

@ -0,0 +1,164 @@
package org.broadinstitute.sting.alignment.bwa.java;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.*;
import org.broadinstitute.sting.alignment.Aligner;
import org.broadinstitute.sting.alignment.Alignment;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.FileNotFoundException;
/**
* A test harness to ensure that the perfect aligner works.
*
* @author mhanna
* @version 0.1
*/
public class AlignerTestHarness {
public static void main( String argv[] ) throws FileNotFoundException {
if( argv.length != 6 ) {
System.out.println("PerfectAlignerTestHarness <fasta> <bwt> <rbwt> <sa> <rsa> <bam>");
System.exit(1);
}
File referenceFile = new File(argv[0]);
File bwtFile = new File(argv[1]);
File rbwtFile = new File(argv[2]);
File suffixArrayFile = new File(argv[3]);
File reverseSuffixArrayFile = new File(argv[4]);
File bamFile = new File(argv[5]);
align(referenceFile,bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile,bamFile);
}
private static void align(File referenceFile, File bwtFile, File rbwtFile, File suffixArrayFile, File reverseSuffixArrayFile, File bamFile) throws FileNotFoundException {
Aligner aligner = new BWAJavaAligner(bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile);
int count = 0;
SAMFileReader reader = new SAMFileReader(bamFile);
reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
int mismatches = 0;
int failures = 0;
for(SAMRecord read: reader) {
count++;
if( count > 200000 ) break;
//if( count < 366000 ) continue;
//if( count > 2 ) break;
//if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") )
// continue;
//if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") )
// continue;
//if( !read.getReadName().endsWith("SL-XBC:1:60:1342:1340#0") )
// continue;
SAMRecord alignmentCleaned = null;
try {
alignmentCleaned = (SAMRecord)read.clone();
}
catch( CloneNotSupportedException ex ) {
throw new ReviewedStingException("SAMRecord clone not supported", ex);
}
if( alignmentCleaned.getReadNegativeStrandFlag() )
alignmentCleaned.setReadBases(BaseUtils.simpleReverseComplement(alignmentCleaned.getReadBases()));
alignmentCleaned.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
alignmentCleaned.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
alignmentCleaned.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
alignmentCleaned.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
// Clear everything except flags pertaining to pairing and set 'unmapped' status to true.
alignmentCleaned.setFlags(alignmentCleaned.getFlags() & 0x00A1 | 0x000C);
Iterable<Alignment[]> alignments = aligner.getAllAlignments(alignmentCleaned.getReadBases());
if(!alignments.iterator().hasNext() ) {
//throw new StingException(String.format("Unable to align read %s to reference; count = %d",read.getReadName(),count));
System.out.printf("Unable to align read %s to reference; count = %d%n",read.getReadName(),count);
failures++;
}
Alignment foundAlignment = null;
for(Alignment[] alignmentsOfQuality: alignments) {
for(Alignment alignment: alignmentsOfQuality) {
if( read.getReadNegativeStrandFlag() != alignment.isNegativeStrand() )
continue;
if( read.getAlignmentStart() != alignment.getAlignmentStart() )
continue;
foundAlignment = alignment;
}
}
if( foundAlignment != null ) {
//System.out.printf("%s: Aligned read to reference at position %d with %d mismatches, %d gap opens, and %d gap extensions.%n", read.getReadName(), foundAlignment.getAlignmentStart(), foundAlignment.getMismatches(), foundAlignment.getGapOpens(), foundAlignment.getGapExtensions());
}
else {
System.out.printf("Error aligning read %s%n", read.getReadName());
mismatches++;
IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
System.out.printf("read = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(read.getReadString(),read.getCigar(),CigarOperator.DELETION),
read.getAlignmentStart(),
read.getReadNegativeStrandFlag());
int numDeletions = numDeletionsInCigar(read.getCigar());
String expectedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),read.getAlignmentStart(),read.getAlignmentStart()+read.getReadLength()+numDeletions-1).getBases());
System.out.printf("expected ref = %s%n", formatBasesBasedOnCigar(expectedRef,read.getCigar(),CigarOperator.INSERTION));
for(Alignment[] alignmentsOfQuality: alignments) {
for(Alignment alignment: alignmentsOfQuality) {
System.out.println();
Cigar cigar = ((BWAAlignment)alignment).getCigar();
System.out.printf("read = %s%n", formatBasesBasedOnCigar(read.getReadString(),cigar,CigarOperator.DELETION));
int deletionCount = ((BWAAlignment)alignment).getNumberOfBasesMatchingState(AlignmentState.DELETION);
String alignedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignment.getAlignmentStart(),alignment.getAlignmentStart()+read.getReadLength()+deletionCount-1).getBases());
System.out.printf("actual ref = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(alignedRef,cigar,CigarOperator.INSERTION),
alignment.getAlignmentStart(),
alignment.isNegativeStrand());
}
}
//throw new StingException(String.format("Read %s was placed at incorrect location; count = %d%n",read.getReadName(),count));
}
if( count % 1000 == 0 )
System.out.printf("%d reads examined.%n",count);
}
System.out.printf("%d reads examined; %d mismatches; %d failures.%n",count,mismatches,failures);
}
private static String formatBasesBasedOnCigar( String bases, Cigar cigar, CigarOperator toBlank ) {
StringBuilder formatted = new StringBuilder();
int readIndex = 0;
for(CigarElement cigarElement: cigar.getCigarElements()) {
if(cigarElement.getOperator() == toBlank) {
int number = cigarElement.getLength();
while( number-- > 0 ) formatted.append(' ');
}
else {
int number = cigarElement.getLength();
while( number-- > 0 ) formatted.append(bases.charAt(readIndex++));
}
}
return formatted.toString();
}
private static int numDeletionsInCigar( Cigar cigar ) {
int numDeletions = 0;
for(CigarElement cigarElement: cigar.getCigarElements()) {
if(cigarElement.getOperator() == CigarOperator.DELETION)
numDeletions += cigarElement.getLength();
}
return numDeletions;
}
}

View File

@ -0,0 +1,150 @@
package org.broadinstitute.sting.alignment.bwa.java;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
/**
* Represents a sequence of matches.
*
* @author mhanna
* @version 0.1
*/
public class AlignmentMatchSequence implements Cloneable {
/**
* Stores the particular match entries in the order they occur.
*/
private Deque<AlignmentMatchSequenceEntry> entries = new ArrayDeque<AlignmentMatchSequenceEntry>();
/**
* Clone the given match sequence.
* @return A deep copy of the current match sequence.
*/
public AlignmentMatchSequence clone() {
AlignmentMatchSequence copy = null;
try {
copy = (AlignmentMatchSequence)super.clone();
}
catch( CloneNotSupportedException ex ) {
throw new ReviewedStingException("Unable to clone AlignmentMatchSequence.");
}
copy.entries = new ArrayDeque<AlignmentMatchSequenceEntry>();
for( AlignmentMatchSequenceEntry entry: entries )
copy.entries.add(entry.clone());
return copy;
}
public Cigar convertToCigar(boolean negativeStrand) {
Cigar cigar = new Cigar();
Iterator<AlignmentMatchSequenceEntry> iterator = negativeStrand ? entries.descendingIterator() : entries.iterator();
while( iterator.hasNext() ) {
AlignmentMatchSequenceEntry entry = iterator.next();
CigarOperator operator;
switch( entry.getAlignmentState() ) {
case MATCH_MISMATCH: operator = CigarOperator.MATCH_OR_MISMATCH; break;
case INSERTION: operator = CigarOperator.INSERTION; break;
case DELETION: operator = CigarOperator.DELETION; break;
default: throw new ReviewedStingException("convertToCigar: cannot process state: " + entry.getAlignmentState());
}
cigar.add( new CigarElement(entry.count,operator) );
}
return cigar;
}
/**
* All a new alignment of the given state.
* @param state State to add to the sequence.
*/
public void addNext( AlignmentState state ) {
AlignmentMatchSequenceEntry last = entries.peekLast();
// If the last entry is the same as this one, increment it. Otherwise, add a new entry.
if( last != null && last.alignmentState == state )
last.increment();
else
entries.add(new AlignmentMatchSequenceEntry(state));
}
/**
* Gets the current state of this alignment (what's the state of the last base?)
* @return State of the most recently aligned base.
*/
public AlignmentState getCurrentState() {
if( entries.size() == 0 )
return AlignmentState.MATCH_MISMATCH;
return entries.peekLast().getAlignmentState();
}
/**
* How many bases in the read match the given state.
* @param state State to test.
* @return number of bases which match that state.
*/
public int getNumberOfBasesMatchingState(AlignmentState state) {
int matches = 0;
for( AlignmentMatchSequenceEntry entry: entries ) {
if( entry.getAlignmentState() == state )
matches += entry.count;
}
return matches;
}
/**
* Stores an individual match sequence entry.
*/
private class AlignmentMatchSequenceEntry implements Cloneable {
/**
* The state of the alignment throughout a given point in the sequence.
*/
private final AlignmentState alignmentState;
/**
* The number of bases having this particular state.
*/
private int count;
/**
* Create a new sequence entry with the given state.
* @param alignmentState The state that this sequence should contain.
*/
AlignmentMatchSequenceEntry( AlignmentState alignmentState ) {
this.alignmentState = alignmentState;
this.count = 1;
}
/**
* Clone the given match sequence entry.
* @return A deep copy of the current match sequence entry.
*/
public AlignmentMatchSequenceEntry clone() {
try {
return (AlignmentMatchSequenceEntry)super.clone();
}
catch( CloneNotSupportedException ex ) {
throw new ReviewedStingException("Unable to clone AlignmentMatchSequenceEntry.");
}
}
/**
* Retrieves the current state of the alignment.
* @return The state of the current sequence.
*/
AlignmentState getAlignmentState() {
return alignmentState;
}
/**
* Increment the count of alignments having this particular state.
*/
void increment() {
count++;
}
}
}

View File

@ -0,0 +1,13 @@
package org.broadinstitute.sting.alignment.bwa.java;
/**
* The state of a given base in the alignment.
*
* @author mhanna
* @version 0.1
*/
public enum AlignmentState {
MATCH_MISMATCH,
INSERTION,
DELETION
}

View File

@ -0,0 +1,190 @@
package org.broadinstitute.sting.alignment.bwa.java;
import net.sf.samtools.Cigar;
import org.broadinstitute.sting.alignment.Alignment;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* An alignment object to be used incrementally as the BWA aligner
* inspects the read.
*
* @author mhanna
* @version 0.1
*/
public class BWAAlignment extends Alignment implements Cloneable {
/**
* Track the number of alignments that have been created.
*/
private static long numCreated;
/**
* Which number alignment is this?
*/
private long creationNumber;
/**
* The aligner performing the alignments.
*/
protected BWAJavaAligner aligner;
/**
* The sequence of matches/mismatches/insertions/deletions.
*/
private AlignmentMatchSequence alignmentMatchSequence = new AlignmentMatchSequence();
/**
* Working variable. How many bases have been matched at this point.
*/
protected int position;
/**
* Working variable. How many mismatches have been encountered at this point.
*/
private int mismatches;
/**
* Number of gap opens in alignment.
*/
private int gapOpens;
/**
* Number of gap extensions in alignment.
*/
private int gapExtensions;
/**
* Working variable. The lower bound of the alignment within the BWT.
*/
protected long loBound;
/**
* Working variable. The upper bound of the alignment within the BWT.
*/
protected long hiBound;
protected void setAlignmentStart(long position) {
this.alignmentStart = position;
}
protected void setNegativeStrand(boolean negativeStrand) {
this.negativeStrand = negativeStrand;
}
/**
* Cache the score.
*/
private int score;
public Cigar getCigar() {
return alignmentMatchSequence.convertToCigar(isNegativeStrand());
}
/**
* Gets the current state of this alignment (state of the last base viewed)..
* @return Current state of the alignment.
*/
public AlignmentState getCurrentState() {
return alignmentMatchSequence.getCurrentState();
}
/**
* Adds the given state to the current alignment.
* @param state State to add to the given alignment.
*/
public void addState( AlignmentState state ) {
alignmentMatchSequence.addNext(state);
}
/**
* Gets the BWA score of this alignment.
* @return BWA-style scores. 0 is best.
*/
public int getScore() {
return score;
}
public int getMismatches() { return mismatches; }
public int getGapOpens() { return gapOpens; }
public int getGapExtensions() { return gapExtensions; }
public void incrementMismatches() {
this.mismatches++;
updateScore();
}
public void incrementGapOpens() {
this.gapOpens++;
updateScore();
}
public void incrementGapExtensions() {
this.gapExtensions++;
updateScore();
}
/**
* Updates the score based on new information about matches / mismatches.
*/
private void updateScore() {
score = mismatches*aligner.MISMATCH_PENALTY + gapOpens*aligner.GAP_OPEN_PENALTY + gapExtensions*aligner.GAP_EXTENSION_PENALTY;
}
/**
* Create a new alignment with the given parent aligner.
* @param aligner Aligner being used.
*/
public BWAAlignment( BWAJavaAligner aligner ) {
this.aligner = aligner;
this.creationNumber = numCreated++;
}
/**
* Clone the alignment.
* @return New instance of the alignment.
*/
public BWAAlignment clone() {
BWAAlignment newAlignment = null;
try {
newAlignment = (BWAAlignment)super.clone();
}
catch( CloneNotSupportedException ex ) {
throw new ReviewedStingException("Unable to clone BWAAlignment.");
}
newAlignment.creationNumber = numCreated++;
newAlignment.alignmentMatchSequence = alignmentMatchSequence.clone();
return newAlignment;
}
/**
* How many bases in the read match the given state.
* @param state State to test.
* @return number of bases which match that state.
*/
public int getNumberOfBasesMatchingState(AlignmentState state) {
return alignmentMatchSequence.getNumberOfBasesMatchingState(state);
}
/**
* Compare this alignment to another alignment.
* @param rhs Other alignment to which to compare.
* @return < 0 if this < other, == 0 if this == other, > 0 if this > other
*/
public int compareTo(Alignment rhs) {
BWAAlignment other = (BWAAlignment)rhs;
// If the scores are different, disambiguate using the score.
if(score != other.score)
return score > other.score ? 1 : -1;
// Otherwise, use the order in which the elements were created.
if(creationNumber != other.creationNumber)
return creationNumber > other.creationNumber ? -1 : 1;
return 0;
}
public String toString() {
return String.format("position: %d, strand: %b, state: %s, mismatches: %d, gap opens: %d, gap extensions: %d, loBound: %d, hiBound: %d, score: %d, creationNumber: %d", position, negativeStrand, alignmentMatchSequence.getCurrentState(), mismatches, gapOpens, gapExtensions, loBound, hiBound, getScore(), creationNumber);
}
}

View File

@ -0,0 +1,393 @@
package org.broadinstitute.sting.alignment.bwa.java;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.alignment.Alignment;
import org.broadinstitute.sting.alignment.bwa.BWAAligner;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.reference.bwt.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;
/**
* Create imperfect alignments from the read to the genome represented by the given BWT / suffix array.
*
* @author mhanna
* @version 0.1
*/
public class BWAJavaAligner extends BWAAligner {
/**
* BWT in the forward direction.
*/
private BWT forwardBWT;
/**
* BWT in the reverse direction.
*/
private BWT reverseBWT;
/**
* Suffix array in the forward direction.
*/
private SuffixArray forwardSuffixArray;
/**
* Suffix array in the reverse direction.
*/
private SuffixArray reverseSuffixArray;
/**
* Maximum edit distance (-n option from original BWA).
*/
private final int MAXIMUM_EDIT_DISTANCE = 4;
/**
* Maximum number of gap opens (-o option from original BWA).
*/
private final int MAXIMUM_GAP_OPENS = 1;
/**
* Maximum number of gap extensions (-e option from original BWA).
*/
private final int MAXIMUM_GAP_EXTENSIONS = 6;
/**
* Penalty for straight mismatches (-M option from original BWA).
*/
public final int MISMATCH_PENALTY = 3;
/**
* Penalty for gap opens (-O option from original BWA).
*/
public final int GAP_OPEN_PENALTY = 11;
/**
* Penalty for gap extensions (-E option from original BWA).
*/
public final int GAP_EXTENSION_PENALTY = 4;
/**
* Skip the ends of indels.
*/
public final int INDEL_END_SKIP = 5;
public BWAJavaAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) {
super(null,null);
forwardBWT = new BWTReader(forwardBWTFile).read();
reverseBWT = new BWTReader(reverseBWTFile).read();
forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read();
reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read();
}
/**
* Close this instance of the BWA pointer and delete its resources.
*/
@Override
public void close() {
throw new UnsupportedOperationException("BWA aligner can't currently be closed.");
}
/**
* Update the current parameters of this aligner.
* @param configuration New configuration to set.
*/
public void updateConfiguration(BWAConfiguration configuration) {
throw new UnsupportedOperationException("Configuration of the BWA aligner can't currently be changed.");
}
/**
* Allow the aligner to choose one alignment randomly from the pile of best alignments.
* @param bases Bases to align.
* @return An align
*/
public Alignment getBestAlignment(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
/**
* Align the read to the reference.
* @param read Read to align.
* @param header Optional header to drop in place.
* @return A list of the alignments.
*/
public SAMRecord align(final SAMRecord read, final SAMFileHeader header) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
/**
* Get a iterator of alignments, batched by mapping quality.
* @param bases List of bases.
* @return Iterator to alignments.
*/
public Iterable<Alignment[]> getAllAlignments(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
/**
* Get a iterator of aligned reads, batched by mapping quality.
* @param read Read to align.
* @param newHeader Optional new header to use when aligning the read. If present, it must be null.
* @return Iterator to alignments.
*/
public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
public List<Alignment> align( SAMRecord read ) {
List<Alignment> successfulMatches = new ArrayList<Alignment>();
Byte[] uncomplementedBases = normalizeBases(read.getReadBases());
Byte[] complementedBases = normalizeBases(Utils.reverse(BaseUtils.simpleReverseComplement(read.getReadBases())));
List<LowerBound> forwardLowerBounds = LowerBound.create(uncomplementedBases,forwardBWT);
List<LowerBound> reverseLowerBounds = LowerBound.create(complementedBases,reverseBWT);
// Seed the best score with any score that won't overflow on comparison.
int bestScore = Integer.MAX_VALUE - MISMATCH_PENALTY;
int bestDiff = MAXIMUM_EDIT_DISTANCE+1;
int maxDiff = MAXIMUM_EDIT_DISTANCE;
PriorityQueue<BWAAlignment> alignments = new PriorityQueue<BWAAlignment>();
// Create a fictional initial alignment, with the position just off the end of the read, and the limits
// set as the entire BWT.
alignments.add(createSeedAlignment(reverseBWT));
alignments.add(createSeedAlignment(forwardBWT));
while(!alignments.isEmpty()) {
BWAAlignment alignment = alignments.remove();
// From bwtgap.c in the original BWT; if the rank is worse than the best score + the mismatch PENALTY, move on.
if( alignment.getScore() > bestScore + MISMATCH_PENALTY )
break;
Byte[] bases = alignment.isNegativeStrand() ? complementedBases : uncomplementedBases;
BWT bwt = alignment.isNegativeStrand() ? forwardBWT : reverseBWT;
List<LowerBound> lowerBounds = alignment.isNegativeStrand() ? reverseLowerBounds : forwardLowerBounds;
// if z < D(i) then return {}
int mismatches = maxDiff - alignment.getMismatches() - alignment.getGapOpens() - alignment.getGapExtensions();
if( alignment.position < lowerBounds.size()-1 && mismatches < lowerBounds.get(alignment.position+1).value )
continue;
if(mismatches == 0) {
exactMatch(alignment,bases,bwt);
if(alignment.loBound > alignment.hiBound)
continue;
}
// Found a valid alignment; store it and move on.
if(alignment.position >= read.getReadLength()-1) {
for(long bwtIndex = alignment.loBound; bwtIndex <= alignment.hiBound; bwtIndex++) {
BWAAlignment finalAlignment = alignment.clone();
if( finalAlignment.isNegativeStrand() )
finalAlignment.setAlignmentStart(forwardSuffixArray.get(bwtIndex) + 1);
else {
int sizeAlongReference = read.getReadLength() -
finalAlignment.getNumberOfBasesMatchingState(AlignmentState.INSERTION) +
finalAlignment.getNumberOfBasesMatchingState(AlignmentState.DELETION);
finalAlignment.setAlignmentStart(reverseBWT.length() - reverseSuffixArray.get(bwtIndex) - sizeAlongReference + 1);
}
successfulMatches.add(finalAlignment);
bestScore = Math.min(finalAlignment.getScore(),bestScore);
bestDiff = Math.min(finalAlignment.getMismatches()+finalAlignment.getGapOpens()+finalAlignment.getGapExtensions(),bestDiff);
maxDiff = bestDiff + 1;
}
continue;
}
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : "");
/*
System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(),
alignment.negativeStrand?1:0,
bases.length-alignment.position-1,
alignment.getCurrentState().toString().charAt(0),
alignment.getMismatches(),
alignment.getGapOpens(),
alignment.getGapExtensions(),
lowerBounds.get(alignment.position+1).value,
lowerBounds.get(alignment.position+1).width,
alignment.loBound,
alignment.hiBound);
*/
// Temporary -- look ahead to see if the next alignment is bounded.
boolean allowDifferences = mismatches > 0;
boolean allowMismatches = mismatches > 0;
if( allowDifferences &&
alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() &&
read.getReadLength()-1-(alignment.position+1) >= INDEL_END_SKIP+alignment.getGapOpens()+alignment.getGapExtensions() ) {
if( alignment.getCurrentState() == AlignmentState.MATCH_MISMATCH ) {
if( alignment.getGapOpens() < MAXIMUM_GAP_OPENS ) {
// Add a potential insertion extension.
BWAAlignment insertionAlignment = createInsertionAlignment(alignment);
insertionAlignment.incrementGapOpens();
alignments.add(insertionAlignment);
// Add a potential deletion by marking a deletion and augmenting the position.
List<BWAAlignment> deletionAlignments = createDeletionAlignments(bwt,alignment);
for( BWAAlignment deletionAlignment: deletionAlignments )
deletionAlignment.incrementGapOpens();
alignments.addAll(deletionAlignments);
}
}
else if( alignment.getCurrentState() == AlignmentState.INSERTION ) {
if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) {
// Add a potential insertion extension.
BWAAlignment insertionAlignment = createInsertionAlignment(alignment);
insertionAlignment.incrementGapExtensions();
alignments.add(insertionAlignment);
}
}
else if( alignment.getCurrentState() == AlignmentState.DELETION ) {
if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) {
// Add a potential deletion by marking a deletion and augmenting the position.
List<BWAAlignment> deletionAlignments = createDeletionAlignments(bwt,alignment);
for( BWAAlignment deletionAlignment: deletionAlignments )
deletionAlignment.incrementGapExtensions();
alignments.addAll(deletionAlignments);
}
}
}
// Mismatches
alignments.addAll(createMatchedAlignments(bwt,alignment,bases,allowDifferences&&allowMismatches));
}
return successfulMatches;
}
/**
* Create an seeding alignment to use as a starting point when traversing.
* @param bwt source BWT.
* @return Seed alignment.
*/
private BWAAlignment createSeedAlignment(BWT bwt) {
BWAAlignment seed = new BWAAlignment(this);
seed.setNegativeStrand(bwt == forwardBWT);
seed.position = -1;
seed.loBound = 0;
seed.hiBound = bwt.length();
return seed;
}
/**
* Creates a new alignments representing direct matches / mismatches.
* @param bwt Source BWT with which to work.
* @param alignment Alignment for the previous position.
* @param bases The bases in the read.
* @param allowMismatch Should mismatching bases be allowed?
* @return New alignment representing this position if valid; null otherwise.
*/
private List<BWAAlignment> createMatchedAlignments( BWT bwt, BWAAlignment alignment, Byte[] bases, boolean allowMismatch ) {
List<BWAAlignment> newAlignments = new ArrayList<BWAAlignment>();
List<Byte> baseChoices = new ArrayList<Byte>();
Byte thisBase = bases[alignment.position+1];
if( allowMismatch )
baseChoices.addAll(Bases.allOf());
else
baseChoices.add(thisBase);
if( thisBase != null ) {
// Keep rotating the current base to the last position until we've hit the current base.
for( ;; ) {
baseChoices.add(baseChoices.remove(0));
if( thisBase.equals(baseChoices.get(baseChoices.size()-1)) )
break;
}
}
for(byte base: baseChoices) {
BWAAlignment newAlignment = alignment.clone();
newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
// If this alignment is valid, skip it.
if( newAlignment.loBound > newAlignment.hiBound )
continue;
newAlignment.position++;
newAlignment.addState(AlignmentState.MATCH_MISMATCH);
if( bases[newAlignment.position] == null || base != bases[newAlignment.position] )
newAlignment.incrementMismatches();
newAlignments.add(newAlignment);
}
return newAlignments;
}
/**
* Create a new alignment representing an insertion at this point in the read.
* @param alignment Alignment from which to derive the insertion.
* @return New alignment reflecting the insertion.
*/
private BWAAlignment createInsertionAlignment( BWAAlignment alignment ) {
// Add a potential insertion extension.
BWAAlignment newAlignment = alignment.clone();
newAlignment.position++;
newAlignment.addState(AlignmentState.INSERTION);
return newAlignment;
}
/**
* Create new alignments representing a deletion at this point in the read.
* @param bwt source BWT for inferring deletion info.
* @param alignment Alignment from which to derive the deletion.
* @return New alignments reflecting all possible deletions.
*/
private List<BWAAlignment> createDeletionAlignments( BWT bwt, BWAAlignment alignment) {
List<BWAAlignment> newAlignments = new ArrayList<BWAAlignment>();
for(byte base: Bases.instance) {
BWAAlignment newAlignment = alignment.clone();
newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
// If this alignment is valid, skip it.
if( newAlignment.loBound > newAlignment.hiBound )
continue;
newAlignment.addState(AlignmentState.DELETION);
newAlignments.add(newAlignment);
}
return newAlignments;
}
/**
* Exactly match the given alignment against the given BWT.
* @param alignment Alignment to match.
* @param bases Bases to use.
* @param bwt BWT to use.
*/
private void exactMatch( BWAAlignment alignment, Byte[] bases, BWT bwt ) {
while( ++alignment.position < bases.length ) {
byte base = bases[alignment.position];
alignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
alignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
if( alignment.loBound > alignment.hiBound )
return;
}
}
/**
* Make each base into A/C/G/T or null if unknown.
* @param bases Base string to normalize.
* @return Array of normalized bases.
*/
private Byte[] normalizeBases( byte[] bases ) {
Byte[] normalBases = new Byte[bases.length];
for(int i = 0; i < bases.length; i++)
normalBases[i] = Bases.fromASCII(bases[i]);
return normalBases;
}
}

View File

@ -0,0 +1,88 @@
package org.broadinstitute.sting.alignment.bwa.java;
import org.broadinstitute.sting.alignment.reference.bwt.BWT;
import java.util.ArrayList;
import java.util.List;
/**
* At any point along the given read, what is a good lower bound for the
* total number of differences?
*
* @author mhanna
* @version 0.1
*/
public class LowerBound {
/**
* Lower bound of the suffix array.
*/
public final long loIndex;
/**
* Upper bound of the suffix array.
*/
public final long hiIndex;
/**
* Width of the bwt from loIndex -> hiIndex, inclusive.
*/
public final long width;
/**
* The lower bound at the given point.
*/
public final int value;
/**
* Create a new lower bound with the given value.
* @param loIndex The lower bound of the BWT.
* @param hiIndex The upper bound of the BWT.
* @param value Value for the lower bound at this site.
*/
private LowerBound(long loIndex, long hiIndex, int value) {
this.loIndex = loIndex;
this.hiIndex = hiIndex;
this.width = hiIndex - loIndex + 1;
this.value = value;
}
/**
* Create a non-optimal bound according to the algorithm specified in Figure 3 of the BWA paper.
* @param bases Bases of the read to use when creating a new BWT.
* @param bwt BWT to check against.
* @return A list of lower bounds at every point in the reference.
*
*/
public static List<LowerBound> create(Byte[] bases, BWT bwt) {
List<LowerBound> bounds = new ArrayList<LowerBound>();
long loIndex = 0, hiIndex = bwt.length();
int mismatches = 0;
for( int i = bases.length-1; i >= 0; i-- ) {
Byte base = bases[i];
// Ignore non-ACGT bases.
if( base != null ) {
loIndex = bwt.counts(base) + bwt.occurrences(base,loIndex-1) + 1;
hiIndex = bwt.counts(base) + bwt.occurrences(base,hiIndex);
}
if( base == null || loIndex > hiIndex ) {
loIndex = 0;
hiIndex = bwt.length();
mismatches++;
}
bounds.add(0,new LowerBound(loIndex,hiIndex,mismatches));
}
return bounds;
}
/**
* Create a string representation of this bound.
* @return String version of this bound.
*/
public String toString() {
return String.format("LowerBound: w = %d, value = %d",width,value);
}
}

View File

@ -0,0 +1,4 @@
/**
* Analyses used to validate the correctness and performance the BWA Java bindings.
*/
package org.broadinstitute.sting.alignment;

View File

@ -0,0 +1,68 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
/**
* Writes .amb files - a file indicating where 'holes' (indeterminant bases)
* exist in the contig. Currently, only empty, placeholder AMBs are supported.
*
* @author mhanna
* @version 0.1
*/
public class AMBWriter {
/**
* Number of holes is fixed at zero.
*/
private static final int NUM_HOLES = 0;
/**
* Input stream from which to read BWT data.
*/
private final PrintStream out;
/**
* Create a new ANNWriter targeting the given file.
* @param file file into which ANN data should be written.
* @throws java.io.IOException if there is a problem opening the output file.
*/
public AMBWriter(File file) throws IOException {
out = new PrintStream(file);
}
/**
* Create a new ANNWriter targeting the given OutputStream.
* @param stream Stream into which ANN data should be written.
*/
public AMBWriter(OutputStream stream) {
out = new PrintStream(stream);
}
/**
* Write the contents of the given dictionary into the AMB file.
* Assumes that there are no holes in the dictionary.
* @param dictionary Dictionary to write.
*/
public void writeEmpty(SAMSequenceDictionary dictionary) {
long genomeLength = 0L;
for(SAMSequenceRecord sequence: dictionary.getSequences())
genomeLength += sequence.getSequenceLength();
int sequences = dictionary.getSequences().size();
// Write the header
out.printf("%d %d %d%n",genomeLength,sequences,NUM_HOLES);
}
/**
* Close the given output stream.
*/
public void close() {
out.close();
}
}

View File

@ -0,0 +1,95 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
/**
* Writes .ann files - an alternate sequence dictionary format
* used by BWA/C. For best results, the input sequence dictionary
* should be created with Picard's CreateSequenceDictionary.jar,
* TRUNCATE_NAMES_AT_WHITESPACE=false.
*
* @author mhanna
* @version 0.1
*/
public class ANNWriter {
/**
* BWA uses a fixed seed of 11, written into every file.
*/
private static final int BNS_SEED = 11;
/**
* A seemingly unused value that appears in every contig in the ANN.
*/
private static final int GI = 0;
/**
* Input stream from which to read BWT data.
*/
private final PrintStream out;
/**
* Create a new ANNWriter targeting the given file.
* @param file file into which ANN data should be written.
* @throws IOException if there is a problem opening the output file.
*/
public ANNWriter(File file) throws IOException {
out = new PrintStream(file);
}
/**
* Create a new ANNWriter targeting the given OutputStream.
* @param stream Stream into which ANN data should be written.
*/
public ANNWriter(OutputStream stream) {
out = new PrintStream(stream);
}
/**
* Write the contents of the given dictionary into the ANN file.
* Assumes that no ambs (blocks of indeterminate base) are present in the dictionary.
* @param dictionary Dictionary to write.
*/
public void write(SAMSequenceDictionary dictionary) {
long genomeLength = 0L;
for(SAMSequenceRecord sequence: dictionary.getSequences())
genomeLength += sequence.getSequenceLength();
int sequences = dictionary.getSequences().size();
// Write the header
out.printf("%d %d %d%n",genomeLength,sequences,BNS_SEED);
for(SAMSequenceRecord sequence: dictionary.getSequences()) {
String fullSequenceName = sequence.getSequenceName();
String trimmedSequenceName = fullSequenceName;
String sequenceComment = "(null)";
long offset = 0;
// Separate the sequence name from the sequence comment, based on BWA's definition.
// BWA's definition appears to accept a zero-length contig name, so mimic that behavior.
if(fullSequenceName.indexOf(' ') >= 0) {
trimmedSequenceName = fullSequenceName.substring(0,fullSequenceName.indexOf(' '));
sequenceComment = fullSequenceName.substring(fullSequenceName.indexOf(' ')+1);
}
// Write the sequence GI (?), name, and comment.
out.printf("%d %s %s%n",GI,trimmedSequenceName,sequenceComment);
// Write the sequence offset, length, and ambs (currently fixed at 0).
out.printf("%d %d %d%n",offset,sequence.getSequenceLength(),0);
}
}
/**
* Close the given output stream.
*/
public void close() {
out.close();
}
}

View File

@ -0,0 +1,172 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* Represents the Burrows-Wheeler Transform of a reference sequence.
*
* @author mhanna
* @version 0.1
*/
public class BWT {
/**
* Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
* For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
*/
public static final int SEQUENCE_BLOCK_SIZE = 128;
/**
* The inverse SA, used as a placeholder for determining where the special EOL character sits.
*/
protected final long inverseSA0;
/**
* Cumulative counts for the entire BWT.
*/
protected final Counts counts;
/**
* The individual sequence blocks, modelling how they appear on disk.
*/
protected final SequenceBlock[] sequenceBlocks;
/**
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
* @param counts Cumulative count of bases, in A,C,G,T order.
* @param sequenceBlocks The full BWT sequence, sans the '$'.
*/
public BWT( long inverseSA0, Counts counts, SequenceBlock[] sequenceBlocks ) {
this.inverseSA0 = inverseSA0;
this.counts = counts;
this.sequenceBlocks = sequenceBlocks;
}
/**
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
* @param counts Count of bases, in A,C,G,T order.
* @param sequence The full BWT sequence, sans the '$'.
*/
public BWT( long inverseSA0, Counts counts, byte[] sequence ) {
this(inverseSA0,counts,generateSequenceBlocks(sequence));
}
/**
* Extract the full sequence from the list of block.
* @return The full BWT string as a byte array.
*/
public byte[] getSequence() {
byte[] sequence = new byte[(int)counts.getTotal()];
for( SequenceBlock block: sequenceBlocks )
System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength);
return sequence;
}
/**
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
* @param base The base.
* @return Total counts for all bases lexicographically smaller than this base.
*/
public long counts(byte base) {
return counts.getCumulative(base);
}
/**
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
* @param base The base.
* @param index The position to search within the BWT.
* @return Total counts for all bases lexicographically smaller than this base.
*/
public long occurrences(byte base,long index) {
SequenceBlock block = getSequenceBlock(index);
int position = getSequencePosition(index);
long accumulator = block.occurrences.get(base);
for(int i = 0; i <= position; i++) {
if(base == block.sequence[i])
accumulator++;
}
return accumulator;
}
/**
* The number of bases in the BWT as a whole.
* @return Number of bases.
*/
public long length() {
return counts.getTotal();
}
/**
* Create a new BWT from the given reference sequence.
* @param referenceSequence Sequence from which to derive the BWT.
* @return reference sequence-derived BWT.
*/
public static BWT createFromReferenceSequence(byte[] referenceSequence) {
SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
byte[] bwt = new byte[(int)suffixArray.length()-1];
int bwtIndex = 0;
for(long suffixArrayIndex = 0; suffixArrayIndex < suffixArray.length(); suffixArrayIndex++) {
if(suffixArray.get(suffixArrayIndex) == 0)
continue;
bwt[bwtIndex++] = referenceSequence[(int)suffixArray.get(suffixArrayIndex)-1];
}
return new BWT(suffixArray.inverseSA0,suffixArray.occurrences,bwt);
}
/**
* Gets the base at a given position in the BWT.
* @param index The index to use.
* @return The base at that location.
*/
protected byte getBase(long index) {
if(index == inverseSA0)
throw new ReviewedStingException(String.format("Base at index %d does not have a text representation",index));
SequenceBlock block = getSequenceBlock(index);
int position = getSequencePosition(index);
return block.sequence[position];
}
private SequenceBlock getSequenceBlock(long index) {
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
if(index > inverseSA0) index--;
return sequenceBlocks[(int)(index/SEQUENCE_BLOCK_SIZE)];
}
private int getSequencePosition(long index) {
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
if(index > inverseSA0) index--;
return (int)(index%SEQUENCE_BLOCK_SIZE);
}
/**
* Create a set of sequence blocks from one long sequence.
* @param sequence Sequence from which to derive blocks.
* @return Array of sequence blocks containing data from the sequence.
*/
private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) {
Counts occurrences = new Counts();
int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE);
SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks];
for( int block = 0; block < numSequenceBlocks; block++ ) {
int blockStart = block*SEQUENCE_BLOCK_SIZE;
int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart);
byte[] subsequence = new byte[blockLength];
System.arraycopy(sequence,blockStart,subsequence,0,blockLength);
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence);
for( byte base: subsequence )
occurrences.increment(base);
}
return sequenceBlocks;
}
}

View File

@ -0,0 +1,89 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import org.broadinstitute.sting.alignment.reference.packing.BasePackedInputStream;
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteOrder;
/**
* Reads a BWT from a given file.
*
* @author mhanna
* @version 0.1
*/
public class BWTReader {
/**
* Input stream from which to read BWT data.
*/
private FileInputStream inputStream;
/**
* Create a new BWT reader.
* @param inputFile File in which the BWT is stored.
*/
public BWTReader( File inputFile ) {
try {
this.inputStream = new FileInputStream(inputFile);
}
catch( FileNotFoundException ex ) {
throw new ReviewedStingException("Unable to open input file", ex);
}
}
/**
* Read a BWT from the input stream.
* @return The BWT stored in the input stream.
*/
public BWT read() {
UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
BasePackedInputStream basePackedInputStream = new BasePackedInputStream<Integer>(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
long inverseSA0;
long[] count;
SequenceBlock[] sequenceBlocks;
try {
inverseSA0 = uintPackedInputStream.read();
count = new long[PackUtils.ALPHABET_SIZE];
uintPackedInputStream.read(count);
long bwtSize = count[PackUtils.ALPHABET_SIZE-1];
sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)];
for( int block = 0; block < sequenceBlocks.length; block++ ) {
int sequenceStart = block* BWT.SEQUENCE_BLOCK_SIZE;
int sequenceLength = (int)Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart);
long[] occurrences = new long[PackUtils.ALPHABET_SIZE];
byte[] bwt = new byte[sequenceLength];
uintPackedInputStream.read(occurrences);
basePackedInputStream.read(bwt);
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,new Counts(occurrences,false),bwt);
}
}
catch( IOException ex ) {
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
}
return new BWT(inverseSA0, new Counts(count,true), sequenceBlocks);
}
/**
* Close the input stream.
*/
public void close() {
try {
inputStream.close();
}
catch( IOException ex ) {
throw new ReviewedStingException("Unable to close input file", ex);
}
}
}

View File

@ -0,0 +1,60 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMSequenceDictionary;
import java.io.File;
import java.io.IOException;
/**
* Generate BWA supplementary files (.ann, .amb) from the command line.
*
* @author mhanna
* @version 0.1
*/
public class BWTSupplementaryFileGenerator {
enum SupplementaryFileType { ANN, AMB }
public static void main(String[] args) throws IOException {
if(args.length < 3)
usage("Incorrect number of arguments supplied");
File fastaFile = new File(args[0]);
File outputFile = new File(args[1]);
SupplementaryFileType outputType = null;
try {
outputType = Enum.valueOf(SupplementaryFileType.class,args[2]);
}
catch(IllegalArgumentException ex) {
usage("Invalid output type: " + args[2]);
}
ReferenceSequenceFile sequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fastaFile);
SAMSequenceDictionary dictionary = sequenceFile.getSequenceDictionary();
switch(outputType) {
case ANN:
ANNWriter annWriter = new ANNWriter(outputFile);
annWriter.write(dictionary);
annWriter.close();
break;
case AMB:
AMBWriter ambWriter = new AMBWriter(outputFile);
ambWriter.writeEmpty(dictionary);
ambWriter.close();
break;
default:
usage("Unsupported output type: " + outputType);
}
}
/**
* Print usage information and exit.
*/
private static void usage(String message) {
System.err.println(message);
System.err.println("Usage: BWTSupplementaryFileGenerator <fasta> <output file> <output type>");
System.exit(1);
}
}

View File

@ -0,0 +1,71 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import org.broadinstitute.sting.alignment.reference.packing.BasePackedOutputStream;
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.*;
import java.nio.ByteOrder;
/**
* Writes an in-memory BWT to an outputstream.
*
* @author mhanna
* @version 0.1
*/
public class BWTWriter {
/**
* Input stream from which to read BWT data.
*/
private final OutputStream outputStream;
/**
* Create a new BWT writer.
* @param outputFile File in which the BWT is stored.
*/
public BWTWriter( File outputFile ) {
try {
this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
}
catch( FileNotFoundException ex ) {
throw new ReviewedStingException("Unable to open output file", ex);
}
}
/**
* Write a BWT to the output stream.
* @param bwt Transform to be written to the output stream.
*/
public void write( BWT bwt ) {
UnsignedIntPackedOutputStream intPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream<Integer>(Integer.class, outputStream, ByteOrder.LITTLE_ENDIAN);
try {
intPackedOutputStream.write(bwt.inverseSA0);
intPackedOutputStream.write(bwt.counts.toArray(true));
for( SequenceBlock block: bwt.sequenceBlocks ) {
intPackedOutputStream.write(block.occurrences.toArray(false));
basePackedOutputStream.write(block.sequence);
}
// The last block is the last set of counts in the structure.
intPackedOutputStream.write(bwt.counts.toArray(false));
}
catch( IOException ex ) {
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
}
}
/**
* Close the input stream.
*/
public void close() {
try {
outputStream.close();
}
catch( IOException ex ) {
throw new ReviewedStingException("Unable to close input file", ex);
}
}
}

View File

@ -0,0 +1,108 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
* Enhanced enum representation of a base.
*
* @author mhanna
* @version 0.1
*/
public class Bases implements Iterable<Byte>
{
public static byte A = 'A';
public static byte C = 'C';
public static byte G = 'G';
public static byte T = 'T';
public static final Bases instance = new Bases();
private static final List<Byte> allBases;
/**
* Representation of the base broken down by packed value.
*/
private static final Map<Integer,Byte> basesByPack = new HashMap<Integer,Byte>();
static {
List<Byte> bases = new ArrayList<Byte>();
bases.add(A);
bases.add(C);
bases.add(G);
bases.add(T);
allBases = Collections.unmodifiableList(bases);
for(int i = 0; i < allBases.size(); i++)
basesByPack.put(i,allBases.get(i));
}
/**
* Create a new base with the given ascii representation and
* pack value.
*/
private Bases() {
}
/**
* Return all possible bases.
* @return Byte representation of all bases.
*/
public static Collection<Byte> allOf() {
return allBases;
}
/**
* Gets the number of known bases.
* @return The number of known bases.
*/
public static int size() {
return allBases.size();
}
/**
* Gets an iterator over the total number of known base types.
* @return Iterator over all known bases.
*/
public Iterator<Byte> iterator() {
return basesByPack.values().iterator();
}
/**
* Get the given base from the packed representation.
* @param pack Packed representation.
* @return base.
*/
public static byte fromPack( int pack ) { return basesByPack.get(pack); }
/**
* Convert the given base to its packed value.
* @param ascii ASCII representation of the base.
* @return Packed value.
*/
public static int toPack( byte ascii )
{
for( Map.Entry<Integer,Byte> entry: basesByPack.entrySet() ) {
if( entry.getValue().equals(ascii) )
return entry.getKey();
}
throw new ReviewedStingException(String.format("Base %c is an invalid base to pack", (char)ascii));
}
/**
* Convert the ASCII representation of a base to its 'normalized' representation.
* @param base The base itself.
* @return The byte, if present. Null if unknown.
*/
public static Byte fromASCII( byte base ) {
Byte found = null;
for( Byte normalized: allBases ) {
if( normalized.equals(base) ) {
found = normalized;
break;
}
}
return found;
}
}

View File

@ -0,0 +1,151 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.HashMap;
import java.util.Map;
/**
* Counts of how many bases of each type have been seen.
*
* @author mhanna
* @version 0.1
*/
public class Counts implements Cloneable {
/**
* Internal representation of counts, broken down by ASCII value.
*/
private Map<Byte,Long> counts = new HashMap<Byte,Long>();
/**
* Internal representation of cumulative counts, broken down by ASCII value.
*/
private Map<Byte,Long> cumulativeCounts = new HashMap<Byte,Long>();
/**
* Create an empty Counts object with values A=0,C=0,G=0,T=0.
*/
public Counts()
{
for(byte base: Bases.instance) {
counts.put(base,0L);
cumulativeCounts.put(base,0L);
}
}
/**
* Create a counts data structure with the given initial values.
* @param data Count data, broken down by base.
* @param cumulative Whether the counts are cumulative, (count_G=numA+numC+numG,for example).
*/
public Counts( long[] data, boolean cumulative ) {
if(cumulative) {
long priorCount = 0;
for(byte base: Bases.instance) {
long count = data[Bases.toPack(base)];
counts.put(base,count-priorCount);
cumulativeCounts.put(base,priorCount);
priorCount = count;
}
}
else {
long priorCount = 0;
for(byte base: Bases.instance) {
long count = data[Bases.toPack(base)];
counts.put(base,count);
cumulativeCounts.put(base,priorCount);
priorCount += count;
}
}
}
/**
* Convert to an array for persistence.
* @param cumulative Use a cumulative representation.
* @return Array of count values.
*/
public long[] toArray(boolean cumulative) {
long[] countArray = new long[counts.size()];
if(cumulative) {
int index = 0;
boolean first = true;
for(byte base: Bases.instance) {
if(first) {
first = false;
continue;
}
countArray[index++] = getCumulative(base);
}
countArray[countArray.length-1] = getTotal();
}
else {
int index = 0;
for(byte base: Bases.instance)
countArray[index++] = counts.get(base);
}
return countArray;
}
/**
* Create a unique copy of the current object.
* @return A duplicate of this object.
*/
public Counts clone() {
Counts other;
try {
other = (Counts)super.clone();
}
catch(CloneNotSupportedException ex) {
throw new ReviewedStingException("Unable to clone counts object", ex);
}
other.counts = new HashMap<Byte,Long>(counts);
other.cumulativeCounts = new HashMap<Byte,Long>(cumulativeCounts);
return other;
}
/**
* Increment the number of bases seen at the given location.
* @param base Base to increment.
*/
public void increment(byte base) {
counts.put(base,counts.get(base)+1);
boolean increment = false;
for(byte cumulative: Bases.instance) {
if(increment) cumulativeCounts.put(cumulative,cumulativeCounts.get(cumulative)+1);
increment |= (cumulative == base);
}
}
/**
* Gets a count of the number of bases seen at a given location.
* Note that counts in this case are not cumulative (counts for A,C,G,T
* are independent).
* @param base Base for which to query counts.
* @return Number of bases of this type seen.
*/
public long get(byte base) {
return counts.get(base);
}
/**
* Gets a count of the number of bases seen before this base.
* Note that counts in this case are cumulative.
* @param base Base for which to query counts.
* @return Number of bases of this type seen.
*/
public long getCumulative(byte base) {
return cumulativeCounts.get(base);
}
/**
* How many total bases are represented by this count structure?
* @return Total bases represented.
*/
public long getTotal() {
int accumulator = 0;
for(byte base: Bases.instance) {
accumulator += get(base);
}
return accumulator;
}
}

View File

@ -0,0 +1,200 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITHoc THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment.reference.bwt;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.IOException;
/**
* Create a suffix array data structure.
*
* @author mhanna
* @version 0.1
*/
public class CreateBWTFromReference {
private byte[] loadReference( File inputFile ) {
// Read in the first sequence in the input file
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
ReferenceSequence sequence = reference.nextSequence();
return sequence.getBases();
}
private byte[] loadReverseReference( File inputFile ) {
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
ReferenceSequence sequence = reference.nextSequence();
PackUtils.reverse(sequence.getBases());
return sequence.getBases();
}
private Counts countOccurrences( byte[] sequence ) {
Counts occurrences = new Counts();
for( byte base: sequence )
occurrences.increment(base);
return occurrences;
}
private long[] createSuffixArray( byte[] sequence ) {
return SuffixArray.createFromReferenceSequence(sequence).sequence;
}
private long[] invertSuffixArray( long[] suffixArray ) {
long[] inverseSuffixArray = new long[suffixArray.length];
for( int i = 0; i < suffixArray.length; i++ )
inverseSuffixArray[(int)suffixArray[i]] = i;
return inverseSuffixArray;
}
private long[] createCompressedSuffixArray( int[] suffixArray, int[] inverseSuffixArray ) {
long[] compressedSuffixArray = new long[suffixArray.length];
compressedSuffixArray[0] = inverseSuffixArray[0];
for( int i = 1; i < suffixArray.length; i++ )
compressedSuffixArray[i] = inverseSuffixArray[suffixArray[i]+1];
return compressedSuffixArray;
}
private long[] createInversedCompressedSuffixArray( int[] compressedSuffixArray ) {
long[] inverseCompressedSuffixArray = new long[compressedSuffixArray.length];
for( int i = 0; i < compressedSuffixArray.length; i++ )
inverseCompressedSuffixArray[compressedSuffixArray[i]] = i;
return inverseCompressedSuffixArray;
}
public static void main( String argv[] ) throws IOException {
if( argv.length != 5 ) {
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output rbwt> <output sa> <output rsa>");
return;
}
String inputFileName = argv[0];
File inputFile = new File(inputFileName);
String bwtFileName = argv[1];
File bwtFile = new File(bwtFileName);
String rbwtFileName = argv[2];
File rbwtFile = new File(rbwtFileName);
String saFileName = argv[3];
File saFile = new File(saFileName);
String rsaFileName = argv[4];
File rsaFile = new File(rsaFileName);
CreateBWTFromReference creator = new CreateBWTFromReference();
byte[] sequence = creator.loadReference(inputFile);
byte[] reverseSequence = creator.loadReverseReference(inputFile);
// Count the occurences of each given base.
Counts occurrences = creator.countOccurrences(sequence);
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Bases.A),
occurrences.getCumulative(Bases.C),
occurrences.getCumulative(Bases.G),
occurrences.getCumulative(Bases.T));
// Generate the suffix array and print diagnostics.
long[] suffixArrayData = creator.createSuffixArray(sequence);
long[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence);
// Invert the suffix array and print diagnostics.
long[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
long[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData);
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData );
/*
// Create the data structure for the compressed suffix array and print diagnostics.
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
int reconstructedInverseSA = compressedSuffixArray[0];
for( int i = 0; i < 8; i++ ) {
System.out.printf("compressedSuffixArray[%d] = %d (SA-1[%d] = %d)%n", i, compressedSuffixArray[i], i, reconstructedInverseSA);
reconstructedInverseSA = compressedSuffixArray[reconstructedInverseSA];
}
// Create the data structure for the inverse compressed suffix array and print diagnostics.
int[] inverseCompressedSuffixArray = creator.createInversedCompressedSuffixArray(compressedSuffixArray);
for( int i = 0; i < 8; i++ ) {
System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
}
*/
// Create the BWT.
BWT bwt = BWT.createFromReferenceSequence(sequence);
BWT reverseBWT = BWT.createFromReferenceSequence(reverseSequence);
byte[] bwtSequence = bwt.getSequence();
System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length());
BWTWriter bwtWriter = new BWTWriter(bwtFile);
bwtWriter.write(bwt);
bwtWriter.close();
BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile);
reverseBWTWriter.write(reverseBWT);
reverseBWTWriter.close();
/*
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
saWriter.write(suffixArray);
saWriter.close();
SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
reverseSAWriter.write(reverseSuffixArray);
reverseSAWriter.close();
*/
File existingBWTFile = new File(inputFileName+".bwt");
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
BWT existingBWT = existingBWTReader.read();
byte[] existingBWTSequence = existingBWT.getSequence();
System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,80),existingBWT.length());
for( int i = 0; i < bwt.length(); i++ ) {
if( bwtSequence[i] != existingBWTSequence[i] )
throw new ReviewedStingException("BWT mismatch at " + i);
}
File existingSAFile = new File(inputFileName+".sa");
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT);
SuffixArray existingSuffixArray = existingSuffixArrayReader.read();
for(int i = 0; i < suffixArray.length(); i++) {
if( i % 10000 == 0 )
System.out.printf("Validating suffix array entry %d%n", i);
if( suffixArray.get(i) != existingSuffixArray.get(i) )
throw new ReviewedStingException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i)));
}
}
}

View File

@ -0,0 +1,41 @@
package org.broadinstitute.sting.alignment.reference.bwt;
/**
* Models a block of bases within the BWT.
*/
public class SequenceBlock {
/**
* Start position of this sequence within the BWT.
*/
public final int sequenceStart;
/**
* Length of this sequence within the BWT.
*/
public final int sequenceLength;
/**
* Occurrences of each letter up to this sequence block.
*/
public final Counts occurrences;
/**
* Sequence for this segment.
*/
public final byte[] sequence;
/**
* Create a new block within this BWT.
* @param sequenceStart Starting position of this sequence within the BWT.
* @param sequenceLength Length of this sequence.
* @param occurrences How many of each base has been seen before this sequence began.
* @param sequence The actual sequence from the BWT.
*/
public SequenceBlock( int sequenceStart, int sequenceLength, Counts occurrences, byte[] sequence ) {
this.sequenceStart = sequenceStart;
this.sequenceLength = sequenceLength;
this.occurrences = occurrences;
this.sequence = sequence;
}
}

View File

@ -0,0 +1,158 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Comparator;
import java.util.TreeSet;
/**
* An in-memory representation of a suffix array.
*
* @author mhanna
* @version 0.1
*/
public class SuffixArray {
public final long inverseSA0;
public final Counts occurrences;
/**
* The elements of the sequence actually stored in memory.
*/
protected final long[] sequence;
/**
* How often are individual elements in the sequence actually stored
* in memory, as opposed to being calculated on the fly?
*/
protected final int sequenceInterval;
/**
* The BWT used to calculate missing portions of the sequence.
*/
protected final BWT bwt;
public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence) {
this(inverseSA0,occurrences,sequence,1,null);
}
/**
* Creates a new sequence array with the given inverse SA, occurrences, and values.
* @param inverseSA0 Inverse SA entry for the first element.
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
* @param sequence The full suffix array.
* @param sequenceInterval How frequently is the sequence interval stored.
* @param bwt bwt used to infer the remaining entries in the BWT.
*/
public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence, int sequenceInterval, BWT bwt) {
this.inverseSA0 = inverseSA0;
this.occurrences = occurrences;
this.sequence = sequence;
this.sequenceInterval = sequenceInterval;
this.bwt = bwt;
if(sequenceInterval != 1 && bwt == null)
throw new ReviewedStingException("A BWT must be provided if the sequence interval is not 1");
}
/**
* Retrieves the length of the sequence array.
* @return Length of the suffix array.
*/
public long length() {
if( bwt != null )
return bwt.length()+1;
else
return sequence.length;
}
/**
* Get the suffix array value at a given sequence.
* @param index Index at which to retrieve the suffix array vaule.
* @return The suffix array value at that entry.
*/
public long get(long index) {
int iterations = 0;
while(index%sequenceInterval != 0) {
// The inverseSA0 ('$') doesn't have a usable ASCII representation; it must be treated as a special case.
if(index == inverseSA0)
index = 0;
else {
byte base = bwt.getBase(index);
index = bwt.counts(base) + bwt.occurrences(base,index);
}
iterations++;
}
return (sequence[(int)(index/sequenceInterval)]+iterations) % length();
}
/**
* Create a suffix array from a given reference sequence.
* @param sequence The reference sequence to use when building the suffix array.
* @return a constructed suffix array.
*/
public static SuffixArray createFromReferenceSequence(byte[] sequence) {
// The builder for the suffix array. Use an integer in this case because
// Java arrays can only hold an integer.
TreeSet<Integer> suffixArrayBuilder = new TreeSet<Integer>(new SuffixArrayComparator(sequence));
Counts occurrences = new Counts();
for( byte base: sequence )
occurrences.increment(base);
// Build out the suffix array using a custom comparator.
for( int i = 0; i <= sequence.length; i++ )
suffixArrayBuilder.add(i);
// Copy the suffix array into an array.
long[] suffixArray = new long[suffixArrayBuilder.size()];
int i = 0;
for( Integer element: suffixArrayBuilder )
suffixArray[i++] = element;
// Find the first element in the inverse suffix array.
long inverseSA0 = -1;
for(i = 0; i < suffixArray.length; i++) {
if(suffixArray[i] == 0)
inverseSA0 = i;
}
if(inverseSA0 < 0)
throw new ReviewedStingException("Unable to find first inverse SA entry in generated suffix array.");
return new SuffixArray(inverseSA0,occurrences,suffixArray);
}
/**
* Compares two suffix arrays of the given sequence. Will return whichever string appears
* first in lexicographic order.
*/
private static class SuffixArrayComparator implements Comparator<Integer> {
/**
* The data source for all suffix arrays.
*/
private final String sequence;
/**
* Create a new comparator.
* @param sequence Reference sequence to use as basis for comparison.
*/
public SuffixArrayComparator( byte[] sequence ) {
// Processing the suffix array tends to be easier as a string.
this.sequence = StringUtil.bytesToString(sequence);
}
/**
* Compare the two given suffix arrays. Criteria for comparison is the lexicographic order of
* the two substrings sequence[lhs:], sequence[rhs:].
* @param lhs Left-hand side of comparison.
* @param rhs Right-hand side of comparison.
* @return How the suffix arrays represented by lhs, rhs compare.
*/
public int compare( Integer lhs, Integer rhs ) {
String lhsSuffixArray = sequence.substring(lhs);
String rhsSuffixArray = sequence.substring(rhs);
return lhsSuffixArray.compareTo(rhsSuffixArray);
}
}
}

View File

@ -0,0 +1,85 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteOrder;
/**
* A reader for suffix arrays in permanent storage.
*
* @author mhanna
* @version 0.1
*/
public class SuffixArrayReader {
/**
* Input stream from which to read suffix array data.
*/
private FileInputStream inputStream;
/**
* BWT to use to fill in missing data.
*/
private BWT bwt;
/**
* Create a new suffix array reader.
* @param inputFile File in which the suffix array is stored.
* @param bwt BWT to use when filling in missing data.
*/
public SuffixArrayReader(File inputFile, BWT bwt) {
try {
this.inputStream = new FileInputStream(inputFile);
this.bwt = bwt;
}
catch( FileNotFoundException ex ) {
throw new ReviewedStingException("Unable to open input file", ex);
}
}
/**
* Read a suffix array from the input stream.
* @return The suffix array stored in the input stream.
*/
public SuffixArray read() {
UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
long inverseSA0;
long[] occurrences;
long[] suffixArray;
int suffixArrayInterval;
try {
inverseSA0 = uintPackedInputStream.read();
occurrences = new long[PackUtils.ALPHABET_SIZE];
uintPackedInputStream.read(occurrences);
// Throw away the suffix array size in bytes and use the occurrences table directly.
suffixArrayInterval = (int)uintPackedInputStream.read();
suffixArray = new long[(int)((occurrences[occurrences.length-1]+suffixArrayInterval-1)/suffixArrayInterval)];
uintPackedInputStream.read(suffixArray);
}
catch( IOException ex ) {
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
}
return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray, suffixArrayInterval, bwt);
}
/**
* Close the input stream.
*/
public void close() {
try {
inputStream.close();
}
catch( IOException ex ) {
throw new ReviewedStingException("Unable to close input file", ex);
}
}
}

View File

@ -0,0 +1,67 @@
package org.broadinstitute.sting.alignment.reference.bwt;
import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.*;
import java.nio.ByteOrder;
/**
* Javadoc goes here.
*
* @author mhanna
* @version 0.1
*/
public class SuffixArrayWriter {
/**
* Input stream from which to read suffix array data.
*/
private OutputStream outputStream;
/**
* Create a new suffix array reader.
* @param outputFile File in which the suffix array is stored.
*/
public SuffixArrayWriter( File outputFile ) {
try {
this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
}
catch( FileNotFoundException ex ) {
throw new ReviewedStingException("Unable to open input file", ex);
}
}
/**
* Write a suffix array to the output stream.
* @param suffixArray suffix array to write.
*/
public void write(SuffixArray suffixArray) {
UnsignedIntPackedOutputStream uintPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
try {
uintPackedOutputStream.write(suffixArray.inverseSA0);
uintPackedOutputStream.write(suffixArray.occurrences.toArray(true));
// How frequently the suffix array entry is placed.
uintPackedOutputStream.write(1);
// Length of the suffix array.
uintPackedOutputStream.write(suffixArray.length()-1);
uintPackedOutputStream.write(suffixArray.sequence,1,suffixArray.sequence.length-1);
}
catch( IOException ex ) {
throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
}
}
/**
* Close the input stream.
*/
public void close() {
try {
outputStream.close();
}
catch( IOException ex ) {
throw new ReviewedStingException("Unable to close input file", ex);
}
}
}

View File

@ -0,0 +1,95 @@
package org.broadinstitute.sting.alignment.reference.packing;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
/**
* Reads a packed version of the input stream.
*
* @author mhanna
* @version 0.1
*/
public class BasePackedInputStream<T> {
/**
* Type of object to unpack.
*/
private final Class<T> type;
/**
* Ultimate source for packed bases.
*/
private final FileInputStream targetInputStream;
/**
* Channel source for packed bases.
*/
private final FileChannel targetInputChannel;
/**
* A fixed-size buffer for word-packed data.
*/
private final ByteOrder byteOrder;
/**
* How many bases are in a given packed word.
*/
private final int basesPerPackedWord = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE;
/**
* How many bytes in an integer?
*/
private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE;
public BasePackedInputStream( Class<T> type, File inputFile, ByteOrder byteOrder ) throws FileNotFoundException {
this(type,new FileInputStream(inputFile),byteOrder);
}
public BasePackedInputStream( Class<T> type, FileInputStream inputStream, ByteOrder byteOrder ) {
if( type != Integer.class )
throw new ReviewedStingException("Only bases packed into 32-bit words are currently supported by this input stream. Type specified: " + type.getName());
this.type = type;
this.targetInputStream = inputStream;
this.targetInputChannel = inputStream.getChannel();
this.byteOrder = byteOrder;
}
/**
* Read the entire contents of the input stream.
* @param bwt array into which bases should be read.
* @throws IOException if an I/O error occurs.
*/
public void read(byte[] bwt) throws IOException {
read(bwt,0,bwt.length);
}
/**
* Read the next <code>length</code> bases into the bwt array, starting at the given offset.
* @param bwt array holding the given data.
* @param offset target position in the bases array into which bytes should be written.
* @param length number of bases to read from the stream.
* @throws IOException if an I/O error occurs.
*/
public void read(byte[] bwt, int offset, int length) throws IOException {
int bufferWidth = ((bwt.length+basesPerPackedWord-1)/basesPerPackedWord)*bytesPerInteger;
ByteBuffer buffer = ByteBuffer.allocate(bufferWidth).order(byteOrder);
targetInputChannel.read(buffer);
targetInputChannel.position(targetInputChannel.position()+buffer.remaining());
buffer.flip();
int packedWord = 0;
int i = 0;
while(i < length) {
if(i % basesPerPackedWord == 0) packedWord = buffer.getInt();
int position = basesPerPackedWord - i%basesPerPackedWord - 1;
bwt[offset+i++] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3));
}
}
}

View File

@ -0,0 +1,140 @@
package org.broadinstitute.sting.alignment.reference.packing;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
* A general-purpose stream for writing packed bases.
*
* @author mhanna
* @version 0.1
*/
public class BasePackedOutputStream<T> {
/**
* Type of object to pack.
*/
private final Class<T> type;
/**
* How many bases can be stored in the given data structure?
*/
private final int basesPerType;
/**
* Ultimate target for the packed bases.
*/
private final OutputStream targetOutputStream;
/**
* A fixed-size buffer for word-packed data.
*/
private final ByteBuffer buffer;
public BasePackedOutputStream( Class<T> type, File outputFile, ByteOrder byteOrder ) throws FileNotFoundException {
this(type,new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder);
}
/**
* Write packed bases to the given output stream.
* @param type Type of data to pack bases into.
* @param outputStream Output stream to which to write packed bases.
* @param byteOrder Switch between big endian / little endian when reading / writing files.
*/
public BasePackedOutputStream( Class<T> type, OutputStream outputStream, ByteOrder byteOrder) {
this.targetOutputStream = outputStream;
this.type = type;
basesPerType = PackUtils.bitsInType(type)/PackUtils.BITS_PER_BASE;
this.buffer = ByteBuffer.allocate(basesPerType/PackUtils.ALPHABET_SIZE).order(byteOrder);
}
/**
* Writes the given base to the output stream. Will write only this base; no packing will be performed.
* @param base List of bases to write.
* @throws IOException if an I/O error occurs.
*/
public void write( int base ) throws IOException {
write( new byte[] { (byte)base } );
}
/**
* Writes an array of bases to the target output stream.
* @param bases List of bases to write.
* @throws IOException if an I/O error occurs.
*/
public void write( byte[] bases ) throws IOException {
write(bases,0,bases.length);
}
/**
* Writes a subset of the array of bases to the output stream.
* @param bases List of bases to write.
* @param offset site at which to start writing.
* @param length number of bases to write.
* @throws IOException if an I/O error occurs.
*/
public void write( byte[] bases, int offset, int length ) throws IOException {
int packedBases = 0;
int positionInPack = 0;
for( int base = offset; base < offset+length; base++ ) {
packedBases = packBase(bases[base], packedBases, positionInPack);
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
positionInPack = ++positionInPack % basesPerType;
if( positionInPack == 0 ) {
writePackedBases(packedBases);
packedBases = 0;
}
}
if( positionInPack > 0 )
writePackedBases(packedBases);
}
/**
* Flush the contents of the OutputStream to disk.
* @throws IOException if an I/O error occurs.
*/
public void flush() throws IOException {
targetOutputStream.flush();
}
/**
* Closes the given output stream.
* @throws IOException if an I/O error occurs.
*/
public void close() throws IOException {
targetOutputStream.close();
}
/**
* Pack the given base into the basepack.
* @param base The base to pack.
* @param basePack Target for the pack operation.
* @param position Position within the pack to which to add the base.
* @return The packed integer.
*/
private int packBase( byte base, int basePack, int position ) {
basePack |= (PackUtils.packBase(base) << 2*(basesPerType-position-1));
return basePack;
}
/**
* Write the given packed base structure to the output file.
* @param packedBases Packed bases to write.
* @throws IOException on error writing to the file.
*/
private void writePackedBases(int packedBases) throws IOException {
buffer.rewind();
if( type == Integer.class )
buffer.putInt(packedBases);
else if( type == Byte.class )
buffer.put((byte)packedBases);
else
throw new ReviewedStingException("Cannot pack bases into type " + type.getName());
targetOutputStream.write(buffer.array());
}
}

View File

@ -0,0 +1,64 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment.reference.packing;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import java.io.File;
import java.io.IOException;
/**
* Generate a .PAC file from a given reference.
*
* @author hanna
* @version 0.1
*/
public class CreatePACFromReference {
public static void main( String argv[] ) throws IOException {
if( argv.length != 3 ) {
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output pac> <output rpac>");
return;
}
// Read in the first sequence in the input file
String inputFileName = argv[0];
File inputFile = new File(inputFileName);
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
ReferenceSequence sequence = reference.nextSequence();
// Target file for output
PackUtils.writeReferenceSequence( new File(argv[1]), sequence.getBases() );
// Reverse the bases in the reference
PackUtils.reverse(sequence.getBases());
// Target file for output
PackUtils.writeReferenceSequence( new File(argv[2]), sequence.getBases() );
}
}

View File

@ -0,0 +1,135 @@
package org.broadinstitute.sting.alignment.reference.packing;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteOrder;
/**
* Utilities designed for packing / unpacking bases.
*
* @author mhanna
* @version 0.1
*/
public class PackUtils {
/**
* How many possible bases can be encoded?
*/
public static final int ALPHABET_SIZE = 4;
/**
* How many bits does it take to store a single base?
*/
public static final int BITS_PER_BASE = (int)(Math.log(ALPHABET_SIZE)/Math.log(2));
/**
* How many bits fit into a single byte?
*/
public static final int BITS_PER_BYTE = 8;
/**
* Writes a reference sequence to a PAC file.
* @param outputFile Filename for the PAC file.
* @param referenceSequence Reference sequence to write.
* @throws IOException If there's a problem writing to the output file.
*/
public static void writeReferenceSequence( File outputFile, byte[] referenceSequence ) throws IOException {
OutputStream outputStream = new FileOutputStream(outputFile);
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
basePackedOutputStream.write(referenceSequence);
outputStream.write(referenceSequence.length%PackUtils.ALPHABET_SIZE);
outputStream.close();
}
/**
* How many bits can a given type hold?
* @param type Type to test.
* @return Number of bits that the given type can hold.
*/
public static int bitsInType( Class<?> type ) {
try {
long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1;
long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1;
if( typeSize > intTypeSize )
throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName());
return (int)(Math.log(typeSize)/Math.log(2));
}
catch( NoSuchFieldException ex ) {
throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex);
}
catch( IllegalAccessException ex ) {
throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex);
}
}
/**
* Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b.
* @param base ASCII value for the base to pack.
* @return A byte from 0-3 indicating the base's packed value.
*/
public static byte packBase(byte base) {
switch( base ) {
case 'A':
return 0;
case 'C':
return 1;
case 'G':
return 2;
case 'T':
return 3;
default:
throw new ReviewedStingException("Unknown base type: " + base);
}
}
/**
* Converts a two-bit representation of a base into an ASCII representation of a base.
* @param pack Byte from 0-3 indicating which base is represented.
* @return An ASCII value representing the packed base.
*/
public static byte unpackBase(byte pack) {
switch( pack ) {
case 0:
return 'A';
case 1:
return 'C';
case 2:
return 'G';
case 3:
return 'T';
default:
throw new ReviewedStingException("Unknown pack type: " + pack);
}
}
/**
* Reverses an unpacked sequence of bases.
* @param bases bases to reverse.
*/
public static void reverse( byte[] bases ) {
for( int i = 0, j = bases.length-1; i < j; i++, j-- ) {
byte temp = bases[j];
bases[j] = bases[i];
bases[i] = temp;
}
}
/**
* Given a structure of size <code>size</code> that should be split
* into <code>partitionSize</code> partitions, how many partitions should
* be created? Size of last partition will be <= partitionSize.
* @param size Total size of the data structure.
* @param partitionSize Size of an individual partition.
* @return Number of partitions that would be created.
*/
public static int numberOfPartitions( long size, long partitionSize ) {
return (int)((size+partitionSize-1) / partitionSize);
}
}

View File

@ -0,0 +1,104 @@
package org.broadinstitute.sting.alignment.reference.packing;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
/**
* Read a set of integers packed into
*
* @author mhanna
* @version 0.1
*/
public class UnsignedIntPackedInputStream {
/**
* Ultimate target for the occurrence array.
*/
private final FileInputStream targetInputStream;
/**
* Target channel from which to pull file data.
*/
private final FileChannel targetInputChannel;
/**
* The byte order in which integer input data appears.
*/
private final ByteOrder byteOrder;
/**
* How many bytes are required to store an integer?
*/
private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE;
/**
* Create a new PackedIntInputStream, writing to the given target file.
* @param inputFile target input file.
* @param byteOrder Endianness to use when writing a list of integers.
* @throws java.io.IOException if an I/O error occurs.
*/
public UnsignedIntPackedInputStream(File inputFile, ByteOrder byteOrder) throws IOException {
this(new FileInputStream(inputFile),byteOrder);
}
/**
* Read ints from the given InputStream.
* @param inputStream Input stream from which to read ints.
* @param byteOrder Endianness to use when writing a list of integers.
*/
public UnsignedIntPackedInputStream(FileInputStream inputStream, ByteOrder byteOrder) {
this.targetInputStream = inputStream;
this.targetInputChannel = inputStream.getChannel();
this.byteOrder = byteOrder;
}
/**
* Read a datum from the input stream.
* @return The next input datum in the stream.
* @throws IOException if an I/O error occurs.
*/
public long read() throws IOException {
long[] data = new long[1];
read(data);
return data[0];
}
/**
* Read the data from the input stream.
* @param data placeholder for input data.
* @throws IOException if an I/O error occurs.
*/
public void read( long[] data ) throws IOException {
read( data, 0, data.length );
}
/**
* Read the data from the input stream, starting at the given offset.
* @param data placeholder for input data.
* @param offset place in the array to start reading in data.
* @param length number of ints to read in.
* @throws IOException if an I/O error occurs.
*/
public void read( long[] data, int offset, int length ) throws IOException {
ByteBuffer readBuffer = ByteBuffer.allocate(bytesPerInteger*length).order(byteOrder);
targetInputChannel.read(readBuffer,targetInputChannel.position());
readBuffer.flip();
targetInputChannel.position(targetInputChannel.position()+readBuffer.remaining());
int i = 0;
while(i < length)
data[offset+i++] = readBuffer.getInt() & 0xFFFFFFFFL;
}
/**
* Closes the given output stream.
* @throws IOException if an I/O error occurs.
*/
public void close() throws IOException {
targetInputStream.close();
}
}

View File

@ -0,0 +1,121 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment.reference.packing;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
* Writes an list of integers to the output file.
*
* @author mhanna
* @version 0.1
*/
public class UnsignedIntPackedOutputStream {
/**
* Ultimate target for the occurrence array.
*/
private final OutputStream targetOutputStream;
/**
* A fixed-size buffer for int-packed data.
*/
private final ByteBuffer buffer;
/**
* Create a new PackedIntOutputStream, writing to the given target file.
* @param outputFile target output file.
* @param byteOrder Endianness to use when writing a list of integers.
* @throws IOException if an I/O error occurs.
*/
public UnsignedIntPackedOutputStream(File outputFile, ByteOrder byteOrder) throws IOException {
this(new FileOutputStream(outputFile),byteOrder);
}
/**
* Write packed ints to the given OutputStream.
* @param outputStream Output stream to which to write packed ints.
* @param byteOrder Endianness to use when writing a list of integers.
*/
public UnsignedIntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) {
this.targetOutputStream = outputStream;
buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder);
}
/**
* Write the data to the output stream.
* @param datum datum to write.
* @throws IOException if an I/O error occurs.
*/
public void write( long datum ) throws IOException {
buffer.rewind();
buffer.putInt((int)datum);
targetOutputStream.write(buffer.array());
}
/**
* Write the data to the output stream.
* @param data data to write. occurrences.length must match alphabet size.
* @throws IOException if an I/O error occurs.
*/
public void write( long[] data ) throws IOException {
for(long datum: data)
write(datum);
}
/**
* Write the given chunk of data to the input stream.
* @param data data to write.
* @param offset position at which to start.
* @param length number of ints to write.
* @throws IOException if an I/O error occurs.
*/
public void write( long[] data, int offset, int length ) throws IOException {
for( int i = offset; i < offset+length; i++ )
write(data[i]);
}
/**
* Flush the contents of the OutputStream to disk.
* @throws IOException if an I/O error occurs.
*/
public void flush() throws IOException {
targetOutputStream.flush();
}
/**
* Closes the given output stream.
* @throws IOException if an I/O error occurs.
*/
public void close() throws IOException {
targetOutputStream.close();
}
}

View File

@ -30,10 +30,16 @@ import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import java.io.File;
import java.util.Collection;
import java.util.Set;
import java.util.TreeSet;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
/**
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear
* in the input file. It can dynamically merge the contents of multiple input BAM files, resulting
@ -52,6 +58,13 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
String platform = null; // E.g. ILLUMINA, 454
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
int nReadsToPrint = -1;
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
public Set<File> sampleFile = new TreeSet<File>();
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
public Set<String> sampleNames = new TreeSet<String>();
private TreeSet<String> samplesToChoose = new TreeSet<String>();
private boolean SAMPLES_SPECIFIED = false;
/**
* The initialize function.
@ -59,6 +72,20 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
public void initialize() {
if ( platform != null )
platform = platform.toUpperCase();
Collection<String> samplesFromFile;
if (!sampleFile.isEmpty()) {
samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile);
samplesToChoose.addAll(samplesFromFile);
}
if (!sampleNames.isEmpty())
samplesToChoose.addAll(sampleNames);
if(!samplesToChoose.isEmpty()) {
SAMPLES_SPECIFIED = true;
}
}
/**
@ -85,6 +112,14 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform))
return false;
}
if (SAMPLES_SPECIFIED ) {
// user specified samples to select
// todo - should be case-agnostic but for simplicity and speed this is ignored.
// todo - can check at initialization intersection of requested samples and samples in BAM header to further speedup.
if (!samplesToChoose.contains(read.getReadGroup().getSample()))
return false;
}
// check if we've reached the output limit
if ( nReadsToPrint == 0 ) {

View File

@ -24,11 +24,27 @@ public class IndelType implements InfoFieldAnnotation, ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
int run;
if ( vc.isIndel() && vc.isBiallelic() ) {
if (vc.isMixed()) {
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%s", "MIXED"));
return map;
}
else if ( vc.isIndel() ) {
String type="";
ArrayList<Integer> inds = IndelUtils.findEventClassificationIndex(vc, ref);
for (int k : inds) {
type = type+ IndelUtils.getIndelClassificationName(k)+".";
if (!vc.isBiallelic())
type = "MULTIALLELIC_INDEL";
else {
if (vc.isInsertion())
type = "INS.";
else if (vc.isDeletion())
type = "DEL.";
else
type = "OTHER.";
ArrayList<Integer> inds = IndelUtils.findEventClassificationIndex(vc, ref);
for (int k : inds) {
type = type+ IndelUtils.getIndelClassificationName(k)+".";
}
}
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%s", type));

View File

@ -1,50 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.sequenom;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
/**
* Create a mask for use with the PickSequenomProbes walker.
*/
public class CreateSequenomMask extends RodWalker<Integer, Integer> {
@Output
PrintStream out;
public void initialize() {}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return 0;
int result = 0;
for ( VariantContext vc : tracker.getAllVariantContexts(ref) ) {
if ( vc.isSNP() ) {
GenomeLoc loc = context.getLocation();
out.println(loc.getContig() + "\t" + (loc.getStart()-1) + "\t" + loc.getStop());
result = 1;
break;
}
}
return result;
}
public Integer reduceInit() {
return 0;
}
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
public void onTraversalDone(Integer sum) {
logger.info("Found " + sum + " masking sites.");
}
}

View File

@ -1,332 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.sequenom;
import org.broad.tribble.bed.BEDCodec;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
/**
* Generates Sequenom probe information given a single variant track. Emitted is the variant
* along with the 200 reference bases on each side of the variant.
*/
@WalkerName("PickSequenomProbes")
@Requires(value={DataSource.REFERENCE})
@Reference(window=@Window(start=-200,stop=200))
public class PickSequenomProbes extends RodWalker<String, String> {
@Output
PrintStream out;
@Argument(required=false, shortName="snp_mask", doc="positions to be masked with N's")
protected String SNP_MASK = null;
@Argument(required=false, shortName="project_id",doc="If specified, all probenames will be prepended with 'project_id|'")
String project_id = null;
@Argument(required = false, shortName="omitWindow", doc = "If specified, the window appender will be omitted from the design files (e.g. \"_chr:start-stop\")")
boolean omitWindow = false;
@Argument(required = false, fullName="usePlinkRODNamingConvention", shortName="nameConvention",doc="Use the naming convention defined in PLINKROD")
boolean useNamingConvention = false;
@Argument(required = false, fullName="noMaskWindow",shortName="nmw",doc="Do not mask bases within X bases of an event when designing probes")
int noMaskWindow = 0;
@Argument(required = false, shortName="counter", doc = "If specified, unique count id (ordinal number) is added to the end of each assay name")
boolean addCounter = false;
private byte [] maskFlags = new byte[401];
private LocationAwareSeekableRODIterator snpMaskIterator=null;
private GenomeLoc positionOfLastVariant = null;
private int cnt = 0;
private int discarded = 0;
VariantCollection VCs ; // will keep a set of distinct variants at a given site
private List<GenomeLoc> processedVariantsInScope = new LinkedList<GenomeLoc>();
public void initialize() {
if ( SNP_MASK != null ) {
logger.info("Loading SNP mask... ");
ReferenceOrderedData snp_mask;
//if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) {
RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe);
RMDTrack track = builder.createInstanceOfTrack(BEDCodec.class, new File(SNP_MASK));
snpMaskIterator = new SeekableRODIterator(track.getHeader(),
track.getSequenceDictionary(),
getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),
getToolkit().getGenomeLocParser(),
track.getIterator());
//} else {
// // TODO: fix me when Plink is back
// throw new IllegalArgumentException("We currently do not support other snp_mask tracks (like Plink)");
//}
}
VCs = new VariantCollection();
}
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return "";
logger.debug("Probing " + ref.getLocus() + " " + ref.getWindow());
VCs.clear();
VCs.addAll( tracker.getAllVariantContexts(ref), ref.getLocus() );
discarded += VCs.discarded();
if ( VCs.size() == 0 ) {
logger.debug(" Context empty");
return "";
}
if ( VCs.size() > 1 ) {
logger.debug(" "+VCs.size()+ " variants at the locus");
}
// System.out.print("At locus "+ref.getLocus()+": ");
// for ( VariantContext vc : VCs ) {
// System.out.println(vc.toString());
// }
// little optimization: since we may have few events at the current site on the reference,
// we are going to make sure we compute the mask and ref bases only once for each location and only if we need to
boolean haveMaskForWindow = false;
boolean haveBasesForWindow = false;
String leading_bases = null;
String trailing_bases = null;
StringBuilder assaysForLocus = new StringBuilder(""); // all assays for current locus will be collected here (will be multi-line if multiple events are assayed)
// get all variant contexts!!!!
for ( VariantContext vc : VCs ) {
// we can only deal with biallelic sites for now
if ( !vc.isBiallelic() ) {
logger.debug(" Not biallelic; skipped");
continue;
}
// we don't want to see the same multi-base event (deletion, DNP etc) multiple times.
// All the vcs we are currently seeing are clearly on the same contig as the current reference
// poisiton (or we would not see them at all!). All we need to check is if the vc starts at the
// current reference position (i.e. it is the first time we see it) or not (i.e. we saw it already).
if ( ref.getLocus().getStart() != vc.getStart() )
continue;
if ( ! haveMaskForWindow ) {
String contig = context.getLocation().getContig();
int offset = context.getLocation().getStart();
int true_offset = offset - 200;
// we have variant; let's load all the snps falling into the current window and prepare the mask array.
// we need to do it only once per window, regardless of how many vcs we may have at this location!
if ( snpMaskIterator != null ) {
// clear the mask
for ( int i = 0 ; i < 401; i++ )
maskFlags[i] = 0;
RODRecordList snpList = snpMaskIterator.seekForward(getToolkit().getGenomeLocParser().createGenomeLoc(contig,offset-200,offset+200));
if ( snpList != null && snpList.size() != 0 ) {
Iterator<GATKFeature> snpsInWindow = snpList.iterator();
int i = 0;
while ( snpsInWindow.hasNext() ) {
GenomeLoc snp = snpsInWindow.next().getLocation();
// we don't really want to mask out multi-base indels
if ( snp.size() > 1 )
continue;
logger.debug(" SNP at "+snp.getStart());
int offsetInWindow = (int)(snp.getStart() - true_offset);
maskFlags[offsetInWindow] = 1;
}
}
}
haveMaskForWindow = true; // if we use masking, we will probably need to recompute the window...
}
if ( ! haveBasesForWindow ) {
byte[] context_bases = ref.getBases();
for (int i = 0; i < 401; i++) {
if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) {
context_bases[i] = 'N';
}
}
leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200));
trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401));
// masked bases are not gonna change for the current window, unless we use windowed masking;
// in the latter case the bases (N's) will depend on the event we are currently looking at,
// so we better recompute..
if ( noMaskWindow == 0 ) haveBasesForWindow = true;
}
// below, build single assay line for the current VC:
String assay_sequence;
if ( vc.isSNP() )
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
else if ( vc.isMNP() )
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1);
else if ( vc.isInsertion() )
assay_sequence = leading_bases + (char)ref.getBase() + "[-/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
else if ( vc.isDeletion() )
assay_sequence = leading_bases + (char)ref.getBase() + "[" + new String(vc.getReference().getBases()) + "/-]" + trailing_bases.substring(vc.getReference().length());
else
continue;
StringBuilder assay_id = new StringBuilder();
if ( project_id != null ) {
assay_id.append(project_id);
assay_id.append('|');
}
if ( useNamingConvention ) {
assay_id.append('c');
assay_id.append(context.getLocation().toString().replace(":","_p"));
} else {
assay_id.append(context.getLocation().toString().replace(':','_'));
}
if ( vc.isInsertion() ) assay_id.append("_gI");
else if ( vc.isDeletion()) assay_id.append("_gD");
if ( ! omitWindow ) {
assay_id.append("_");
assay_id.append(ref.getWindow().toString().replace(':', '_'));
}
++cnt;
if ( addCounter ) assay_id.append("_"+cnt);
assaysForLocus.append(assay_id);
assaysForLocus.append('\t');
assaysForLocus.append(assay_sequence);
assaysForLocus.append('\n');
}
return assaysForLocus.toString();
}
public String reduceInit() {
return "";
}
public String reduce(String data, String sum) {
out.print(data);
return "";
}
private int getNoMaskWindowRightEnd(VariantContext vc, int window) {
if ( window == 0 ) {
return 0;
}
if ( vc.isInsertion() ) {
return window-1;
}
int max = 0;
for (Allele a : vc.getAlleles() ) {
if ( vc.isInsertion() ) {
logger.debug("Getting length of allele "+a.toString()+" it is "+a.getBases().length+" (ref allele is "+vc.getReference().toString()+")");
}
if ( a.getBases().length > max ) {
max = a.getBases().length;
}
}
return max+window-1;
}
public void onTraversalDone(String sum) {
logger.info(cnt+" assay seqences generated");
logger.info(discarded+" events were found to be duplicates and discarded (no redundant assays generated)");
}
static class EventComparator implements Comparator<VariantContext> {
public int compare(VariantContext o1, VariantContext o2) {
// if variants start at different positions, they are different. All we actually
// care about is detecting the variants that are strictly the same; the actual ordering of distinct variants
// (which one we deem less and which one greater) is utterly unimportant. We just need to be consistent.
if ( o1.getStart() < o2.getStart() ) return -1;
if ( o1.getStart() > o2.getStart() ) return 1;
if ( o1.getType() != o2.getType() ) return o1.getType().compareTo(o2.getType());
int refComp = o1.getReference().compareTo(o2.getReference());
if ( refComp != 0 ) return refComp;
return o1.getAlternateAllele(0).compareTo(o2.getAlternateAllele(0));
}
}
static class VariantCollection implements Iterable<VariantContext> {
TreeSet<VariantContext> variants = new TreeSet(new EventComparator());
int discarded = 0;
public void add(VariantContext vc, GenomeLoc current) {
if ( vc.getStart() != current.getStart() ) return; // we add only variants that start at current locus
// note that we do not check chr here, since the way this class is used, the mathod is always called with
// VCs coming from the same metadata tracker, so they simply can not be on different chrs!
if ( !vc.isBiallelic() ) {
logger.info(" Non-biallelic variant encountered; skipped");
return;
}
if ( variants.add(vc) == false ) discarded++;
}
public void addAll(Collection<VariantContext> c, GenomeLoc current) {
for ( VariantContext vc : c ) add(vc,current);
}
public void clear() {
variants.clear();
discarded = 0;
}
public int discarded() { return discarded; }
public int size() { return variants.size(); }
public Iterator<VariantContext> iterator() { return variants.iterator(); }
}
}

View File

@ -0,0 +1,413 @@
package org.broadinstitute.sting.gatk.walkers.validation;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.alignment.Alignment;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 6/13/11
* Time: 2:12 PM
* To change this template use File | Settings | File Templates.
*/
@Requires(value={DataSource.REFERENCE}, referenceMetaData={@RMD(name="ProbeIntervals",type=TableFeature.class),
@RMD(name="ValidateAlleles",type=VariantContext.class),@RMD(name="MaskAlleles",type=VariantContext.class)})
public class ValidationAmplicons extends RodWalker<Integer,Integer> {
@Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false)
boolean lowerCaseSNPs = false;
@Argument(doc="Size of the virtual primer to use for lower-casing regions with low specificity",fullName="virtualPrimerSize",required=false)
int virtualPrimerSize = 20;
@Argument(doc="Monomorphic sites in the mask file will be treated as filtered",fullName="filterMonomorphic",required=false)
boolean filterMonomorphic = false;
@Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false)
boolean doNotUseBWA = false;
GenomeLoc prevInterval;
GenomeLoc allelePos;
String probeName;
StringBuilder sequence;
StringBuilder rawSequence;
boolean sequenceInvalid;
List<String> invReason;
int indelCounter;
@Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
"generated by bwa index -d bwtsw. If unspecified, will default " +
"to the reference specified via the -R argument.",required=false)
private File targetReferenceFile = null;
@Output
PrintStream out;
BWACAligner aligner = null;
private SAMFileHeader header = null;
public void initialize() {
if ( ! doNotUseBWA ) {
if(targetReferenceFile == null)
targetReferenceFile = getToolkit().getArguments().referenceFile;
BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
BWAConfiguration configuration = new BWAConfiguration();
aligner = new BWACAligner(bwtFiles,configuration);
header = new SAMFileHeader();
SAMSequenceDictionary referenceDictionary =
ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
header.setSequenceDictionary(referenceDictionary);
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
}
}
public Integer reduceInit() {
prevInterval = null;
sequence = null;
rawSequence = null;
sequenceInvalid = false;
probeName = null;
invReason = null;
indelCounter = 0;
return 0;
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null || ! tracker.hasROD("ProbeIntervals")) { return null; }
GenomeLoc interval = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getLocation();
//logger.debug(interval);
if ( prevInterval == null || ! interval.equals(prevInterval) ) {
// we're in a new interval, we should:
// 1) print out previous data
// 2) reset internal data
// 3) instantiate traversal of this interval
// step 1:
if ( prevInterval != null ) {
// there was a previous interval
validateSequence(); // ensure the sequence in the region is valid
// next line removed in favor of the one after
if ( doNotUseBWA ) {
lowerRepeats(); // change repeats in sequence to lower case
} else {
lowerNonUniqueSegments();
}
print(); // print out the fasta sequence
}
// step 2:
prevInterval = interval;
allelePos = null;
sequence = new StringBuilder();
rawSequence = new StringBuilder();
sequenceInvalid = false;
invReason = new LinkedList<String>();
logger.debug(Utils.join("\t",((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues()));
probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getValue(1);
indelCounter = 0;
}
// step 3 (or 1 if not new):
// build up the sequence
VariantContext mask = tracker.getVariantContext(ref,"MaskAlleles",ref.getLocus());
VariantContext validate = tracker.getVariantContext(ref,"ValidateAlleles",ref.getLocus());
if ( mask == null && validate == null ) {
if ( indelCounter > 0 ) {
sequence.append('N');
indelCounter--;
} else {
sequence.append(Character.toUpperCase((char) ref.getBase()));
}
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
} else if ( validate != null ) {
// doesn't matter if there's a mask here too -- this is what we want to validate
if ( validate.isFiltered() ) {
logger.warn("You are attempting to validate a filtered site. Why are you attempting to validate a filtered site? You should not be attempting to validate a filtered site.");
sequenceInvalid = true;
invReason.add("SITE_IS_FILTERED");
}
if ( validate.isIndel() ) {
sequence.append(Character.toUpperCase((char)ref.getBase()));
rawSequence.append(Character.toUpperCase((char)ref.getBase()));
}
sequence.append('[');
sequence.append(validate.getAlternateAllele(0).toString());
sequence.append('/');
sequence.append(validate.getReference().toString());
sequence.append(']');
// do this to the raw sequence to -- the indeces will line up that way
rawSequence.append('[');
rawSequence.append(validate.getAlternateAllele(0).getBaseString());
rawSequence.append('/');
rawSequence.append(validate.getReference().getBaseString());
rawSequence.append(']');
allelePos = ref.getLocus();
if ( indelCounter > 0 ) {
logger.warn("An indel event overlaps the event to be validated. This completely invalidates the probe.");
sequenceInvalid = true;
invReason.add("INDEL_OVERLAPS_VALIDATION_SITE");
if ( validate.isSNP() ) {
indelCounter--;
} else {
indelCounter -= validate.getEnd()-validate.getStart();
}
}
} else /* (mask != null && validate == null ) */ {
if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) {
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
sequenceInvalid = true;
invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION");
// note: indelCounter could be > 0 (could have small deletion within larger one). This always selects
// the larger event.
int indelCounterNew = mask.isInsertion() ? 2 : mask.getEnd()-mask.getStart();
if ( indelCounterNew > indelCounter ) {
indelCounter = indelCounterNew;
}
//sequence.append((char) ref.getBase());
//sequence.append(mask.isInsertion() ? 'I' : 'D');
sequence.append("N");
indelCounter--;
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
} else if ( indelCounter > 0 ) {
// previous section resets the indel counter. Doesn't matter if there's a SNP underlying this, we just want to append an 'N' and move on.
sequence.append('N');
indelCounter--;
rawSequence.append(Character.toUpperCase((char)ref.getBase()));
} else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )){
logger.debug("SNP in mask found at " + ref.getLocus().toString());
if ( lowerCaseSNPs ) {
sequence.append(Character.toLowerCase((char) ref.getBase()));
} else {
sequence.append((char) BaseUtils.N);
}
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
} else if ( mask.isSNP() ) {
logger.debug("SNP in mask found at "+ref.getLocus().toString()+" but was either filtered or monomorphic");
sequence.append((Character.toUpperCase((char) ref.getBase())));
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
}
}
return 1;
}
public Integer reduce(Integer i, Integer j) {
return 0;
}
public void onTraversalDone(Integer fin ) {
validateSequence();
if ( doNotUseBWA ) {
lowerRepeats();
} else {
lowerNonUniqueSegments();
}
print();
}
public void validateSequence() {
// code for ensuring primer sequence is valid goes here
// validate that there are no masked sites near to the variant site
String seq = sequence.toString();
int start = seq.indexOf('[') - 4;
int end = seq.indexOf(']') + 5;
if ( start < 50 ) {
logger.warn("There is not enough sequence before the start position of the probed allele for adequate probe design. This site will not be designed.");
sequenceInvalid = true;
invReason.add("START_TOO_CLOSE");
} else if ( end > seq.length() - 50 ) {
logger.warn("There is not enough sequence after the end position of the probed allele fore adequate probe design. This site will not be desinged. ");
sequenceInvalid = true;
invReason.add("END_TOO_CLOSE");
} else {
boolean maskNearVariantSite = false;
for ( int i = start; i < end; i++ ) {
maskNearVariantSite |= (seq.charAt(i) == 'N' || Character.isLowerCase(seq.charAt(i)));
}
if ( maskNearVariantSite ) {
logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed.");
sequenceInvalid = true;
invReason.add("VARIANT_TOO_NEAR_PROBE");
}
}
if ( seq.indexOf("[") != seq.lastIndexOf("[") ) {
logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap.");
sequenceInvalid = true;
invReason.add("MULTIPLE_PROBES");
}
if ( seq.indexOf("[") < 0 ) {
logger.warn("No variants in region were found. This site will not be designed.");
sequenceInvalid = true;
invReason.add("NO_VARIANTS_FOUND");
}
}
public void lowerNonUniqueSegments() {
if ( ! invReason.contains("MULTIPLE_PROBES") && !invReason.contains("NO_VARIANTS_FOUND") ) {
String leftFlank = rawSequence.toString().split("\\[")[0];
String rightFlank = rawSequence.toString().split("\\]")[1];
List<Integer> badLeft = getBadIndeces(leftFlank);
List<Integer> badRight = getBadIndeces(rightFlank);
// propagate lowercases into the printed sequence
for ( int idx = 0; idx < leftFlank.length(); idx++ ) {
while ( badLeft.size() > 0 && idx > badLeft.get(0) + virtualPrimerSize ) {
badLeft.remove(0);
}
if ( badLeft.size() > 0 && badLeft.get(0) <= idx && idx <= badLeft.get(0) + virtualPrimerSize ) {
sequence.setCharAt(idx,Character.toLowerCase(sequence.charAt(idx)));
}
}
int offset = 1 + rawSequence.indexOf("]");
for ( int i= 0; i < rightFlank.length(); i++ ) {
int idx = i + offset;
while ( badRight.size() > 0 && i > badRight.get(0) + virtualPrimerSize ) {
//logger.debug("Removing "+badRight.get(0)+" because "+(badRight.get(0)+virtualPrimerSize)+" < "+i);
badRight.remove(0);
}
if ( badRight.size() > 0 && badRight.get(0) <= i && i <= badRight.get(0) + virtualPrimerSize ) {
//logger.debug("Resetting character on right flank: "+idx+" "+i+" offset="+offset);
//logger.debug(sequence);
sequence.setCharAt(idx,Character.toLowerCase(sequence.charAt(idx)));
//logger.debug(sequence);
}
}
}
}
private List<Integer> getBadIndeces(String sequence) {
List<Integer> badLeftIndeces = new ArrayList<Integer>(sequence.length()-virtualPrimerSize);
for ( int i = 0; i < sequence.length()-virtualPrimerSize ; i++ ) {
String toAlign = sequence.substring(i,i+virtualPrimerSize);
Iterable<Alignment[]> allAlignments = aligner.getAllAlignments(toAlign.getBytes());
for ( Alignment[] alignments : allAlignments ) {
if ( alignments.length > 1 ) {
if ( alignments[0].getMappingQuality() == 0 ) {
// this region is bad -- multiple MQ alignments
badLeftIndeces.add(i);
}
}
}
}
return badLeftIndeces;
}
/**
* Note- this is an old function - a proxy for identifying regions with low specificity to genome. Saved in case the alignment-based version
* turns out to be worse than just doing a simple repeat-lowering method.
*/
public void lowerRepeats() {
// convert to lower case low-complexity repeats, e.g. tandem k-mers
final int K_LIM = 8;
String seq = sequence.toString();
StringBuilder newSequence = new StringBuilder();
int start_pos = 0;
while( start_pos < seq.length() ) {
boolean broke = false;
for ( int length = K_LIM; length > 1; length -- ) {
//logger.debug(String.format("start1: %d end1: %d start2: %d end2: %d str: %d",start_pos,start_pos+length,start_pos+length,start_pos+2*length,seq.length()));
if ( start_pos + 2*length> seq.length() ) {
continue;
}
if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) {
newSequence.append(seq.substring(start_pos,start_pos+length).toLowerCase());
newSequence.append(seq.substring(start_pos+length,start_pos+2*length).toLowerCase());
start_pos += 2*length;
broke = true;
break;
}
}
if ( ! broke ) {
newSequence.append(seq.substring(start_pos,start_pos+1));
start_pos++;
}
}
if ( seq.indexOf("[") != seq.lastIndexOf("[") ) {
return;
}
sequence = newSequence;
}
public boolean equalsIgnoreNs(String one, String two) {
if ( one.length() != two.length() ) { return false; }
for ( int idx = 0; idx < one.length(); idx++ ) {
if ( Character.toUpperCase(one.charAt(idx)) != Character.toUpperCase(two.charAt(idx)) ) {
if ( Character.toUpperCase(one.charAt(idx)) != 'N' && Character.toUpperCase(two.charAt(idx)) != 'N' ) {
return false;
}
}
}
//logger.debug(String.format("one: %s two: %s",one,two));
return true;
}
public void print() {
String valid;
if ( sequenceInvalid ) {
valid = "";
while ( invReason.size() > 0 ) {
String reason = invReason.get(0);
invReason.remove(reason);
int num = 1;
while ( invReason.contains(reason) ) {
num++;
invReason.remove(reason);
}
valid += String.format("%s=%d,",reason,num);
}
} else {
valid = "Valid";
}
String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
}
}

View File

@ -24,6 +24,16 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
@ -44,6 +54,9 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.lang.annotation.AnnotationFormatError;
import java.util.*;
/**
@ -91,6 +104,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
private boolean KEEP_AF_SPECTRUM = false;
@Hidden
@Argument(fullName="afFile", shortName="afFile", doc="The output recal file used by ApplyRecalibration", required=false)
private File AF_FILE = new File("");
@Hidden
@Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private File FAMILY_STRUCTURE_FILE = null;
@Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String FAMILY_STRUCTURE = "";
@ -113,6 +133,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false)
private boolean SELECT_INDELS = false;
@Hidden
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String outMVFile = null;
/* Private class used to store the intermediate variants in the integer random selection process */
private class RandomVariantStructure {
@ -140,7 +163,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private boolean DISCORDANCE_ONLY = false;
private boolean CONCORDANCE_ONLY = false;
private MendelianViolation mv;
private Set<MendelianViolation> mvSet = new HashSet<MendelianViolation>();
/* default name for the variant dataset (VCF) */
private final String variantRodName = "variant";
@ -155,8 +178,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private RandomVariantStructure [] variantArray;
/* Variables used for random selection with AF boosting */
private ArrayList<Double> afBreakpoints = null;
private ArrayList<Double> afBoosts = null;
double bkDelta = 0.0;
private PrintStream outMVFileStream = null;
/**
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
@ -212,10 +241,29 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
CONCORDANCE_ONLY = concordanceRodName.length() > 0;
if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceRodName);
if (MENDELIAN_VIOLATIONS)
mv = new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD);
if (MENDELIAN_VIOLATIONS) {
if ( FAMILY_STRUCTURE_FILE != null) {
try {
for ( final String line : new XReadLines( FAMILY_STRUCTURE_FILE ) ) {
MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom()))
mvSet.add(mv);
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(AF_FILE, e);
}
if (outMVFile != null)
try {
outMVFileStream = new PrintStream(outMVFile);
}
catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(outMVFile, "Can't open output file", e); }
}
else
mvSet.add(new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD));
}
else if (!FAMILY_STRUCTURE.isEmpty()) {
mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
MENDELIAN_VIOLATIONS = true;
}
@ -227,6 +275,33 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
SELECT_RANDOM_FRACTION = fractionRandom > 0;
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + fractionRandom + "% of the variants at random from the variant track");
if (KEEP_AF_SPECTRUM) {
try {
afBreakpoints = new ArrayList<Double>();
afBoosts = new ArrayList<Double>();
logger.info("Reading in AF boost table...");
boolean firstLine = false;
for ( final String line : new XReadLines( AF_FILE ) ) {
if (!firstLine) {
firstLine = true;
continue;
}
final String[] vals = line.split(" ");
double bkp = Double.valueOf(vals[0]);
double afb = Double.valueOf(vals[1]);
afBreakpoints.add(bkp);
afBoosts.add(afb);
}
bkDelta = afBreakpoints.get(0);
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(AF_FILE, e);
}
}
}
/**
@ -250,9 +325,24 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
for (VariantContext vc : vcs) {
if (MENDELIAN_VIOLATIONS) {
if (!mv.isViolation(vc)) {
break;
boolean foundMV = false;
for (MendelianViolation mv : mvSet) {
if (mv.isViolation(vc)) {
foundMV = true;
//System.out.println(vc.toString());
if (outMVFile != null)
outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " +
"childG=%s childGL=%s\n",vc.getChr(), vc.getStart(),
vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getChromosomeCount(vc.getAlternateAllele(0)),
mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(),
vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
vc.getGenotype(mv.getSampleChild()).toBriefString(),vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString() );
}
}
if (!foundMV)
break;
}
if (DISCORDANCE_ONLY) {
Collection<VariantContext> compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false);
@ -283,46 +373,59 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
if (SELECT_RANDOM_NUMBER) {
randomlyAddVariant(++variantNumber, sub, ref.getBase());
}
else if (!SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom) {
else if (!SELECT_RANDOM_FRACTION || (!KEEP_AF_SPECTRUM && GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
vcfWriter.add(sub, ref.getBase());
}
else {
if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) {
Collection<VariantContext> compVCs = tracker.getVariantContexts(ref, inputAFRodName, null, context.getLocation(), true, false);
if (compVCs.isEmpty())
return 0;
// ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
// We then pick a variant with probablity AF*desiredFraction
for (VariantContext compVC : compVCs) {
if ( compVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
String afo = compVC.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
double af;
if (afo.contains(",")) {
String[] afs = afo.split(",");
afs[0] = afs[0].substring(1,afs[0].length());
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
double af;
double afBoost = 1.0;
if (afo.contains(",")) {
String[] afs = afo.split(",");
afs[0] = afs[0].substring(1,afs[0].length());
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
double[] afd = new double[afs.length];
double[] afd = new double[afs.length];
for (int k=0; k < afd.length; k++)
afd[k] = Double.valueOf(afs[k]);
for (int k=0; k < afd.length; k++)
afd[k] = Double.valueOf(afs[k]);
af = MathUtils.arrayMax(afd);
//af = Double.valueOf(afs[0]);
af = MathUtils.arrayMax(afd);
//af = Double.valueOf(afs[0]);
}
else
af = Double.valueOf(afo);
//System.out.format("%s .. %4.4f\n",afo.toString(), af);
if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * af)
vcfWriter.add(sub, ref.getBase());
}
break; // do only one vc
else
af = Double.valueOf(afo);
// now boost af by table read from file if desired
//double bkpt = 0.0;
int bkidx = 0;
if (!afBreakpoints.isEmpty()) {
for ( Double bkpt : afBreakpoints) {
if (af < bkpt + bkDelta)
break;
else bkidx++;
}
if (bkidx >=afBoosts.size())
bkidx = afBoosts.size()-1;
afBoost = afBoosts.get(bkidx);
//System.out.formatPrin("af:%f bkidx:%d afboost:%f\n",af,bkidx,afBoost);
}
//System.out.format("%s .. %4.4f\n",afo.toString(), af);
if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * afBoost * afBoost)
vcfWriter.add(sub, ref.getBase());
}
}
}
}
@ -406,8 +509,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private boolean haveSameGenotypes(Genotype g1, Genotype g2) {
if ((g1.isCalled() && g2.isFiltered()) ||
(g2.isCalled() && g1.isFiltered()) ||
(g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))
(g2.isCalled() && g1.isFiltered()) ||
(g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))
return false;
List<Allele> a1s = g1.getAlleles();
@ -440,7 +543,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
* @param vc the VariantContext record to subset
* @param samples the samples to extract
* @return the subsetted VariantContext
*/
*/
private VariantContext subsetRecord(VariantContext vc, Set<String> samples) {
if ( samples == null || samples.isEmpty() )
return vc;
@ -450,7 +553,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
if ( samples.contains(genotypePair.getKey()) )
genotypes.add(genotypePair.getValue());
}
VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles());
HashMap<String, Object> attributes = new HashMap<String, Object>(sub.getAttributes());
@ -460,7 +563,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
Genotype g = sub.getGenotype(sample);
if (g.isNotFiltered() && g.isCalled()) {
String dp = (String) g.getAttribute("DP");
if (dp != null && ! dp.equals(VCFConstants.MISSING_DEPTH_v3) && ! dp.equals(VCFConstants.MISSING_VALUE_v4) ) {
depth += Integer.valueOf(dp);

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -33,7 +35,6 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.PrintStream;
@ -74,17 +75,29 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
getters.put("REF", new Getter() { public String get(VariantContext vc) { return vc.getReference().toString(); } });
getters.put("REF", new Getter() {
public String get(VariantContext vc) {
String x = "";
if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) {
Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY));
x=x+new String(new byte[]{refByte});
}
return x+vc.getReference().getDisplayString();
}
});
getters.put("ALT", new Getter() {
public String get(VariantContext vc) {
StringBuilder x = new StringBuilder();
int n = vc.getAlternateAlleles().size();
if ( n == 0 ) return ".";
if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) {
Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY));
x.append(new String(new byte[]{refByte}));
}
for ( int i = 0; i < n; i++ ) {
if ( i != 0 ) x.append(",");
x.append(vc.getAlternateAllele(i).toString());
x.append(vc.getAlternateAllele(i).getDisplayString());
}
return x.toString();
}
@ -168,6 +181,31 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc));
}
if (field.equals("AF") || field.equals("AC")) {
String afo = val;
double af=0;
if (afo.contains(",")) {
String[] afs = afo.split(",");
afs[0] = afs[0].substring(1,afs[0].length());
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
double[] afd = new double[afs.length];
for (int k=0; k < afd.length; k++)
afd[k] = Double.valueOf(afs[k]);
af = MathUtils.arrayMax(afd);
//af = Double.valueOf(afs[0]);
}
else
if (!afo.equals("NA"))
af = Double.valueOf(afo);
val = Double.toString(af);
}
vals.add(val);
}

View File

@ -0,0 +1,27 @@
package org.broadinstitute.sting.alignment;
import org.testng.annotations.Test;
import org.broadinstitute.sting.WalkerTest;
import java.util.Arrays;
/**
* Integration tests for the aligner.
*
* @author mhanna
* @version 0.1
*/
public class AlignerIntegrationTest extends WalkerTest {
@Test
public void testBasicAlignment() {
String md5 = "34eb4323742999d6d250a0aaa803c6d5";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + GATKDataLocation + "bwa/human_b36_both.fasta" +
" -T Align" +
" -I " + validationDataLocation + "NA12878_Pilot1_20.trimmed.unmapped.bam" +
" -o %s",
1, // just one output file
Arrays.asList(md5));
executeTest("testBasicAlignment", spec);
}
}

View File

@ -1,34 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.sequenom;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
public class PickSequenomProbesIntegrationTest extends WalkerTest {
@Test
public void testProbes() {
String testVCF = validationDataLocation + "complexExample.vcf4";
String testArgs = "-R " + b36KGReference + " -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B:input,VCF "+testVCF+" -o %s";
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("6b5409cc78960f1be855536ed89ea9dd"));
executeTest("Test probes", spec);
}
@Test
public void testProbesUsingDbSNPMask() {
String md5 = "46d53491af1d3aa0ee1f1e13d68b732d";
String testVCF = validationDataLocation + "pickSeqIntegrationTest.vcf";
String testArgs = "-snp_mask " + validationDataLocation + "pickSeqIntegrationTest.bed -R "
+ b36KGReference + " -omitWindow -nameConvention "
+ "-project_id 1kgp3_s4_lf -T PickSequenomProbes -B:input,VCF "+testVCF+" -o %s";
WalkerTestSpec spec1 = new WalkerTestSpec(testArgs, 1, Arrays.asList(md5));
executeTest("Test probes", spec1);
testArgs += " -nmw 1";
WalkerTestSpec spec2 = new WalkerTestSpec(testArgs, 1, Arrays.asList(md5));
executeTest("Test probes", spec2);
}
}

View File

@ -0,0 +1,56 @@
package org.broadinstitute.sting.gatk.walkers.validation;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: 7/19/11
* Time: 7:39 PM
* To change this template use File | Settings | File Templates.
*/
public class ValidationAmpliconsIntegrationTest extends WalkerTest {
@Test
public void testWikiExample() {
String siteVCF = validationDataLocation + "sites_to_validate.vcf";
String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf";
String intervalTable = validationDataLocation + "amplicon_interval_table1.table";
String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s";
testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF;
testArgs += " --virtualPrimerSize 30";
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("27f9450afa132888a8994167f0035fd7"));
executeTest("Test probes", spec);
}
@Test
public void testWikiExampleNoBWA() {
String siteVCF = validationDataLocation + "sites_to_validate.vcf";
String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf";
String intervalTable = validationDataLocation + "amplicon_interval_table1.table";
String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s";
testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF;
testArgs += " --virtualPrimerSize 30 --doNotUseBWA";
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("f2611ff1d9cd5bedaad003251fed8bc1"));
executeTest("Test probes", spec);
}
@Test
public void testWikiExampleMonoFilter() {
String siteVCF = validationDataLocation + "sites_to_validate.vcf";
String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf";
String intervalTable = validationDataLocation + "amplicon_interval_table1.table";
String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s";
testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF;
testArgs += " --virtualPrimerSize 30 --filterMonomorphic";
WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1,
Arrays.asList("77b3f30e38fedad812125bdf6cf3255f"));
executeTest("Test probes", spec);
}
}

View File

@ -44,7 +44,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testComplexVariantsToTable() {
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableCmd(" -AMD"),
Arrays.asList("b2a3712c1bfad8f1383ffada8b5017ba"));
Arrays.asList("e8f771995127b727fb433da91dd4ee98"));
executeTest("testComplexVariantsToTable", spec).getFirst();
}

View File

@ -7,6 +7,8 @@
<class name="edu.mit.broad.picard.genotype.geli.GeliFileWriter" />
<class name="edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods" />
<class name="edu.mit.broad.picard.util.PasteParser" />
<class name="edu.mit.broad.picard.util.PicardAggregationFsUtil" />
<class name="edu.mit.broad.picard.util.PicardFileSystemUtil" />
<class name="edu.mit.broad.picard.variation.KnownVariantCodecV2" />
<class name="edu.mit.broad.picard.variation.KnownVariantCodec" />
<class name="edu.mit.broad.picard.variation.KnownVariantFileHeader" />

View File

@ -72,6 +72,9 @@ class DataProcessingPipeline extends QScript {
@Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
var bwaThreads: Int = 1
@Input(doc="Dont perform validation on the BAM files", fullName="no_validation", shortName="nv", required=false)
var noValidation: Boolean = false
/****************************************************************************
* Global Variables
@ -135,7 +138,7 @@ class DataProcessingPipeline extends QScript {
}
}
println("\n\n*** DEBUG ***\n")
println("\n\n*** INPUT FILES ***\n")
// Creating one file for each sample in the dataset
val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
for ((sample, flist) <- sampleTable) {
@ -149,7 +152,7 @@ class DataProcessingPipeline extends QScript {
sampleBamFiles(sample) = sampleFileName
add(joinBams(flist, sampleFileName))
}
println("*** DEBUG ***\n\n")
println("*** INPUT FILES ***\n\n")
return sampleBamFiles.toMap
}
@ -246,7 +249,12 @@ class DataProcessingPipeline extends QScript {
val preValidateLog = swapExt(bam, ".bam", ".pre.validation")
val postValidateLog = swapExt(bam, ".bam", ".post.validation")
add(validate(bam, preValidateLog))
// Validation is an optional step for the BAM file generated after
// alignment and the final bam file of the pipeline.
if (!noValidation) {
add(validate(bam, preValidateLog),
validate(recalBam, postValidateLog))
}
if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY)
add(target(bam, targetIntervals))
@ -257,8 +265,8 @@ class DataProcessingPipeline extends QScript {
recal(dedupedBam, preRecalFile, recalBam),
cov(recalBam, postRecalFile),
analyzeCovariates(preRecalFile, preOutPath),
analyzeCovariates(postRecalFile, postOutPath),
validate(recalBam, postValidateLog))
analyzeCovariates(postRecalFile, postOutPath))
cohortList :+= recalBam
}
@ -282,6 +290,13 @@ class DataProcessingPipeline extends QScript {
this.isIntermediate = true
}
// General arguments to non-GATK tools
trait ExternalCommonArgs extends CommandLineFunction {
this.memoryLimit = 4
this.isIntermediate = true
}
case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY)
this.input_file :+= inBams
@ -300,8 +315,8 @@ class DataProcessingPipeline extends QScript {
this.targetIntervals = tIntervals
this.out = outBam
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
if (!indels.isEmpty)
this.rodBind :+= RodBind("indels", "VCF", indels)
if (!qscript.indels.isEmpty)
this.rodBind :+= RodBind("indels", "VCF", qscript.indels)
this.consensusDeterminationModel = consensusDeterminationModel
this.compress = 0
this.scatterCount = nContigs
@ -332,7 +347,6 @@ class DataProcessingPipeline extends QScript {
this.isIntermediate = false
this.analysisName = queueLogDir + outBam + ".recalibration"
this.jobName = queueLogDir + outBam + ".recalibration"
}
@ -350,48 +364,41 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + inRecalFile + ".analyze_covariates"
}
case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates {
case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates with ExternalCommonArgs {
this.input = List(inBam)
this.output = outBam
this.metrics = metricsFile
this.memoryLimit = 6
this.isIntermediate = true
this.analysisName = queueLogDir + outBam + ".dedup"
this.jobName = queueLogDir + outBam + ".dedup"
}
case class joinBams (inBams: List[File], outBam: File) extends MergeSamFiles {
case class joinBams (inBams: List[File], outBam: File) extends MergeSamFiles with ExternalCommonArgs {
this.input = inBams
this.output = outBam
this.memoryLimit = 4
this.isIntermediate = true
this.analysisName = queueLogDir + outBam + ".joinBams"
this.jobName = queueLogDir + outBam + ".joinBams"
}
case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam {
case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam with ExternalCommonArgs {
this.input = List(inSam)
this.output = outBam
this.sortOrder = sortOrderP
this.memoryLimit = 4
this.isIntermediate = true
this.analysisName = queueLogDir + outBam + ".sortSam"
this.jobName = queueLogDir + outBam + ".sortSam"
}
case class validate (inBam: File, outLog: File) extends ValidateSamFile {
case class validate (inBam: File, outLog: File) extends ValidateSamFile with ExternalCommonArgs {
this.input = List(inBam)
this.output = outLog
this.maxRecordsInRam = 100000
this.REFERENCE_SEQUENCE = qscript.reference
this.memoryLimit = 4
this.isIntermediate = false
this.analysisName = queueLogDir + outLog + ".validate"
this.jobName = queueLogDir + outLog + ".validate"
}
case class addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups {
case class addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups with ExternalCommonArgs {
this.input = List(inBam)
this.output = outBam
this.RGID = readGroup.id
@ -407,12 +414,7 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outBam + ".rg"
}
trait BWACommonArgs extends CommandLineFunction {
this.memoryLimit = 4
this.isIntermediate = true
}
case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with BWACommonArgs {
case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with ExternalCommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file") var sai = outSai
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai
@ -420,7 +422,7 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outSai + ".bwa_aln_se"
}
case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with BWACommonArgs {
case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with ExternalCommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file for 1st mating pair") var sai = outSai1
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai
@ -428,7 +430,7 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1"
}
case class bwa_sam_se (inBam: File, inSai: File, outBam: File) extends CommandLineFunction with BWACommonArgs {
case class bwa_sam_se (inBam: File, inSai: File, outBam: File) extends CommandLineFunction with ExternalCommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Input(doc="bwa alignment index file") var sai = inSai
@Output(doc="output aligned bam file") var alignedBam = outBam
@ -437,7 +439,7 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outBam + ".bwa_sam_se"
}
case class bwa_sam_pe (inBam: File, inSai1: File, inSai2:File, outBam: File) extends CommandLineFunction with BWACommonArgs {
case class bwa_sam_pe (inBam: File, inSai1: File, inSai2:File, outBam: File) extends CommandLineFunction with ExternalCommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Input(doc="bwa alignment index file for 1st mating pair") var sai1 = inSai1
@Input(doc="bwa alignment index file for 2nd mating pair") var sai2 = inSai2

View File

@ -20,14 +20,14 @@ class RecalibrateBaseQualities extends QScript {
@Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
var input: File = _
@Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false)
var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R")
@Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true)
var R: String = _
@Input(doc="Reference fasta file", shortName="R", required=false)
var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
@Input(doc="Reference fasta file", shortName="R", required=true)
var reference: File = _ // new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
@Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false)
var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
@Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=true)
var dbSNP: File = _ // new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
val queueLogDir: String = ".qlog/"
var nContigs: Int = 0

View File

@ -1,3 +0,0 @@
<ivy-module version="1.0">
<info organisation="edu.mit.broad" module="picard-private-parts" revision="1941" status="integration" publication="20110614114100" />
</ivy-module>

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="edu.mit.broad" module="picard-private-parts" revision="1954" status="integration" publication="20110712113100" />
<info organisation="edu.mit.broad" module="picard-private-parts" revision="1959" status="integration" publication="20110718185300" />
</ivy-module>

View File

@ -1,3 +0,0 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="picard" revision="1.48.889" status="release" />
</ivy-module>

View File

@ -0,0 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="picard" revision="1.49.895" status="release" />
</ivy-module>

View File

@ -1,3 +0,0 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="sam" revision="1.48.889" status="release" />
</ivy-module>

View File

@ -0,0 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf" module="sam" revision="1.49.895" status="release" />
</ivy-module>