diff --git a/README.md b/README.md
index da02e39..3472e91 100644
--- a/README.md
+++ b/README.md
@@ -87,16 +87,12 @@ settings:
* Illumina paired-end reads no longer than ~70bp:
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
- bwa sampe ref.fa reads1.sai reads2.sai reads1.fq reads2.fq > aln-pe.sam
+ bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
* PacBio subreads to a reference genome:
bwa mem -x pacbio ref.fa reads.fq > aln.sam
-* PacBio subreads to themselves (the output is not SAM):
-
- bwa mem -x pbread reads.fq reads.fq > overlap.pas
-
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
errors given longer query sequences as the chance of missing all seeds is small.
@@ -130,38 +126,6 @@ case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment
as well. BWA-MEM does not have this problem.
-####6. How to map sequences to GRCh38 with ALT contigs?
-
-BWA-backtrack and BWA-MEM partially support mapping to a reference containing
-ALT contigs that represent alternative alleles highly divergent from the
-reference genome.
-
- # download the K8 executable required by bwa-helper.js
- wget http://sourceforge.net/projects/lh3/files/k8/k8-0.2.1.tar.bz2/download
- tar -jxf k8-0.2.1.tar.bz2
-
- # download the ALT-to-GRCh38 alignment in the SAM format
- wget http://sourceforge.net/projects/bio-bwa/files/hs38.alt.sam.gz/download
-
- # download the GRCh38 sequences with ALT contigs
- wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
-
- # index and mapping
- bwa index -p hs38a GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
- bwa mem -h50 hs38a reads.fq | ./k8-linux bwa-helper.js genalt hs38.alt.sam.gz > out.sam
-
-Here, option `-h50` asks bwa-mem to output multiple hits in the XA tag if the
-read has 50 or fewer hits. For each SAM line containing the XA tag,
-`bwa-helper.js genalt` decodes the alignments in the XA tag, groups hits lifted
-to the same chromosomal region, adjusts mapping quality and outputs all the
-hits overlapping the reported hit. A read may be mapped to both the primary
-assembly and one or more ALT contigs all with high mapping quality.
-
-Note that this procedure assumes reads are single-end and may miss hits to
-highly repetitive regions as these hits will not be reported with option
-`-h50`. `bwa-helper.js` is a prototype implementation not recommended for
-production uses.
-
[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License
diff --git a/bntseq.c b/bntseq.c
index eddae84..93cdba1 100644
--- a/bntseq.c
+++ b/bntseq.c
@@ -37,6 +37,9 @@
#include "kseq.h"
KSEQ_DECLARE(gzFile)
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, int)
+
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
@@ -94,7 +97,7 @@ void bns_dump(const bntseq_t *bns, const char *prefix)
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
{
- char str[1024];
+ char str[8192];
FILE *fp;
const char *fname;
bntseq_t *bns;
@@ -117,14 +120,14 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
if (scanres != 2) goto badread;
p->name = strdup(str);
// read fasta comments
- while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
+ while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
while (c != '\n' && c != EOF) c = fgetc(fp);
if (c == EOF) {
scanres = EOF;
goto badread;
}
*q = 0;
- if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
+ if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space
else p->anno = strdup("");
// read the rest
scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
@@ -165,11 +168,33 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
bntseq_t *bns_restore(const char *prefix)
{
- char ann_filename[1024], amb_filename[1024], pac_filename[1024];
+ char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024];
+ FILE *fp;
+ bntseq_t *bns;
strcat(strcpy(ann_filename, prefix), ".ann");
strcat(strcpy(amb_filename, prefix), ".amb");
strcat(strcpy(pac_filename, prefix), ".pac");
- return bns_restore_core(ann_filename, amb_filename, pac_filename);
+ bns = bns_restore_core(ann_filename, amb_filename, pac_filename);
+ if (bns == 0) return 0;
+ if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
+ char str[1024];
+ khash_t(str) *h;
+ int i, absent;
+ khint_t k;
+ h = kh_init(str);
+ for (i = 0; i < bns->n_seqs; ++i) {
+ k = kh_put(str, h, bns->anns[i].name, &absent);
+ kh_val(h, k) = i;
+ }
+ while (fscanf(fp, "%s", str) == 1) {
+ k = kh_get(str, h, str);
+ if (k != kh_end(h))
+ bns->anns[kh_val(h, k)].is_alt = 1;
+ }
+ kh_destroy(str, h);
+ fclose(fp);
+ }
+ return bns;
}
void bns_destroy(bntseq_t *bns)
diff --git a/bntseq.h b/bntseq.h
index 6437cf6..03671d6 100644
--- a/bntseq.h
+++ b/bntseq.h
@@ -43,6 +43,7 @@ typedef struct {
int32_t len;
int32_t n_ambs;
uint32_t gi;
+ int32_t is_alt;
char *name, *anno;
} bntann1_t;
diff --git a/bwamem.c b/bwamem.c
index 0e4c6cc..95d7c2f 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -2,6 +2,7 @@
#include
#include
#include
+#include
#include
#ifdef HAVE_PTHREAD
#include
@@ -61,6 +62,9 @@ mem_opt_t *mem_opt_init()
o->zdrop = 100;
o->pen_unpaired = 17;
o->pen_clip5 = o->pen_clip3 = 5;
+
+ o->max_mem_intv = 0;
+
o->min_seed_len = 19;
o->split_width = 10;
o->max_occ = 500;
@@ -119,7 +123,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
// first pass: find all SMEMs
while (x < len) {
if (seq[x] < 4) {
- x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
+ x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info>>32); // seed length
@@ -129,6 +133,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
} else ++x;
}
// second pass: find MEMs inside a long SMEM
+ if (opt->max_mem_intv > 0) goto sort_intv;
old_n = a->mem.n;
for (k = 0; k < old_n; ++k) {
bwtintv_t *p = &a->mem.a[k];
@@ -136,10 +141,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
if (end - start < split_len || p->x[2] > opt->split_width) continue;
bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i)
- if ((a->mem1.a[i].info>>32) - (uint32_t)a->mem1.a[i].info >= opt->min_seed_len)
+ if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
// sort
+sort_intv:
ks_introsort(mem_intv, a->mem.n, a->mem.a);
}
@@ -155,7 +161,7 @@ typedef struct {
typedef struct {
int n, m, first, rid;
- uint32_t w:30, kept:2;
+ uint32_t w:29, kept:2, is_alt:1;
float frac_rep;
int64_t pos;
mem_seed_t *seeds;
@@ -276,6 +282,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
tmp.seeds[0] = s;
tmp.rid = rid;
+ tmp.is_alt = !!bns->anns[rid].is_alt;
kb_putp(chn, tree, &tmp);
}
}
@@ -329,7 +336,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
int j = chains.a[k];
int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]);
int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]);
- if (e_min > b_max) { // have overlap
+ if (e_min > b_max && (!a[j].is_alt || a[i].is_alt)) { // have overlap; don't consider ovlp where the kept chain is ALT while the current chain is primary
int li = chn_end(a[i]) - chn_beg(a[i]);
int lj = chn_end(a[j]) - chn_beg(a[j]);
int min_l = li < lj? li : lj;
@@ -375,9 +382,12 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2)
#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
-#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))
+#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && (a).hash < (b).hash))))
KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
+#define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))))
+KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2)
+
#define PATCH_MAX_R_BW 0.05f
#define PATCH_MIN_SC_RATIO 0.90f
@@ -473,21 +483,19 @@ int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int
return n - 1;
}
-void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
+typedef kvec_t(int) int_v;
+
+static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z)
{ // similar to the loop in mem_chain_flt()
int i, k, tmp;
- kvec_t(int) z;
- if (n == 0) return;
- kv_init(z);
- for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
- ks_introsort(mem_ars_hash, n, a);
tmp = opt->a + opt->b;
tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp;
tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp;
- kv_push(int, z, 0);
+ z->n = 0;
+ kv_push(int, *z, 0);
for (i = 1; i < n; ++i) {
- for (k = 0; k < z.n; ++k) {
- int j = z.a[k];
+ for (k = 0; k < z->n; ++k) {
+ int j = z->a[k];
int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
if (e_min > b_max) { // have overlap
@@ -499,10 +507,32 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i
}
}
}
- if (k == z.n) kv_push(int, z, i);
- else a[i].secondary = z.a[k];
+ if (k == z->n) kv_push(int, *z, i);
+ else a[i].secondary = z->a[k];
+ }
+}
+
+int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
+{
+ int i, n_pri;
+ int_v z = {0,0,0};
+ if (n == 0) return 0;
+ for (i = n_pri = 0; i < n; ++i) {
+ a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
+ if (!a[i].is_alt) ++n_pri;
+ }
+ ks_introsort(mem_ars_hash, n, a);
+ mem_mark_primary_se_core(opt, n, a, &z);
+ for (i = 0; i < n; ++i) // don't track the "parent" hit of ALT secondary hits
+ if (a[i].is_alt && a[i].secondary >= 0)
+ a[i].secondary = INT_MAX;
+ if (n_pri > 0 || n_pri != n) {
+ ks_introsort(mem_ars_hash2, n, a);
+ for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
+ mem_mark_primary_se_core(opt, n_pri, a, &z);
}
free(z.a);
+ return n_pri;
}
/*********************************
@@ -625,12 +655,12 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
// qd: distance ahead of the seed on query; rd: on reference
qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
- w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width
+ w = max_gap < p->w? max_gap : p->w; // bounded by the band width
if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
// similar to the previous four lines, but this time we look at the region behind
qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
max_gap = cal_max_gap(opt, qd < rd? qd : rd);
- w = max_gap < opt->w? max_gap : opt->w;
+ w = max_gap < p->w? max_gap : p->w;
if (qd - rd < w && rd - qd < w) break;
}
if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln
@@ -757,7 +787,7 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar)
return l;
}
-void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all, bam_hdr_t *h)
+void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, bam_hdr_t *h)
{
int i, l_name;
mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert
@@ -786,7 +816,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
if (p->n_cigar) { // aligned
for (i = 0; i < p->n_cigar; ++i) {
int c = p->cigar[i]&0xf;
- if (!softclip_all && (c == 3 || c == 4))
+ if (!(opt->flag&MEM_F_SOFTCLIP) && (c == 3 || c == 4))
c = which? 4 : 3; // use hard clipping for supplementary alignments
kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str);
}
@@ -814,7 +844,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
kputsn("*\t*", 3, str);
} else if (!p->is_rev) { // the forward strand
int i, qb = 0, qe = s->l_seq;
- if (p->n_cigar && which && !softclip_all) { // have cigar && not the primary alignment && not softclip all
+ if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP)) { // have cigar && not the primary alignment && not softclip all
if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4;
if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4;
}
@@ -828,7 +858,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
} else kputc('*', str);
} else { // the reverse strand
int i, qb = 0, qe = s->l_seq;
- if (p->n_cigar && which && !softclip_all) {
+ if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP)) {
if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4;
if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4;
}
@@ -873,6 +903,14 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
}
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
+ if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) {
+ int tmp;
+ kputsn("\tXR:Z:", 6, str);
+ tmp = str->l;
+ kputs(bns->anns[p->rid].anno, str);
+ for (i = tmp; i < str->l; ++i) // replace TAB in the comment to SPACE
+ if (str->s[i] == '\t') str->s[i] = ' ';
+ }
kputc('\n', str);
#ifdef USE_HTSLIB
@@ -916,7 +954,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
}
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
-void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h)
+void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h)
{
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
kstring_t str;
@@ -932,8 +970,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
mem_alnreg_t *p = &a->a[k];
mem_aln_t *q;
if (p->score < opt->T) continue;
- if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
- if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
+ if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue;
+ if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
q = kv_pushp(mem_aln_t, aa);
*q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
assert(q->rid >= 0); // this should not happen with the new code
@@ -948,10 +986,10 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
mem_aln_t t;
t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0);
t.flag |= extra_flag;
- mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP, h);
+ mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m, h);
} else {
for (k = 0; k < aa.n; ++k)
- mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP, h);
+ mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m, h);
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
free(aa.a);
}
@@ -996,6 +1034,11 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
}
}
+ for (i = 0; i < regs.n; ++i) {
+ mem_alnreg_t *p = ®s.a[i];
+ if (p->rid >= 0 && bns->anns[p->rid].is_alt)
+ p->is_alt = 1;
+ }
return regs;
}
@@ -1111,7 +1154,7 @@ static void worker2(void *data, int i, int tid)
mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]);
} else {
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
- mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, w->h);
+ mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, w->h);
}
free(w->regs[i].a);
} else {
@@ -1129,16 +1172,15 @@ void mem_process_seqs2(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *b
{
extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);
worker_t w;
- mem_alnreg_v *regs;
mem_pestat_t pes[4];
double ctime, rtime;
int i;
ctime = cputime(); rtime = realtime();
global_bns = bns;
- regs = malloc(n * sizeof(mem_alnreg_v));
- w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
- w.seqs = seqs; w.regs = regs; w.n_processed = n_processed;
+ w.regs = malloc(n * sizeof(mem_alnreg_v));
+ w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
+ w.seqs = seqs; w.n_processed = n_processed;
w.pes = &pes[0];
w.aux = malloc(opt->n_threads * sizeof(smem_aux_t));
for (i = 0; i < opt->n_threads; ++i)
@@ -1150,10 +1192,10 @@ void mem_process_seqs2(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *b
free(w.aux);
if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0
- else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data
+ else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data
}
kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
- free(regs);
+ free(w.regs);
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime);
}
diff --git a/bwamem.h b/bwamem.h
index f74cb44..e65e3b2 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -18,6 +18,7 @@ typedef struct __smem_i smem_i;
#define MEM_F_NO_RESCUE 0x20
#define MEM_F_SELF_OVLP 0x40
#define MEM_F_ALN_REG 0x80
+#define MEM_F_REF_HDR 0x100
#define MEM_F_SOFTCLIP 0x200
typedef struct {
@@ -29,6 +30,8 @@ typedef struct {
int w; // band width
int zdrop; // Z-dropoff
+ uint64_t max_mem_intv;
+
int T; // output score threshold; only affecting output
int flag; // see MEM_F_* macros
int min_seed_len; // minimum seed length
@@ -68,7 +71,7 @@ typedef struct {
int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
int seedlen0; // length of the starting seed
- int n_comp; // number of sub-alignments chained together
+ int n_comp:30, is_alt:2; // number of sub-alignments chained together
float frac_rep;
uint64_t hash;
} mem_alnreg_t;
@@ -100,6 +103,7 @@ extern "C" {
smem_i *smem_itr_init(const bwt_t *bwt);
void smem_itr_destroy(smem_i *itr);
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
+ void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv);
const bwtintv_v *smem_next(smem_i *itr);
mem_opt_t *mem_opt_init(void);
diff --git a/bwamem_extra.c b/bwamem_extra.c
index ff28555..b56a51c 100644
--- a/bwamem_extra.c
+++ b/bwamem_extra.c
@@ -1,3 +1,4 @@
+#include
#include "bwa.h"
#include "bwamem.h"
#include "bntseq.h"
@@ -11,6 +12,8 @@ struct __smem_i {
const bwt_t *bwt;
const uint8_t *query;
int start, len;
+ int min_intv, max_len;
+ uint64_t max_intv;
bwtintv_v *matches; // matches; to be returned by smem_next()
bwtintv_v *sub; // sub-matches inside the longest match; temporary
bwtintv_v *tmpvec[2]; // temporary arrays
@@ -25,6 +28,9 @@ smem_i *smem_itr_init(const bwt_t *bwt)
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
itr->matches = calloc(1, sizeof(bwtintv_v));
itr->sub = calloc(1, sizeof(bwtintv_v));
+ itr->min_intv = 1;
+ itr->max_len = INT_MAX;
+ itr->max_intv = 0;
return itr;
}
@@ -44,6 +50,13 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
itr->len = len;
}
+void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
+{
+ itr->min_intv = min_intv;
+ itr->max_len = max_len;
+ itr->max_intv = max_intv;
+}
+
const bwtintv_v *smem_next(smem_i *itr)
{
int i, max, max_i, ori_start;
@@ -52,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr)
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
ori_start = itr->start;
- itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
+ itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
bwtintv_t *p = &itr->matches->a[i];
@@ -127,7 +140,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
cnt = calloc(a->n, sizeof(int));
for (i = 0, tot = 0; i < a->n; ++i) {
int j = a->a[i].secondary;
- if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio)
+ if (j >= 0 && j < INT_MAX && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio)
++cnt[j], ++tot;
}
if (tot == 0) goto end_gen_alt;
@@ -135,7 +148,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
for (i = 0; i < a->n; ++i) {
mem_aln_t t;
int j = a->a[i].secondary;
- if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
+ if (j < 0 || j == INT_MAX || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
if (cnt[j] > opt->max_hits) continue;
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
kputs(bns->anns[t.rid].name, &aln[j]);
diff --git a/bwamem_pair.c b/bwamem_pair.c
index 80e98ac..66dd8d1 100644
--- a/bwamem_pair.c
+++ b/bwamem_pair.c
@@ -177,14 +177,14 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
return n;
}
-int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2])
+int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2])
{
pair64_v v, u;
int r, i, k, y[4], ret; // y[] keeps the last hit
int64_t l_pac = bns->l_pac;
kv_init(v); kv_init(u);
for (r = 0; r < 2; ++r) { // loop through read number
- for (i = 0; i < a[r].n; ++i) {
+ for (i = 0; i < n_pri[r]; ++i) {
pair64_t key;
mem_alnreg_t *e = &a[r].a[i];
key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
@@ -244,13 +244,13 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], bam_hdr_t *header)
{
- extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
+ extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
- extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h);
- extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, int softclip_all, bam_hdr_t *h);
+ extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h);
+ extern void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, bam_hdr_t *h);
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
- int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1;
+ int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2];
kstring_t str;
mem_aln_t h[2];
@@ -267,18 +267,18 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a);
}
- mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
- mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
+ n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
+ n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
// pairing single-end hits
- if (a[0].n && a[1].n && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) {
+ if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
int is_multi[2], q_pe, score_un, q_se[2];
char **XA[2];
// check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) {
- for (j = 1; j < a[i].n; ++j)
+ for (j = 1; j < n_pri[i]; ++j)
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
- is_multi[i] = j < a[i].n? 1 : 0;
+ is_multi[i] = j < n_pri[i]? 1 : 0;
}
if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
// compute mapQ for the best SE hit
@@ -316,7 +316,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
int k = a[i].a[z[i]].secondary;
if (k >= 0) { // switch secondary and primary
assert(a[i].a[k].secondary < 0);
- for (j = 0; j < a[i].n; ++j)
+ for (j = 0; j < n_pri[i]; ++j)
if (a[i].a[j].secondary == k || j == k)
a[i].a[j].secondary = z[i];
a[i].a[z[i]].secondary = -1;
@@ -327,13 +327,13 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
// write SAM
h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0;
h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0;
- mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP, header);
+ mem_aln2sam(opt, bns, &str, &s[0], 1, &h[0], 0, &h[1], header);
#ifndef USE_HTSLIB
- s[0].sam = strdup(str.s); str.l = 0; /// not using HTSLIB
+ s[0].sam = strdup(str.s); str.l = 0;
#endif
- mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP, header);
+ mem_aln2sam(opt, bns, &str, &s[1], 1, &h[1], 0, &h[0], header);
#ifndef USE_HTSLIB
- s[1].sam = str.s; // not using HTSLIB
+ s[1].sam = str.s;
#endif
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar); // h[0].XA will be freed in the following block
@@ -360,8 +360,8 @@ no_pairing:
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;
}
- mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1], header);
- mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0], header);
+ mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1], header);
+ mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0], header);
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar); free(h[1].cigar);
return n;
diff --git a/bwt.c b/bwt.c
index c9bf6a3..a719f53 100644
--- a/bwt.c
+++ b/bwt.c
@@ -30,6 +30,7 @@
#include
#include
#include
+#include
#include "utils.h"
#include "bwt.h"
#include "kvec.h"
@@ -285,7 +286,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
}
}
-int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
@@ -307,6 +308,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
kv_push(bwtintv_t, *curr, ik);
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
+ if (i - x > max_len && ik.x[2] < max_intv) break;
}
ik = ok[c]; ik.info = i + 1;
} else { // an ambiguous base
@@ -323,8 +325,13 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j];
+ int max_stop = 0;
bwt_extend(bwt, p, ok, 1);
- if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
+ if (ok[c].x[2] != p->x[2]) { // change of interval size
+ if (p->info - i - 1 > max_len && p->x[2] < max_intv)
+ max_stop = 1;
+ }
+ if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
@@ -346,6 +353,11 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
return ret;
}
+int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+{
+ return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec);
+}
+
/*************************
* Read/write BWT and SA *
*************************/
diff --git a/bwt.h b/bwt.h
index d2ff0ac..7ab49bc 100644
--- a/bwt.h
+++ b/bwt.h
@@ -92,6 +92,7 @@ extern "C" {
void bwt_destroy(bwt_t *bwt);
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
+ void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
void bwt_cal_sa(bwt_t *bwt, int intv);
void bwt_bwtupdate_core(bwt_t *bwt);
@@ -118,6 +119,7 @@ extern "C" {
* Return the end of the longest exact match starting from _x_.
*/
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
+ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
// SMEM iterator interface
diff --git a/bwt_gen.c b/bwt_gen.c
index 6139d80..76f28c9 100644
--- a/bwt_gen.c
+++ b/bwt_gen.c
@@ -1598,15 +1598,20 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o
}
}
-void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
+void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size)
{
BWTInc *bwtInc;
- bwtInc = BWTIncConstructFromPacked(fn_pac, 10000000, 10000000);
+ bwtInc = BWTIncConstructFromPacked(fn_pac, block_size, block_size);
printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
BWTIncFree(bwtInc);
}
+void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
+{
+ bwt_bwtgen2(fn_pac, fn_bwt, 10000000);
+}
+
int bwt_bwtgen_main(int argc, char *argv[])
{
if (argc < 3) {
diff --git a/bwtindex.c b/bwtindex.c
index 9e3ec15..2bfd667 100644
--- a/bwtindex.c
+++ b/bwtindex.c
@@ -189,11 +189,11 @@ int bwa_index(int argc, char *argv[]) // the "index" command
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
char *prefix = 0, *str, *str2, *str3;
- int c, algo_type = 0, is_64 = 0;
+ int c, algo_type = 0, is_64 = 0, block_size = 10000000;
clock_t t;
int64_t l_pac;
- while ((c = getopt(argc, argv, "6a:p:")) >= 0) {
+ while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
switch (c) {
case 'a': // if -a is not set, algo_type will be determined later
if (strcmp(optarg, "div") == 0) algo_type = 1;
@@ -203,20 +203,26 @@ int bwa_index(int argc, char *argv[]) // the "index" command
break;
case 'p': prefix = strdup(optarg); break;
case '6': is_64 = 1; break;
+ case 'b':
+ block_size = strtol(optarg, &str, 10);
+ if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024;
+ else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024;
+ else if (*str == 'K' || *str == 'k') block_size *= 1024;
+ break;
default: return 1;
}
}
if (optind + 1 > argc) {
fprintf(stderr, "\n");
- fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] \n\n");
+ fprintf(stderr, "Usage: bwa index [options] \n\n");
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
+ fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
fprintf(stderr, " -6 index files named as .64.* instead of .* \n");
fprintf(stderr, "\n");
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
- fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
- fprintf(stderr, " according to the length of the genome.\n\n");
+ fprintf(stderr, " `-a div' do not work not for long genomes.\n\n");
return 1;
}
if (prefix == 0) {
@@ -242,7 +248,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
strcpy(str2, prefix); strcat(str2, ".bwt");
t = clock();
fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
- if (algo_type == 2) bwt_bwtgen(str, str2);
+ if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
else if (algo_type == 1 || algo_type == 3) {
bwt_t *bwt;
bwt = bwt_pac2bwt(str, algo_type == 3);
diff --git a/fastmap.c b/fastmap.c
index b528ae9..25d2b96 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -3,6 +3,7 @@
#include
#include
#include
+#include
#include
#include
#include "bwa.h"
@@ -38,6 +39,7 @@ int main_mem(int argc, char *argv[])
{
mem_opt_t *opt, opt0;
int fd, fd2, i, c, n, copy_comment = 0;
+ int fixed_chunk_size = -1, actual_chunk_size;
gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0;
bseq1_t *seqs;
@@ -54,10 +56,11 @@ int main_mem(int argc, char *argv[])
opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
#ifdef USE_HTSLIB
- while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:o:")) >= 0) {
+ while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:o:")) >= 0)
#else
- while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:")) >= 0) {
+ while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0)
#endif
+ {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == 'x') mode = optarg;
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
@@ -74,6 +77,7 @@ int main_mem(int argc, char *argv[])
else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
else if (c == 'F') opt->flag |= MEM_F_ALN_REG;
else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
+ else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
else if (c == 'v') bwa_verbose = atoi(optarg);
@@ -85,7 +89,9 @@ int main_mem(int argc, char *argv[])
else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
+ else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
else if (c == 'C') copy_comment = 1;
+ else if (c == 'K') fixed_chunk_size = atoi(optarg);
else if (c == 'Q') {
opt0.mapQ_coef_len = 1;
opt->mapQ_coef_len = atoi(optarg);
@@ -141,6 +147,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
+ fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT [%ld]\n", (long)opt->max_mem_intv);
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
@@ -149,15 +156,16 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -S skip mate rescue\n");
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
fprintf(stderr, " -e discard full-length exact matches\n");
+ fprintf(stderr, "\nScoring options:\n\n");
fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
- fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
+ fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired);
fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n");
- fprintf(stderr, " pacbio: -k17 -W40 -r10 -A2 -B5 -O2 -E1 -L0\n");
- fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001\n");
+ fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0\n");
+// fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
@@ -167,6 +175,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -h INT if there are 80%% of the max score, output all in XA [%d]\n", opt->max_hits);
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
+ fprintf(stderr, " -V output the reference FASTA header in the XR tag\n");
fprintf(stderr, " -Y use soft clipping for supplementary alignments\n");
fprintf(stderr, " -M mark shorter split hits as secondary\n\n");
fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n");
@@ -185,13 +194,13 @@ int main_mem(int argc, char *argv[])
if (mode) {
if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) {
- if (!opt0.a) opt->a = 2, opt0.a = 1;
+ if (!opt0.a) opt->a = 1, opt0.a = 1;
update_a(opt, &opt0);
- if (!opt0.o_del) opt->o_del = 2;
+ if (!opt0.o_del) opt->o_del = 1;
if (!opt0.e_del) opt->e_del = 1;
- if (!opt0.o_ins) opt->o_ins = 2;
+ if (!opt0.o_ins) opt->o_ins = 1;
if (!opt0.e_ins) opt->e_ins = 1;
- if (!opt0.b) opt->b = 5;
+ if (!opt0.b) opt->b = 1;
if (opt0.split_factor == 0.) opt->split_factor = 10.;
if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) {
@@ -210,7 +219,6 @@ int main_mem(int argc, char *argv[])
return 1; // FIXME memory leak
}
} else update_a(opt, &opt0);
-// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF
bwa_fill_scmat(opt->a, opt->b, opt->mat);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
@@ -264,7 +272,8 @@ int main_mem(int argc, char *argv[])
bwa_print_sam_hdr(idx->bns, rg_line);
#endif
}
- while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
+ actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
+ while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) {
int64_t size = 0;
if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
if (bwa_verbose >= 2)
@@ -315,7 +324,8 @@ int main_mem(int argc, char *argv[])
int main_fastmap(int argc, char *argv[])
{
- int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
+ int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX;
+ uint64_t max_intv = 0;
kseq_t *seq;
bwtint_t k;
gzFile fp;
@@ -323,16 +333,26 @@ int main_fastmap(int argc, char *argv[])
const bwtintv_v *a;
bwaidx_t *idx;
- while ((c = getopt(argc, argv, "w:l:p")) >= 0) {
+ while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) {
switch (c) {
case 'p': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
+ case 'i': min_intv = atoi(optarg); break;
+ case 'I': max_intv = atol(optarg); break;
+ case 'L': max_len = atoi(optarg); break;
default: return 1;
}
}
if (optind + 1 >= argc) {
- fprintf(stderr, "Usage: bwa fastmap [-p] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa fastmap [options] \n\n");
+ fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
+ fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth);
+ fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);
+ fprintf(stderr, " -l INT max MEM length [%d]\n", max_len);
+ fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv);
+ fprintf(stderr, "\n");
return 1;
}
@@ -340,6 +360,7 @@ int main_fastmap(int argc, char *argv[])
seq = kseq_init(fp);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
itr = smem_itr_init(idx->bwt);
+ smem_config(itr, min_intv, max_len, max_intv);
while (kseq_read(seq) >= 0) {
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) {
diff --git a/main.c b/main.c
index c202a7c..e7ade60 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.10-r806-dirty"
+#define PACKAGE_VERSION "0.7.10-r830-dirty"
#endif
int bwa_fa2pac(int argc, char *argv[]);
diff --git a/utils.c b/utils.c
index 00be7f0..2983261 100644
--- a/utils.c
+++ b/utils.c
@@ -219,6 +219,17 @@ int err_fputs(const char *s, FILE *stream)
return ret;
}
+int err_puts(const char *s)
+{
+ int ret = puts(s);
+ if (EOF == ret)
+ {
+ _err_fatal_simple("puts", strerror(errno));
+ }
+
+ return ret;
+}
+
int err_fflush(FILE *stream)
{
int ret = fflush(stream);
diff --git a/utils.h b/utils.h
index 5ef6ac4..11966b8 100644
--- a/utils.h
+++ b/utils.h
@@ -80,7 +80,7 @@ extern "C" {
int err_fputc(int c, FILE *stream);
#define err_putchar(C) err_fputc((C), stdout)
int err_fputs(const char *s, FILE *stream);
-#define err_puts(S) err_fputs((S), stdout)
+ int err_puts(const char *s);
int err_fflush(FILE *stream);
int err_fclose(FILE *stream);
int err_gzclose(gzFile file);