Merge branch 'dev'

Conflicts:
	README.md
	bwamem.c
	bwamem_pair.c
	fastmap.c
	main.c
This commit is contained in:
Heng Li 2014-09-16 00:24:41 -04:00
commit df09f38ad3
15 changed files with 230 additions and 124 deletions

View File

@ -87,16 +87,12 @@ settings:
* Illumina paired-end reads no longer than ~70bp: * Illumina paired-end reads no longer than ~70bp:
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
bwa sampe ref.fa reads1.sai reads2.sai reads1.fq reads2.fq > aln-pe.sam bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
* PacBio subreads to a reference genome: * PacBio subreads to a reference genome:
bwa mem -x pacbio ref.fa reads.fq > aln.sam bwa mem -x pacbio ref.fa reads.fq > aln.sam
* PacBio subreads to themselves (the output is not SAM):
bwa mem -x pbread reads.fq reads.fq > overlap.pas
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
errors given longer query sequences as the chance of missing all seeds is small. errors given longer query sequences as the chance of missing all seeds is small.
@ -130,38 +126,6 @@ case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment
as well. BWA-MEM does not have this problem. as well. BWA-MEM does not have this problem.
####<a name="h38"></a>6. How to map sequences to GRCh38 with ALT contigs?
BWA-backtrack and BWA-MEM partially support mapping to a reference containing
ALT contigs that represent alternative alleles highly divergent from the
reference genome.
# download the K8 executable required by bwa-helper.js
wget http://sourceforge.net/projects/lh3/files/k8/k8-0.2.1.tar.bz2/download
tar -jxf k8-0.2.1.tar.bz2
# download the ALT-to-GRCh38 alignment in the SAM format
wget http://sourceforge.net/projects/bio-bwa/files/hs38.alt.sam.gz/download
# download the GRCh38 sequences with ALT contigs
wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
# index and mapping
bwa index -p hs38a GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
bwa mem -h50 hs38a reads.fq | ./k8-linux bwa-helper.js genalt hs38.alt.sam.gz > out.sam
Here, option `-h50` asks bwa-mem to output multiple hits in the XA tag if the
read has 50 or fewer hits. For each SAM line containing the XA tag,
`bwa-helper.js genalt` decodes the alignments in the XA tag, groups hits lifted
to the same chromosomal region, adjusts mapping quality and outputs all the
hits overlapping the reported hit. A read may be mapped to both the primary
assembly and one or more ALT contigs all with high mapping quality.
Note that this procedure assumes reads are single-end and may miss hits to
highly repetitive regions as these hits will not be reported with option
`-h50`. `bwa-helper.js` is a prototype implementation not recommended for
production uses.
[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License [1]: http://en.wikipedia.org/wiki/GNU_General_Public_License

View File

@ -37,6 +37,9 @@
#include "kseq.h" #include "kseq.h"
KSEQ_DECLARE(gzFile) KSEQ_DECLARE(gzFile)
#include "khash.h"
KHASH_MAP_INIT_STR(str, int)
#ifdef USE_MALLOC_WRAPPERS #ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h" # include "malloc_wrap.h"
#endif #endif
@ -94,7 +97,7 @@ void bns_dump(const bntseq_t *bns, const char *prefix)
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
{ {
char str[1024]; char str[8192];
FILE *fp; FILE *fp;
const char *fname; const char *fname;
bntseq_t *bns; bntseq_t *bns;
@ -117,14 +120,14 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
if (scanres != 2) goto badread; if (scanres != 2) goto badread;
p->name = strdup(str); p->name = strdup(str);
// read fasta comments // read fasta comments
while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
while (c != '\n' && c != EOF) c = fgetc(fp); while (c != '\n' && c != EOF) c = fgetc(fp);
if (c == EOF) { if (c == EOF) {
scanres = EOF; scanres = EOF;
goto badread; goto badread;
} }
*q = 0; *q = 0;
if (q - str > 1) p->anno = strdup(str + 1); // skip leading space if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space
else p->anno = strdup(""); else p->anno = strdup("");
// read the rest // read the rest
scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
@ -165,11 +168,33 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
bntseq_t *bns_restore(const char *prefix) bntseq_t *bns_restore(const char *prefix)
{ {
char ann_filename[1024], amb_filename[1024], pac_filename[1024]; char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024];
FILE *fp;
bntseq_t *bns;
strcat(strcpy(ann_filename, prefix), ".ann"); strcat(strcpy(ann_filename, prefix), ".ann");
strcat(strcpy(amb_filename, prefix), ".amb"); strcat(strcpy(amb_filename, prefix), ".amb");
strcat(strcpy(pac_filename, prefix), ".pac"); strcat(strcpy(pac_filename, prefix), ".pac");
return bns_restore_core(ann_filename, amb_filename, pac_filename); bns = bns_restore_core(ann_filename, amb_filename, pac_filename);
if (bns == 0) return 0;
if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
char str[1024];
khash_t(str) *h;
int i, absent;
khint_t k;
h = kh_init(str);
for (i = 0; i < bns->n_seqs; ++i) {
k = kh_put(str, h, bns->anns[i].name, &absent);
kh_val(h, k) = i;
}
while (fscanf(fp, "%s", str) == 1) {
k = kh_get(str, h, str);
if (k != kh_end(h))
bns->anns[kh_val(h, k)].is_alt = 1;
}
kh_destroy(str, h);
fclose(fp);
}
return bns;
} }
void bns_destroy(bntseq_t *bns) void bns_destroy(bntseq_t *bns)

View File

@ -43,6 +43,7 @@ typedef struct {
int32_t len; int32_t len;
int32_t n_ambs; int32_t n_ambs;
uint32_t gi; uint32_t gi;
int32_t is_alt;
char *name, *anno; char *name, *anno;
} bntann1_t; } bntann1_t;

108
bwamem.c
View File

@ -2,6 +2,7 @@
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include <assert.h> #include <assert.h>
#include <limits.h>
#include <math.h> #include <math.h>
#ifdef HAVE_PTHREAD #ifdef HAVE_PTHREAD
#include <pthread.h> #include <pthread.h>
@ -61,6 +62,9 @@ mem_opt_t *mem_opt_init()
o->zdrop = 100; o->zdrop = 100;
o->pen_unpaired = 17; o->pen_unpaired = 17;
o->pen_clip5 = o->pen_clip3 = 5; o->pen_clip5 = o->pen_clip3 = 5;
o->max_mem_intv = 0;
o->min_seed_len = 19; o->min_seed_len = 19;
o->split_width = 10; o->split_width = 10;
o->max_occ = 500; o->max_occ = 500;
@ -119,7 +123,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
// first pass: find all SMEMs // first pass: find all SMEMs
while (x < len) { while (x < len) {
if (seq[x] < 4) { if (seq[x] < 4) {
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i) { for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i]; bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info>>32); // seed length int slen = (uint32_t)p->info - (p->info>>32); // seed length
@ -129,6 +133,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
} else ++x; } else ++x;
} }
// second pass: find MEMs inside a long SMEM // second pass: find MEMs inside a long SMEM
if (opt->max_mem_intv > 0) goto sort_intv;
old_n = a->mem.n; old_n = a->mem.n;
for (k = 0; k < old_n; ++k) { for (k = 0; k < old_n; ++k) {
bwtintv_t *p = &a->mem.a[k]; bwtintv_t *p = &a->mem.a[k];
@ -136,10 +141,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
if (end - start < split_len || p->x[2] > opt->split_width) continue; if (end - start < split_len || p->x[2] > opt->split_width) continue;
bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i) for (i = 0; i < a->mem1.n; ++i)
if ((a->mem1.a[i].info>>32) - (uint32_t)a->mem1.a[i].info >= opt->min_seed_len) if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
kv_push(bwtintv_t, a->mem, a->mem1.a[i]); kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
} }
// sort // sort
sort_intv:
ks_introsort(mem_intv, a->mem.n, a->mem.a); ks_introsort(mem_intv, a->mem.n, a->mem.a);
} }
@ -155,7 +161,7 @@ typedef struct {
typedef struct { typedef struct {
int n, m, first, rid; int n, m, first, rid;
uint32_t w:30, kept:2; uint32_t w:29, kept:2, is_alt:1;
float frac_rep; float frac_rep;
int64_t pos; int64_t pos;
mem_seed_t *seeds; mem_seed_t *seeds;
@ -276,6 +282,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
tmp.seeds[0] = s; tmp.seeds[0] = s;
tmp.rid = rid; tmp.rid = rid;
tmp.is_alt = !!bns->anns[rid].is_alt;
kb_putp(chn, tree, &tmp); kb_putp(chn, tree, &tmp);
} }
} }
@ -329,7 +336,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
int j = chains.a[k]; int j = chains.a[k];
int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]); int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]);
int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]); int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]);
if (e_min > b_max) { // have overlap if (e_min > b_max && (!a[j].is_alt || a[i].is_alt)) { // have overlap; don't consider ovlp where the kept chain is ALT while the current chain is primary
int li = chn_end(a[i]) - chn_beg(a[i]); int li = chn_end(a[i]) - chn_beg(a[i]);
int lj = chn_end(a[j]) - chn_beg(a[j]); int lj = chn_end(a[j]) - chn_beg(a[j]);
int min_l = li < lj? li : lj; int min_l = li < lj? li : lj;
@ -375,9 +382,12 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2)
#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) #define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)) #define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && (a).hash < (b).hash))))
KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
#define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))))
KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2)
#define PATCH_MAX_R_BW 0.05f #define PATCH_MAX_R_BW 0.05f
#define PATCH_MIN_SC_RATIO 0.90f #define PATCH_MIN_SC_RATIO 0.90f
@ -473,21 +483,19 @@ int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int
return n - 1; return n - 1;
} }
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) typedef kvec_t(int) int_v;
static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z)
{ // similar to the loop in mem_chain_flt() { // similar to the loop in mem_chain_flt()
int i, k, tmp; int i, k, tmp;
kvec_t(int) z;
if (n == 0) return;
kv_init(z);
for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
ks_introsort(mem_ars_hash, n, a);
tmp = opt->a + opt->b; tmp = opt->a + opt->b;
tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp; tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp;
tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp; tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp;
kv_push(int, z, 0); z->n = 0;
kv_push(int, *z, 0);
for (i = 1; i < n; ++i) { for (i = 1; i < n; ++i) {
for (k = 0; k < z.n; ++k) { for (k = 0; k < z->n; ++k) {
int j = z.a[k]; int j = z->a[k];
int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe; int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
if (e_min > b_max) { // have overlap if (e_min > b_max) { // have overlap
@ -499,10 +507,32 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i
} }
} }
} }
if (k == z.n) kv_push(int, z, i); if (k == z->n) kv_push(int, *z, i);
else a[i].secondary = z.a[k]; else a[i].secondary = z->a[k];
}
}
int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
{
int i, n_pri;
int_v z = {0,0,0};
if (n == 0) return 0;
for (i = n_pri = 0; i < n; ++i) {
a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
if (!a[i].is_alt) ++n_pri;
}
ks_introsort(mem_ars_hash, n, a);
mem_mark_primary_se_core(opt, n, a, &z);
for (i = 0; i < n; ++i) // don't track the "parent" hit of ALT secondary hits
if (a[i].is_alt && a[i].secondary >= 0)
a[i].secondary = INT_MAX;
if (n_pri > 0 || n_pri != n) {
ks_introsort(mem_ars_hash2, n, a);
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
mem_mark_primary_se_core(opt, n_pri, a, &z);
} }
free(z.a); free(z.a);
return n_pri;
} }
/********************************* /*********************************
@ -625,12 +655,12 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
// qd: distance ahead of the seed on query; rd: on reference // qd: distance ahead of the seed on query; rd: on reference
qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width w = max_gap < p->w? max_gap : p->w; // bounded by the band width
if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
// similar to the previous four lines, but this time we look at the region behind // similar to the previous four lines, but this time we look at the region behind
qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len); qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
max_gap = cal_max_gap(opt, qd < rd? qd : rd); max_gap = cal_max_gap(opt, qd < rd? qd : rd);
w = max_gap < opt->w? max_gap : opt->w; w = max_gap < p->w? max_gap : p->w;
if (qd - rd < w && rd - qd < w) break; if (qd - rd < w && rd - qd < w) break;
} }
if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln
@ -757,7 +787,7 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar)
return l; return l;
} }
void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all, bam_hdr_t *h) void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, bam_hdr_t *h)
{ {
int i, l_name; int i, l_name;
mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert
@ -786,7 +816,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
if (p->n_cigar) { // aligned if (p->n_cigar) { // aligned
for (i = 0; i < p->n_cigar; ++i) { for (i = 0; i < p->n_cigar; ++i) {
int c = p->cigar[i]&0xf; int c = p->cigar[i]&0xf;
if (!softclip_all && (c == 3 || c == 4)) if (!(opt->flag&MEM_F_SOFTCLIP) && (c == 3 || c == 4))
c = which? 4 : 3; // use hard clipping for supplementary alignments c = which? 4 : 3; // use hard clipping for supplementary alignments
kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str); kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str);
} }
@ -814,7 +844,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
kputsn("*\t*", 3, str); kputsn("*\t*", 3, str);
} else if (!p->is_rev) { // the forward strand } else if (!p->is_rev) { // the forward strand
int i, qb = 0, qe = s->l_seq; int i, qb = 0, qe = s->l_seq;
if (p->n_cigar && which && !softclip_all) { // have cigar && not the primary alignment && not softclip all if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP)) { // have cigar && not the primary alignment && not softclip all
if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4; if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4;
if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4; if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4;
} }
@ -828,7 +858,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
} else kputc('*', str); } else kputc('*', str);
} else { // the reverse strand } else { // the reverse strand
int i, qb = 0, qe = s->l_seq; int i, qb = 0, qe = s->l_seq;
if (p->n_cigar && which && !softclip_all) { if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP)) {
if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4; if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4;
if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4; if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4;
} }
@ -873,6 +903,14 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
} }
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
if (s->comment) { kputc('\t', str); kputs(s->comment, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) {
int tmp;
kputsn("\tXR:Z:", 6, str);
tmp = str->l;
kputs(bns->anns[p->rid].anno, str);
for (i = tmp; i < str->l; ++i) // replace TAB in the comment to SPACE
if (str->s[i] == '\t') str->s[i] = ' ';
}
kputc('\n', str); kputc('\n', str);
#ifdef USE_HTSLIB #ifdef USE_HTSLIB
@ -916,7 +954,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
} }
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible // TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h) void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h)
{ {
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
kstring_t str; kstring_t str;
@ -932,8 +970,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
mem_alnreg_t *p = &a->a[k]; mem_alnreg_t *p = &a->a[k];
mem_aln_t *q; mem_aln_t *q;
if (p->score < opt->T) continue; if (p->score < opt->T) continue;
if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue;
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
q = kv_pushp(mem_aln_t, aa); q = kv_pushp(mem_aln_t, aa);
*q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
assert(q->rid >= 0); // this should not happen with the new code assert(q->rid >= 0); // this should not happen with the new code
@ -948,10 +986,10 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
mem_aln_t t; mem_aln_t t;
t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0);
t.flag |= extra_flag; t.flag |= extra_flag;
mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP, h); mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m, h);
} else { } else {
for (k = 0; k < aa.n; ++k) for (k = 0; k < aa.n; ++k)
mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP, h); mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m, h);
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
free(aa.a); free(aa.a);
} }
@ -996,6 +1034,11 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
} }
} }
for (i = 0; i < regs.n; ++i) {
mem_alnreg_t *p = &regs.a[i];
if (p->rid >= 0 && bns->anns[p->rid].is_alt)
p->is_alt = 1;
}
return regs; return regs;
} }
@ -1111,7 +1154,7 @@ static void worker2(void *data, int i, int tid)
mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]);
} else { } else {
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, w->h); mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, w->h);
} }
free(w->regs[i].a); free(w->regs[i].a);
} else { } else {
@ -1129,16 +1172,15 @@ void mem_process_seqs2(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *b
{ {
extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);
worker_t w; worker_t w;
mem_alnreg_v *regs;
mem_pestat_t pes[4]; mem_pestat_t pes[4];
double ctime, rtime; double ctime, rtime;
int i; int i;
ctime = cputime(); rtime = realtime(); ctime = cputime(); rtime = realtime();
global_bns = bns; global_bns = bns;
regs = malloc(n * sizeof(mem_alnreg_v)); w.regs = malloc(n * sizeof(mem_alnreg_v));
w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
w.seqs = seqs; w.regs = regs; w.n_processed = n_processed; w.seqs = seqs; w.n_processed = n_processed;
w.pes = &pes[0]; w.pes = &pes[0];
w.aux = malloc(opt->n_threads * sizeof(smem_aux_t)); w.aux = malloc(opt->n_threads * sizeof(smem_aux_t));
for (i = 0; i < opt->n_threads; ++i) for (i = 0; i < opt->n_threads; ++i)
@ -1150,10 +1192,10 @@ void mem_process_seqs2(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *b
free(w.aux); free(w.aux);
if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0
else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data
} }
kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
free(regs); free(w.regs);
if (bwa_verbose >= 3) if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime);
} }

View File

@ -18,6 +18,7 @@ typedef struct __smem_i smem_i;
#define MEM_F_NO_RESCUE 0x20 #define MEM_F_NO_RESCUE 0x20
#define MEM_F_SELF_OVLP 0x40 #define MEM_F_SELF_OVLP 0x40
#define MEM_F_ALN_REG 0x80 #define MEM_F_ALN_REG 0x80
#define MEM_F_REF_HDR 0x100
#define MEM_F_SOFTCLIP 0x200 #define MEM_F_SOFTCLIP 0x200
typedef struct { typedef struct {
@ -29,6 +30,8 @@ typedef struct {
int w; // band width int w; // band width
int zdrop; // Z-dropoff int zdrop; // Z-dropoff
uint64_t max_mem_intv;
int T; // output score threshold; only affecting output int T; // output score threshold; only affecting output
int flag; // see MEM_F_* macros int flag; // see MEM_F_* macros
int min_seed_len; // minimum seed length int min_seed_len; // minimum seed length
@ -68,7 +71,7 @@ typedef struct {
int seedcov; // length of regions coverged by seeds int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary int secondary; // index of the parent hit shadowing the current hit; <0 if primary
int seedlen0; // length of the starting seed int seedlen0; // length of the starting seed
int n_comp; // number of sub-alignments chained together int n_comp:30, is_alt:2; // number of sub-alignments chained together
float frac_rep; float frac_rep;
uint64_t hash; uint64_t hash;
} mem_alnreg_t; } mem_alnreg_t;
@ -100,6 +103,7 @@ extern "C" {
smem_i *smem_itr_init(const bwt_t *bwt); smem_i *smem_itr_init(const bwt_t *bwt);
void smem_itr_destroy(smem_i *itr); void smem_itr_destroy(smem_i *itr);
void smem_set_query(smem_i *itr, int len, const uint8_t *query); void smem_set_query(smem_i *itr, int len, const uint8_t *query);
void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv);
const bwtintv_v *smem_next(smem_i *itr); const bwtintv_v *smem_next(smem_i *itr);
mem_opt_t *mem_opt_init(void); mem_opt_t *mem_opt_init(void);

View File

@ -1,3 +1,4 @@
#include <limits.h>
#include "bwa.h" #include "bwa.h"
#include "bwamem.h" #include "bwamem.h"
#include "bntseq.h" #include "bntseq.h"
@ -11,6 +12,8 @@ struct __smem_i {
const bwt_t *bwt; const bwt_t *bwt;
const uint8_t *query; const uint8_t *query;
int start, len; int start, len;
int min_intv, max_len;
uint64_t max_intv;
bwtintv_v *matches; // matches; to be returned by smem_next() bwtintv_v *matches; // matches; to be returned by smem_next()
bwtintv_v *sub; // sub-matches inside the longest match; temporary bwtintv_v *sub; // sub-matches inside the longest match; temporary
bwtintv_v *tmpvec[2]; // temporary arrays bwtintv_v *tmpvec[2]; // temporary arrays
@ -25,6 +28,9 @@ smem_i *smem_itr_init(const bwt_t *bwt)
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
itr->matches = calloc(1, sizeof(bwtintv_v)); itr->matches = calloc(1, sizeof(bwtintv_v));
itr->sub = calloc(1, sizeof(bwtintv_v)); itr->sub = calloc(1, sizeof(bwtintv_v));
itr->min_intv = 1;
itr->max_len = INT_MAX;
itr->max_intv = 0;
return itr; return itr;
} }
@ -44,6 +50,13 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
itr->len = len; itr->len = len;
} }
void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
{
itr->min_intv = min_intv;
itr->max_len = max_len;
itr->max_intv = max_intv;
}
const bwtintv_v *smem_next(smem_i *itr) const bwtintv_v *smem_next(smem_i *itr)
{ {
int i, max, max_i, ori_start; int i, max, max_i, ori_start;
@ -52,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr)
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0; if (itr->start == itr->len) return 0;
ori_start = itr->start; ori_start = itr->start;
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
bwtintv_t *p = &itr->matches->a[i]; bwtintv_t *p = &itr->matches->a[i];
@ -127,7 +140,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
cnt = calloc(a->n, sizeof(int)); cnt = calloc(a->n, sizeof(int));
for (i = 0, tot = 0; i < a->n; ++i) { for (i = 0, tot = 0; i < a->n; ++i) {
int j = a->a[i].secondary; int j = a->a[i].secondary;
if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio) if (j >= 0 && j < INT_MAX && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio)
++cnt[j], ++tot; ++cnt[j], ++tot;
} }
if (tot == 0) goto end_gen_alt; if (tot == 0) goto end_gen_alt;
@ -135,7 +148,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
for (i = 0; i < a->n; ++i) { for (i = 0; i < a->n; ++i) {
mem_aln_t t; mem_aln_t t;
int j = a->a[i].secondary; int j = a->a[i].secondary;
if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later if (j < 0 || j == INT_MAX || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
if (cnt[j] > opt->max_hits) continue; if (cnt[j] > opt->max_hits) continue;
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
kputs(bns->anns[t.rid].name, &aln[j]); kputs(bns->anns[t.rid].name, &aln[j]);

View File

@ -177,14 +177,14 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
return n; return n;
} }
int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2])
{ {
pair64_v v, u; pair64_v v, u;
int r, i, k, y[4], ret; // y[] keeps the last hit int r, i, k, y[4], ret; // y[] keeps the last hit
int64_t l_pac = bns->l_pac; int64_t l_pac = bns->l_pac;
kv_init(v); kv_init(u); kv_init(v); kv_init(u);
for (r = 0; r < 2; ++r) { // loop through read number for (r = 0; r < 2; ++r) { // loop through read number
for (i = 0; i < a[r].n; ++i) { for (i = 0; i < n_pri[r]; ++i) {
pair64_t key; pair64_t key;
mem_alnreg_t *e = &a[r].a[i]; mem_alnreg_t *e = &a[r].a[i];
key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
@ -244,13 +244,13 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], bam_hdr_t *header) int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], bam_hdr_t *header)
{ {
extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h); extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, bam_hdr_t *h);
extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, int softclip_all, bam_hdr_t *h); extern void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, bam_hdr_t *h);
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query); extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2];
kstring_t str; kstring_t str;
mem_aln_t h[2]; mem_aln_t h[2];
@ -267,18 +267,18 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a); free(b[0].a); free(b[1].a);
} }
mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
// pairing single-end hits // pairing single-end hits
if (a[0].n && a[1].n && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
int is_multi[2], q_pe, score_un, q_se[2]; int is_multi[2], q_pe, score_un, q_se[2];
char **XA[2]; char **XA[2];
// check if an end has multiple hits even after mate-SW // check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) { for (i = 0; i < 2; ++i) {
for (j = 1; j < a[i].n; ++j) for (j = 1; j < n_pri[i]; ++j)
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break; if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
is_multi[i] = j < a[i].n? 1 : 0; is_multi[i] = j < n_pri[i]? 1 : 0;
} }
if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
// compute mapQ for the best SE hit // compute mapQ for the best SE hit
@ -316,7 +316,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
int k = a[i].a[z[i]].secondary; int k = a[i].a[z[i]].secondary;
if (k >= 0) { // switch secondary and primary if (k >= 0) { // switch secondary and primary
assert(a[i].a[k].secondary < 0); assert(a[i].a[k].secondary < 0);
for (j = 0; j < a[i].n; ++j) for (j = 0; j < n_pri[i]; ++j)
if (a[i].a[j].secondary == k || j == k) if (a[i].a[j].secondary == k || j == k)
a[i].a[j].secondary = z[i]; a[i].a[j].secondary = z[i];
a[i].a[z[i]].secondary = -1; a[i].a[z[i]].secondary = -1;
@ -327,13 +327,13 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
// write SAM // write SAM
h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0; h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0;
h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0; h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0;
mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP, header); mem_aln2sam(opt, bns, &str, &s[0], 1, &h[0], 0, &h[1], header);
#ifndef USE_HTSLIB #ifndef USE_HTSLIB
s[0].sam = strdup(str.s); str.l = 0; /// not using HTSLIB s[0].sam = strdup(str.s); str.l = 0;
#endif #endif
mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP, header); mem_aln2sam(opt, bns, &str, &s[1], 1, &h[1], 0, &h[0], header);
#ifndef USE_HTSLIB #ifndef USE_HTSLIB
s[1].sam = str.s; // not using HTSLIB s[1].sam = str.s;
#endif #endif
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar); // h[0].XA will be freed in the following block free(h[0].cigar); // h[0].XA will be freed in the following block
@ -360,8 +360,8 @@ no_pairing:
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;
} }
mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1], header); mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1], header);
mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0], header); mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0], header);
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar); free(h[1].cigar); free(h[0].cigar); free(h[1].cigar);
return n; return n;

16
bwt.c
View File

@ -30,6 +30,7 @@
#include <string.h> #include <string.h>
#include <assert.h> #include <assert.h>
#include <stdint.h> #include <stdint.h>
#include <limits.h>
#include "utils.h" #include "utils.h"
#include "bwt.h" #include "bwt.h"
#include "kvec.h" #include "kvec.h"
@ -285,7 +286,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
} }
} }
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{ {
int i, j, c, ret; int i, j, c, ret;
bwtintv_t ik, ok[4]; bwtintv_t ik, ok[4];
@ -307,6 +308,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
if (ok[c].x[2] != ik.x[2]) { // change of the interval size if (ok[c].x[2] != ik.x[2]) { // change of the interval size
kv_push(bwtintv_t, *curr, ik); kv_push(bwtintv_t, *curr, ik);
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
if (i - x > max_len && ik.x[2] < max_intv) break;
} }
ik = ok[c]; ik.info = i + 1; ik = ok[c]; ik.info = i + 1;
} else { // an ambiguous base } else { // an ambiguous base
@ -323,8 +325,13 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
for (j = 0, curr->n = 0; j < prev->n; ++j) { for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j]; bwtintv_t *p = &prev->a[j];
int max_stop = 0;
bwt_extend(bwt, p, ok, 1); bwt_extend(bwt, p, ok, 1);
if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough if (ok[c].x[2] != p->x[2]) { // change of interval size
if (p->info - i - 1 > max_len && p->x[2] < max_intv)
max_stop = 1;
}
if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32; ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
@ -346,6 +353,11 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
return ret; return ret;
} }
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec);
}
/************************* /*************************
* Read/write BWT and SA * * Read/write BWT and SA *
*************************/ *************************/

2
bwt.h
View File

@ -92,6 +92,7 @@ extern "C" {
void bwt_destroy(bwt_t *bwt); void bwt_destroy(bwt_t *bwt);
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
void bwt_cal_sa(bwt_t *bwt, int intv); void bwt_cal_sa(bwt_t *bwt, int intv);
void bwt_bwtupdate_core(bwt_t *bwt); void bwt_bwtupdate_core(bwt_t *bwt);
@ -118,6 +119,7 @@ extern "C" {
* Return the end of the longest exact match starting from _x_. * Return the end of the longest exact match starting from _x_.
*/ */
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
// SMEM iterator interface // SMEM iterator interface

View File

@ -1598,15 +1598,20 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o
} }
} }
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size)
{ {
BWTInc *bwtInc; BWTInc *bwtInc;
bwtInc = BWTIncConstructFromPacked(fn_pac, 10000000, 10000000); bwtInc = BWTIncConstructFromPacked(fn_pac, block_size, block_size);
printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
BWTIncFree(bwtInc); BWTIncFree(bwtInc);
} }
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
{
bwt_bwtgen2(fn_pac, fn_bwt, 10000000);
}
int bwt_bwtgen_main(int argc, char *argv[]) int bwt_bwtgen_main(int argc, char *argv[])
{ {
if (argc < 3) { if (argc < 3) {

View File

@ -189,11 +189,11 @@ int bwa_index(int argc, char *argv[]) // the "index" command
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
char *prefix = 0, *str, *str2, *str3; char *prefix = 0, *str, *str2, *str3;
int c, algo_type = 0, is_64 = 0; int c, algo_type = 0, is_64 = 0, block_size = 10000000;
clock_t t; clock_t t;
int64_t l_pac; int64_t l_pac;
while ((c = getopt(argc, argv, "6a:p:")) >= 0) { while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
switch (c) { switch (c) {
case 'a': // if -a is not set, algo_type will be determined later case 'a': // if -a is not set, algo_type will be determined later
if (strcmp(optarg, "div") == 0) algo_type = 1; if (strcmp(optarg, "div") == 0) algo_type = 1;
@ -203,20 +203,26 @@ int bwa_index(int argc, char *argv[]) // the "index" command
break; break;
case 'p': prefix = strdup(optarg); break; case 'p': prefix = strdup(optarg); break;
case '6': is_64 = 1; break; case '6': is_64 = 1; break;
case 'b':
block_size = strtol(optarg, &str, 10);
if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024;
else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024;
else if (*str == 'K' || *str == 'k') block_size *= 1024;
break;
default: return 1; default: return 1;
} }
} }
if (optind + 1 > argc) { if (optind + 1 > argc) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] <in.fasta>\n\n"); fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n"); fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n"); fprintf(stderr, " `-a div' do not work not for long genomes.\n\n");
fprintf(stderr, " according to the length of the genome.\n\n");
return 1; return 1;
} }
if (prefix == 0) { if (prefix == 0) {
@ -242,7 +248,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
strcpy(str2, prefix); strcat(str2, ".bwt"); strcpy(str2, prefix); strcat(str2, ".bwt");
t = clock(); t = clock();
fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
if (algo_type == 2) bwt_bwtgen(str, str2); if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
else if (algo_type == 1 || algo_type == 3) { else if (algo_type == 1 || algo_type == 3) {
bwt_t *bwt; bwt_t *bwt;
bwt = bwt_pac2bwt(str, algo_type == 3); bwt = bwt_pac2bwt(str, algo_type == 3);

View File

@ -3,6 +3,7 @@
#include <unistd.h> #include <unistd.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <limits.h>
#include <ctype.h> #include <ctype.h>
#include <math.h> #include <math.h>
#include "bwa.h" #include "bwa.h"
@ -38,6 +39,7 @@ int main_mem(int argc, char *argv[])
{ {
mem_opt_t *opt, opt0; mem_opt_t *opt, opt0;
int fd, fd2, i, c, n, copy_comment = 0; int fd, fd2, i, c, n, copy_comment = 0;
int fixed_chunk_size = -1, actual_chunk_size;
gzFile fp, fp2 = 0; gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0; kseq_t *ks, *ks2 = 0;
bseq1_t *seqs; bseq1_t *seqs;
@ -54,10 +56,11 @@ int main_mem(int argc, char *argv[])
opt = mem_opt_init(); opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t)); memset(&opt0, 0, sizeof(mem_opt_t));
#ifdef USE_HTSLIB #ifdef USE_HTSLIB
while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:o:")) >= 0) { while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:o:")) >= 0)
#else #else
while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:")) >= 0) { while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0)
#endif #endif
{
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == 'x') mode = optarg; else if (c == 'x') mode = optarg;
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
@ -74,6 +77,7 @@ int main_mem(int argc, char *argv[])
else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
else if (c == 'F') opt->flag |= MEM_F_ALN_REG; else if (c == 'F') opt->flag |= MEM_F_ALN_REG;
else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP; else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg);
@ -85,7 +89,9 @@ int main_mem(int argc, char *argv[])
else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
else if (c == 'C') copy_comment = 1; else if (c == 'C') copy_comment = 1;
else if (c == 'K') fixed_chunk_size = atoi(optarg);
else if (c == 'Q') { else if (c == 'Q') {
opt0.mapQ_coef_len = 1; opt0.mapQ_coef_len = 1;
opt->mapQ_coef_len = atoi(optarg); opt->mapQ_coef_len = atoi(optarg);
@ -141,6 +147,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT [%ld]\n", (long)opt->max_mem_intv);
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
@ -149,15 +156,16 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -S skip mate rescue\n");
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
fprintf(stderr, " -e discard full-length exact matches\n"); fprintf(stderr, " -e discard full-length exact matches\n");
fprintf(stderr, "\nScoring options:\n\n");
fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a); fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired);
fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n"); fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n");
fprintf(stderr, " pacbio: -k17 -W40 -r10 -A2 -B5 -O2 -E1 -L0\n"); fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0\n");
fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001\n"); // fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
@ -167,6 +175,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -h INT if there are <INT hits with score >80%% of the max score, output all in XA [%d]\n", opt->max_hits); fprintf(stderr, " -h INT if there are <INT hits with score >80%% of the max score, output all in XA [%d]\n", opt->max_hits);
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
fprintf(stderr, " -V output the reference FASTA header in the XR tag\n");
fprintf(stderr, " -Y use soft clipping for supplementary alignments\n"); fprintf(stderr, " -Y use soft clipping for supplementary alignments\n");
fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); fprintf(stderr, " -M mark shorter split hits as secondary\n\n");
fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n");
@ -185,13 +194,13 @@ int main_mem(int argc, char *argv[])
if (mode) { if (mode) {
if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) {
if (!opt0.a) opt->a = 2, opt0.a = 1; if (!opt0.a) opt->a = 1, opt0.a = 1;
update_a(opt, &opt0); update_a(opt, &opt0);
if (!opt0.o_del) opt->o_del = 2; if (!opt0.o_del) opt->o_del = 1;
if (!opt0.e_del) opt->e_del = 1; if (!opt0.e_del) opt->e_del = 1;
if (!opt0.o_ins) opt->o_ins = 2; if (!opt0.o_ins) opt->o_ins = 1;
if (!opt0.e_ins) opt->e_ins = 1; if (!opt0.e_ins) opt->e_ins = 1;
if (!opt0.b) opt->b = 5; if (!opt0.b) opt->b = 1;
if (opt0.split_factor == 0.) opt->split_factor = 10.; if (opt0.split_factor == 0.) opt->split_factor = 10.;
if (!opt0.min_chain_weight) opt->min_chain_weight = 40; if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) {
@ -210,7 +219,6 @@ int main_mem(int argc, char *argv[])
return 1; // FIXME memory leak return 1; // FIXME memory leak
} }
} else update_a(opt, &opt0); } else update_a(opt, &opt0);
// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF
bwa_fill_scmat(opt->a, opt->b, opt->mat); bwa_fill_scmat(opt->a, opt->b, opt->mat);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
@ -264,7 +272,8 @@ int main_mem(int argc, char *argv[])
bwa_print_sam_hdr(idx->bns, rg_line); bwa_print_sam_hdr(idx->bns, rg_line);
#endif #endif
} }
while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) {
int64_t size = 0; int64_t size = 0;
if ((opt->flag & MEM_F_PE) && (n&1) == 1) { if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
if (bwa_verbose >= 2) if (bwa_verbose >= 2)
@ -315,7 +324,8 @@ int main_mem(int argc, char *argv[])
int main_fastmap(int argc, char *argv[]) int main_fastmap(int argc, char *argv[])
{ {
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX;
uint64_t max_intv = 0;
kseq_t *seq; kseq_t *seq;
bwtint_t k; bwtint_t k;
gzFile fp; gzFile fp;
@ -323,16 +333,26 @@ int main_fastmap(int argc, char *argv[])
const bwtintv_v *a; const bwtintv_v *a;
bwaidx_t *idx; bwaidx_t *idx;
while ((c = getopt(argc, argv, "w:l:p")) >= 0) { while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) {
switch (c) { switch (c) {
case 'p': print_seq = 1; break; case 'p': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break; case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break; case 'l': min_len = atoi(optarg); break;
case 'i': min_intv = atoi(optarg); break;
case 'I': max_intv = atol(optarg); break;
case 'L': max_len = atoi(optarg); break;
default: return 1; default: return 1;
} }
} }
if (optind + 1 >= argc) { if (optind + 1 >= argc) {
fprintf(stderr, "Usage: bwa fastmap [-p] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth); fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth);
fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);
fprintf(stderr, " -l INT max MEM length [%d]\n", max_len);
fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv);
fprintf(stderr, "\n");
return 1; return 1;
} }
@ -340,6 +360,7 @@ int main_fastmap(int argc, char *argv[])
seq = kseq_init(fp); seq = kseq_init(fp);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
itr = smem_itr_init(idx->bwt); itr = smem_itr_init(idx->bwt);
smem_config(itr, min_intv, max_len, max_intv);
while (kseq_read(seq) >= 0) { while (kseq_read(seq) >= 0) {
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) { if (print_seq) {

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h" #include "utils.h"
#ifndef PACKAGE_VERSION #ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r806-dirty" #define PACKAGE_VERSION "0.7.10-r830-dirty"
#endif #endif
int bwa_fa2pac(int argc, char *argv[]); int bwa_fa2pac(int argc, char *argv[]);

11
utils.c
View File

@ -219,6 +219,17 @@ int err_fputs(const char *s, FILE *stream)
return ret; return ret;
} }
int err_puts(const char *s)
{
int ret = puts(s);
if (EOF == ret)
{
_err_fatal_simple("puts", strerror(errno));
}
return ret;
}
int err_fflush(FILE *stream) int err_fflush(FILE *stream)
{ {
int ret = fflush(stream); int ret = fflush(stream);

View File

@ -80,7 +80,7 @@ extern "C" {
int err_fputc(int c, FILE *stream); int err_fputc(int c, FILE *stream);
#define err_putchar(C) err_fputc((C), stdout) #define err_putchar(C) err_fputc((C), stdout)
int err_fputs(const char *s, FILE *stream); int err_fputs(const char *s, FILE *stream);
#define err_puts(S) err_fputs((S), stdout) int err_puts(const char *s);
int err_fflush(FILE *stream); int err_fflush(FILE *stream);
int err_fclose(FILE *stream); int err_fclose(FILE *stream);
int err_gzclose(gzFile file); int err_gzclose(gzFile file);