Merge branch 'dev'

2014-10-22 16:17:36 -04:00 · 2014-10-22 16:17:36 -04:00 · af606fc31a
parent acf4922fe4 3d129be642
commit af606fc31a
6 changed files with 24 additions and 19 deletions
--- a/README-alt.md
+++ b/README-alt.md
@ -14,13 +14,13 @@ wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/H
 gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
 mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa
 bwa index hs38a.fa
-cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt
+cp bwa-hs38-bundle/hs38d4.fa.alt hs38a.fa.alt
 ```

 Perform mapping:
 ```sh
 bwa mem hs38a.fa read1.fq read2.fq \
-  | bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \
+  | bwa-hs38-bundle/k8-linux bwa-postalt.js hs38a.fa.alt \
  | samtools view -bS - > aln.unsrt.bam
 ```
 For short reads, the postprocessing script `bwa-postalt.js` runs at about the
@ -30,14 +30,14 @@ same speed as BAM compression.

 Construct the index:
 ```sh
-cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa
+cat hs38a.fa bwa-hs38-bundle/hs38d4-extra.fa > hs38d4.fa
 bwa index hs38d4.fa
-cp bwa-hs38-res/hs38d4.fa.alt .
+cp bwa-hs38-bundle/hs38d4.fa.alt .
 ```
 Perform mapping:
 ```sh
 bwa mem hs38d4.fa read1.fq read2.fq \
-  | bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
+  | bwa-hs38-bundle/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
  | samtools view -bS - > aln.unsrt.bam
 ```
 This command line generates `postinfo.ctw` which loosely evaluates the presence
@ -88,7 +88,9 @@ alignments and assigns mapQ following these two rules:

 In theory, non-ALT alignments from step 1 should be identical to alignments
 against a reference genome with ALT contigs. In practice, the two types of
-alignments may differ in rare cases due to seeding heuristics.
+alignments may differ in rare cases due to seeding heuristics. When an ALT hit
+is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
+non-ALT hits. This happens more often for contig mapping.

 If we don't care about ALT hits, we may skip postprocessing (step 2).
 Nonetheless, postprocessing is recommended as it improves mapQ and gives more
@ -134,7 +136,7 @@ not to high resolution for now.

 ### Evaluating ALT Mapping

-(To come later...)
+(Coming soon...)

 ## Problems and Future Development

--- a/bwamem.c
+++ b/bwamem.c
@ -59,7 +59,7 @@ mem_opt_t *mem_opt_init()
 	o->pen_unpaired = 17;
 	o->pen_clip5 = o->pen_clip3 = 5;

-	o->max_mem_intv = 10;
+	o->max_mem_intv = 20;

 	o->min_seed_len = 19;
 	o->split_width = 10;
@ -147,7 +147,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
 			if (seq[x] < 4) {
 				if (1) {
 					bwtintv_t m;
-					x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m);
+					x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
 					if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
 				} else { // for now, we never come to this block which is slower
 					x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
@ -274,7 +274,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
 		bwtintv_t *p = &aux->mem.a[i];
 		int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
 		int64_t k;
-		if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
+		// if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
 		step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
 		for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
 			mem_chain_t tmp, *lower, *upper;
@ -514,7 +514,8 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *
 				int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
 				if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
 					if (a[j].sub == 0) a[j].sub = a[i].score;
-					if (a[j].score - a[i].score <= tmp) ++a[j].sub_n;
+					if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt))
+						++a[j].sub_n;
 					break;
 				}
 			}
@ -980,14 +981,14 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
 	extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
 	kstring_t str;
 	kvec_t(mem_aln_t) aa;
-	int k;
+	int k, l;
 	char **XA = 0;

 	if (!(opt->flag & MEM_F_ALL))
 		XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
 	kv_init(aa);
 	str.l = str.m = 0; str.s = 0;
-	for (k = 0; k < a->n; ++k) {
+	for (k = l = 0; k < a->n; ++k) {
 		mem_alnreg_t *p = &a->a[k];
 		mem_aln_t *q;
 		if (p->score < opt->T) continue;
@ -999,9 +1000,10 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
 		q->XA = XA? XA[k] : 0;
 		q->flag |= extra_flag; // flag secondary
 		if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
-		if (k && p->secondary < 0) // if supplementary
+		if (l && p->secondary < 0) // if supplementary
 			q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
-		if (k && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
+		if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
+		++l;
 	}
 	if (aa.n == 0) { // no alignments good enough; then write an unaligned record
 		mem_aln_t t;
--- a/bwamem_pair.c
+++ b/bwamem_pair.c
@ -70,6 +70,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
 		if (q->n < MIN_DIR_CNT) {
 			fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
 			r->failed = 1;
+			free(q->a);
 			continue;
 		} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
 		ks_introsort_64(q->n, q->a);
--- a/bwt.c
+++ b/bwt.c
@ -355,7 +355,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
 	return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
 }

-int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem)
+int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
 {
 	int i, c;
 	bwtintv_t ik, ok[4];
@ -367,7 +367,7 @@ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int m
 		if (q[i] < 4) { // an A/C/G/T base
 			c = 3 - q[i]; // complement of q[i]
 			bwt_extend(bwt, &ik, ok, 0);
-			if (ok[c].x[2] < max_intv) {
+			if (ok[c].x[2] < max_intv && i - x >= min_len) {
 				*mem = ok[c];
 				mem->info = (uint64_t)x<<32 | (i + 1);
 				return i + 1;
--- a/bwt.h
+++ b/bwt.h
@ -121,7 +121,7 @@ extern "C" {
 	int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
 	int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);

-	int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem);
+	int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);

 #ifdef __cplusplus
 }
--- a/main.c
+++ b/main.c
@ -4,7 +4,7 @@
 #include "utils.h"

 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.10-r933-dirty"
+#define PACKAGE_VERSION "0.7.10-r943-dirty"
 #endif

 int bwa_fa2pac(int argc, char *argv[]);