diff --git a/Makefile b/Makefile
index 70b89b0..390819f 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ AOBJS=		QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamli
 			is.o bwtindex.o bwape.o \
 			bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
 			bwtsw2_chain.o fastmap.o bwtsw2_pair.o
-PROG=		bwa
+PROG=		bwa bwamem-lite
 INCLUDES=	
 LIBS=		-lm -lz -lpthread
 SUBDIRS=	.
@@ -25,6 +25,9 @@ all:$(PROG)
 bwa:libbwa.a $(AOBJS) main.o
 		$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
 
+bwamem-lite:libbwa.a example.o
+		$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
+
 libbwa.a:$(LOBJS)
 		$(AR) -csru $@ $(LOBJS)
 
diff --git a/NEWS b/NEWS
index d68c693..35202f1 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,56 @@
+Beta Release 0.7.0 (28 Feburary, 2013)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query
+sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap
+algorithm and extends seeds with banded affine-gap-penalty dynamic programming
+(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or
+longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA
+and BWA-SW and is more accurate. It also supports split alignments like BWA-SW
+and may optionally output multiple hits like BWA. BWA-MEM does not guarantee
+to find hits within a certain edit distance, but BWA is not efficient for such
+task given longer reads anyway, and the edit-distance criterion is arguably
+not as important in long-read alignment.
+
+In addition to the algorithmic improvements, BWA-MEM also implements a few
+handy features in practical aspects:
+
+ 1. BWA-MEM automatically switches between local and glocal (global wrt reads;
+	local wrt reference) alignment. It reports the end-to-end glocal alignment
+	if the glocal alignment is not much worse than the optimal local alignment.
+	Glocal alignment reduces reference bias.
+
+ 2. BWA-MEM automatically infers pair orientation from a batch of single-end
+    alignments. It allows more than one orientations if there are sufficient
+	supporting reads. This feature has not been tested on reads from Illumina
+	jumping library yet. (EXPERIMENTAL)
+
+ 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It
+    is possible to convert a name-sorted BAM to an interleaved fastq on the fly
+    and feed the data stream to BWA-MEM for mapping.
+
+ 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which
+    helps to transfer individual read annotations to the output.
+
+ 5. BWA-MEM supports more advanced piping. Users can now run:
+    (bwa mem ref.fa '<bzcat r1.fq.bz2' '<bzcat r2.fq.bz2') to map bzip'd read
+    files without replying on bash features.
+
+ 6. BWA-MEM provides a few basic APIs for single-end mapping. The `example.c'
+	program in the source code directory implements a full single-end mapper in
+	50 lines of code.
+
+The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM
+for production use yet. However, when the implementation becomes stable after a
+few release cycles, existing BWA users are recommended to migrate to BWA-MEM
+for 76bp or longer Illumina reads and long query sequences. The original BWA
+short-read algorithm will not deliver satisfactory results for 150bp+ Illumina
+reads. Change of mappers will be necessary sooner or later.
+
+(0.7.0 beta: 28 Feburary 2013, r313)
+
+
+
 Release 0.6.2 (19 June, 2012)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/bwa.1 b/bwa.1
index 66bc9a2..45b9921 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,47 +1,45 @@
-.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools"
+.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools"
 .SH NAME
 .PP
 bwa - Burrows-Wheeler Alignment Tool
 .SH SYNOPSIS
 .PP
-bwa index -a bwtsw database.fasta
+bwa index ref.fa
 .PP
-bwa aln database.fasta short_read.fastq > aln_sa.sai
+bwa mem ref.fa reads.fq > aln-se.sam
 .PP
-bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
+bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
 .PP
-bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam
+bwa aln ref.fa short_read.fq > aln_sa.sai
 .PP
-bwa bwasw database.fasta long_read.fastq > aln.sam
+bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
+.PP
+bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
+.PP
+bwa bwasw ref.fa long_read.fq > aln.sam
 
 .SH DESCRIPTION
 .PP
-BWA is a fast light-weighted tool that aligns relatively short sequences
-(queries) to a sequence database (targe), such as the human reference
-genome. It implements two different algorithms, both based on
-Burrows-Wheeler Transform (BWT). The first algorithm is designed for
-short queries up to ~150bp with low error rate (<3%). It does gapped
-global alignment w.r.t. queries, supports paired-end reads, and is one
-of the fastest short read alignment algorithms to date while also
-visiting suboptimal hits. The second algorithm, BWA-SW, is designed for
-reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like
-alignment to find high-scoring local hits and split hits. On
-low-error short queries, BWA-SW is a little slower and less accurate than the
-first algorithm, but on long queries, it is better.
-.PP
-For both algorithms, the database file in the FASTA format must be
-first indexed with the
-.B `index'
-command, which typically takes a few hours for a 3GB genome. The first algorithm is
-implemented via the
-.B `aln'
-command, which finds the suffix array (SA) coordinates of good hits of
-each individual read, and the
-.B `samse/sampe'
-command, which converts SA coordinates to chromosomal coordinate and
-pairs reads (for `sampe'). The second algorithm is invoked by the
-.B `bwasw'
-command. It works for single-end reads only.
+BWA is a software package for mapping low-divergent sequences against a large
+reference genome, such as the human genome. It consists of three algorithms:
+BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
+sequence reads up to 100bp, while the rest two for longer sequences ranged from
+70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read
+support and split alignment, but BWA-MEM, which is the latest, is generally
+recommended for high-quality queries as it is faster and more accurate.
+BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina
+reads.
+
+For all the algorithms, BWA first needs to construct the FM-index for
+the reference genome (the
+.B index
+command). Alignment algorithms are invoked with different sub-commands:
+.BR aln / samse / sampe
+for BWA-backtrack,
+.B bwasw
+for BWA-SW and
+.B mem
+for the BWA-MEM algorithm.
 
 .SH COMMANDS AND OPTIONS
 .TP
@@ -53,9 +51,6 @@ Index database sequences in the FASTA format.
 .B OPTIONS:
 .RS
 .TP 10
-.B -c
-Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x)
-.TP
 .BI -p \ STR
 Prefix of the output database [same as db filename]
 .TP
@@ -76,6 +71,175 @@ genome.
 .RE
 .RE
 
+.TP
+.B mem
+.B bwa mem
+.RB [ -aCHMpP ]
+.RB [ -t
+.IR nThreads ]
+.RB [ -k
+.IR minSeedLen ]
+.RB [ -w
+.IR bandWidth ]
+.RB [ -r
+.IR seedSplitRatio ]
+.RB [ -c
+.IR maxOcc ]
+.RB [ -A
+.IR matchScore ]
+.RB [ -B
+.IR mmPenalty ]
+.RB [ -O
+.IR gapOpenPen ]
+.RB [ -E
+.IR gapExtPen ]
+.RB [ -L
+.IR clipPen ]
+.RB [ -U
+.IR unpairPen ]
+.RB [ -R
+.IR RGline ]
+.RB [ -v
+.IR verboseLevel ]
+.I db.prefix
+.I reads.fq
+.RI [ mates.fq ]
+
+Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the
+algorithm works by seeding alignments with maximal exact matches (MEMs) and
+then extending seeds with the affine-gap Smith-Waterman algorithm (SW).
+
+If
+.I mates.fq
+file is absent and option
+.B -p
+is not set, this command regards input reads are single-end. If
+.I mates.fq
+is present, this command assumes the
+.IR i -th
+read in
+.I reads.fq
+and the
+.IR i -th
+read in
+.I mates.fq
+constitute a read pair. If
+.B -p
+is used, the command assumes the
+.RI 2 i -th
+and the
+.RI (2 i +1)-th
+read in
+.I reads.fq
+constitute a read pair (such input file is said to be interleaved). In this case,
+.I mates.fq
+is ignored. In the paired-end mode, the
+.B mem
+command will infer the read orientation and the insert size distribution from a
+batch of reads.
+
+The BWA-MEM algorithm performs local alignment. It may produce multiple primary
+alignments for different part of a query sequence. This is a crucial feature
+for long sequences. However, some tools such as Picard's markDuplicates does
+not work with split alignments. One may consider to use option
+.B -M
+to flag shorter split hits as secondary.
+
+.B OPTIONS:
+.RS
+.TP 10
+.BI -t \ INT
+Number of threads [1]
+.TP
+.BI -k \ INT
+Minimum seed length. Matches shorter than
+.I INT
+will be missed. The alignment speed is usually insensitive to this value unless
+it significantly deviates 20. [19]
+.TP
+.BI -w \ INT
+Band width. Essentially, gaps longer than
+.I INT
+will not be found. Note that the maximum gap length is also affected by the
+scoring matrix and the hit length, not solely determined by this option. [100]
+.TP
+.BI -r \ FLOAT
+Trigger re-seeding for a MEM longer than
+.IR minSeedLen * FLOAT .
+This is a key heuristic parameter for tuning the performance. Larger value
+yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
+.TP
+.BI -c \ INT
+Discard a MEM if it has more than
+.I INT
+occurence in the genome. This is an insensitive parameter. [10000]
+.TP
+.B -P
+In the paired-end mode, perform SW to rescue missing hits only but do not try to find
+hits that fit a proper pair.
+.TP
+.BI -A \ INT
+Matching score. [1]
+.TP
+.BI -B \ INT
+Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
+.TP
+.BI -O \ INT
+Gap open penalty. [6]
+.TP
+.BI -E \ INT
+Gap extension penalty. A gap of length k costs O + k*E (i.e.
+.B -O
+is for opening a zero-length gap). [1]
+.TP
+.BI -L \ INT
+Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
+score reaching the end of query. If this score is larger than the best SW score
+minus the clipping penalty, clipping will not be applied. Note that in this
+case, the SAM AS tag reports the best SW score; clipping penalty is not
+deducted. [5]
+.TP
+.BI -U \ INT
+Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
+.RI scoreRead1+scoreRead2- INT
+and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
+two scores to determine whether we should force pairing. [9]
+.TP
+.B -p
+Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.
+.TP
+.BI -R \ STR
+Complete read group header line. '\\t' can be used in
+.I STR
+and will be converted to a TAB in the output SAM. The read group ID will be
+attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
+[null]
+.TP
+.B -a
+Output all found alignments for single-end or unpaired paired-end reads. These
+alignments will be flagged as secondary alignments.
+.TP
+.B -C
+Append append FASTA/Q comment to SAM output. This option can be used to
+transfer read meta information (e.g. barcode) to the SAM output. Note that the
+FASTA/Q comment (the string after a space in the header line) must conform the SAM
+spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.
+.TP
+.B -H
+Use hard clipping 'H' in the SAM output. This option may dramatically reduce
+the redundancy of output when mapping long contig or BAC sequences.
+.TP
+.B -M
+Mark shorter split hits as secondary (for Picard compatibility).
+.TP
+.BI -v \ INT
+Control the verbose level of the output. This option has not been fully
+supported throughout BWA. Ideally, a value 0 for disabling all the output to
+stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
+all normal messages; 4 or higher for debugging. When this option takes value
+4, the output is not SAM. [3]
+.RE
+
 .TP
 .B aln
 bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
@@ -482,24 +646,6 @@ Pairing is slower for shorter reads. This is mainly because shorter
 reads have more spurious hits and converting SA coordinates to
 chromosomal coordinates are very costly.
 
-.SH NOTES ON LONG-READ ALIGNMENT
-.PP
-Command
-.B bwasw
-is designed for long-read alignment. BWA-SW essentially aligns the trie
-of the reference genome against the directed acyclic word graph (DAWG) of a
-read to find seeds not highly repetitive in the genome, and then performs a
-standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called
-the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the
-top Z reference suffix intervals that match the vertex. BWA-SW is more accurate
-if the resultant alignment is supported by more seeds, and therefore BWA-SW
-usually performs better on long queries or queries with low divergence to the
-reference genome.
-
-BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads
-mainly because it gives better gapped alignment. For paired-end reads, it is yet
-to know whether BWA-short or BWA-SW yield overall better results.
-
 .SH CHANGES IN BWA-0.6
 .PP
 Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
@@ -534,16 +680,23 @@ The full BWA package is distributed under GPLv3 as it uses source codes
 from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
 libraries are distributed under the MIT license.
 .PP
-If you use the short-read alignment component, please cite the following
+If you use the BWA-backtrack algorithm, please cite the following
 paper:
 .PP
 Li H. and Durbin R. (2009) Fast and accurate short read alignment with
 Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
 .PP
-If you use the long-read component (BWA-SW), please cite:
+If you use the BWA-SW algorithm, please cite:
 .PP
 Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
 Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
+.PP
+If you use the fastmap component of BWA, please cite:
+.PP
+Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de
+novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178]
+.PP
+The BWA-MEM algorithm has not been published yet.
 
 .SH HISTORY
 BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
@@ -569,3 +722,11 @@ short-read aligners are being implemented.
 
 The BWA-SW algorithm is a new component of BWA. It was conceived in
 November 2008 and implemented ten months later.
+
+The BWA-MEM algorithm is based on an algorithm finding super-maximal exact
+matches (SMEMs), which was first published with the fermi assembler paper
+in 2012. I first implemented the basic SMEM algorithm in the
+.B fastmap
+command for an experiment and then extended the basic algorithm and added the
+extension part in Feburary 2013 to make BWA-MEM a fully featured mapper.
+
diff --git a/bwa.c b/bwa.c
index 55a0a9b..b3c9191 100644
--- a/bwa.c
+++ b/bwa.c
@@ -74,7 +74,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
 {
 	uint32_t *cigar = 0;
 	uint8_t tmp, *rseq;
-	int i, w;
+	int i;
 	int64_t rlen;
 	*n_cigar = 0; *NM = -1;
 	if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
@@ -86,15 +86,26 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
 		for (i = 0; i < rlen>>1; ++i)
 			tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
 	}
-	//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
-	//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
-	// set the band-width
-	w = (int)((double)(l_query * mat[0] - q) / r + 1.);
-	w = w < 1? w : 1;
-	w = w < w_? w : w_;
-	w += abs(rlen - l_query);
-	// NW alignment
-	*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
+	if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
+		cigar = xmalloc(4);
+		cigar[0] = l_query<<4 | 0;
+		*n_cigar = 1;
+		for (i = 0, *score = 0; i < l_query; ++i)
+			*score += mat[rseq[i]*5 + query[i]];
+	} else {
+		int w, max_gap, min_w;
+		//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
+		//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
+		// set the band-width
+		max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
+		max_gap = max_gap > 1? max_gap : 1;
+		w = (max_gap + abs(rlen - l_query) + 1) >> 1;
+		w = w < w_? w : w_;
+		min_w = abs(rlen - l_query) + 3;
+		w = w > min_w? w : min_w;
+		// NW alignment
+		*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
+	}
 	{// compute NM
 		int k, x, y, n_mm = 0, n_gap = 0;
 		for (k = 0, x = y = 0; k < *n_cigar; ++k) {
diff --git a/bwamem.c b/bwamem.c
index 5a8a4dc..b11a790 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -42,8 +42,10 @@ mem_opt_t *mem_opt_init()
 {
 	mem_opt_t *o;
 	o = xcalloc(1, sizeof(mem_opt_t));
-	o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
 	o->flag = 0;
+	o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
+	o->pen_unpaired = 9;
+	o->pen_clip = 5;
 	o->min_seed_len = 19;
 	o->split_width = 10;
 	o->max_occ = 10000;
@@ -54,7 +56,6 @@ mem_opt_t *mem_opt_init()
 	o->split_factor = 1.5;
 	o->chunk_size = 10000000;
 	o->n_threads = 1;
-	o->pen_unpaired = 9;
 	o->max_matesw = 100;
 	mem_fill_scmat(o->a, o->b, o->mat);
 	return o;
@@ -487,23 +488,27 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
 
 		if (s->qbeg) { // left extension
 			uint8_t *rs, *qs;
-			int qle, tle;
+			int qle, tle, gtle, gscore;
 			qs = xmalloc(s->qbeg);
 			for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
 			tmp = s->rbeg - rmax[0];
 			rs = xmalloc(tmp);
 			for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
-			a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle);
-			a->qb = s->qbeg - qle; a->rb = s->rbeg - tle;
+			a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, &gtle, &gscore);
+			// check whether we prefer to reach the end of the query
+			if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
+			else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end
 			free(qs); free(rs);
 		} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
 
 		if (s->qbeg + s->len != l_query) { // right extension
-			int qle, tle, qe, re;
+			int qle, tle, qe, re, gtle, gscore;
 			qe = s->qbeg + s->len;
 			re = s->rbeg + s->len - rmax[0];
-			a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle);
-			a->qe = qe + qle; a->re = rmax[0] + re + tle;
+			a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, &gtle, &gscore);
+			// similar to the above
+			if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
+			else a->qe = l_query, a->re = rmax[0] + re + gtle;
 		} else a->qe = l_query, a->re = s->rbeg + s->len;
 		if (bwa_verbose >= 4) err_printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
 
@@ -521,6 +526,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
  * Basic hit->SAM conversion *
  *****************************/
 
+static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
+{
+	int w;
+	if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps
+	w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.);
+	if (w < abs(l1 - l2)) w = abs(l1 - l2);
+	return w;
+}
+
 void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
 {
 #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
@@ -547,7 +561,10 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
 		int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
 		if (p->flag&0x10000) sam_flag |= 0x100;
 		if (!copy_mate) {
-			cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
+			int w2;
+			w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r);
+			w2 = w2 < w? w2 : w;
+			cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
 			p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened)
 		} else n_cigar = 0, cigar = 0;
 		pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev);
@@ -674,7 +691,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
 	s->sam = str.s;
 }
 
-mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
+mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
 {
 	int i;
 	mem_chain_v chn;
@@ -698,6 +715,46 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
 	return regs;
 }
 
+mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
+{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se()
+	mem_alnreg_v ar;
+	ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq);
+	mem_mark_primary_se(opt, ar.n, ar.a);
+	return ar;
+}
+
+// This routine is only used for the API purpose
+mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar)
+{
+	mem_aln_t a;
+	int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev;
+	int64_t pos, rb = ar->rb, re = ar->re;
+	memset(&a, 0, sizeof(mem_aln_t));
+	a.mapq = mem_approx_mapq_se(opt, ar);
+	bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
+	w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
+	w2 = w2 < opt->w? w2 : opt->w;
+	a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
+	a.NM = NM;
+	pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
+	a.is_rev = is_rev;
+	if (qb != 0 || qe != l_query) { // add clipping to CIGAR
+		int clip5, clip3;
+		clip5 = is_rev? l_query - qe : qb;
+		clip3 = is_rev? qb : l_query - qe;
+		a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
+		if (clip5) {
+			memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
+			a.cigar[0] = clip5<<4|3;
+			++a.n_cigar;
+		}
+		if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3;
+	}
+	a.rid = bns_pos2rid(bns, pos);
+	a.pos = pos - bns->anns[a.rid].offset;
+	return a;
+}
+
 typedef struct {
 	int start, step, n;
 	const mem_opt_t *opt;
@@ -715,11 +772,11 @@ static void *worker1(void *data)
 	int i;
 	if (!(w->opt->flag&MEM_F_PE)) {
 		for (i = w->start; i < w->n; i += w->step)
-			w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
+			w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
 	} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
 		for (i = w->start; i < w->n>>1; i += w->step) {
-			w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
-			w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
+			w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
+			w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
 		}
 	}
 	return 0;
diff --git a/bwamem.h b/bwamem.h
index 8a7c7b8..c2f124c 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -19,7 +19,10 @@ typedef struct __smem_i smem_i;
 
 typedef struct {
 	int a, b, q, r;         // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
+	int pen_unpaired;       // phred-scaled penalty for unpaired reads
+	int pen_clip;           // clipping penalty. This score is not deducted from the DP score.
 	int w;                  // band width
+
 	int flag;               // see MEM_F_* macros
 	int min_seed_len;       // minimum seed length
 	float split_factor;     // split into a seed if MEM is longer than min_seed_len*split_factor
@@ -30,7 +33,6 @@ typedef struct {
 	int chunk_size;         // process chunk_size-bp sequences in a batch
 	float mask_level;       // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
 	float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
-	int pen_unpaired;       // phred-scaled penalty for unpaired reads
 	int max_ins;            // when estimating insert size distribution, skip pairs with insert longer than this value
 	int max_matesw;         // perform maximally max_matesw rounds of mate-SW for each end
 	int8_t mat[25];         // scoring matrix; mat[0] == 0 if unset
@@ -47,19 +49,27 @@ typedef struct {
 	int secondary;  // index of the parent hit shadowing the current hit; <0 if primary
 } mem_alnreg_t;
 
+typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
+
 typedef struct {
 	int low, high, failed;
 	double avg, std;
 } mem_pestat_t;
 
-typedef struct {
+typedef struct { // TODO: This is an intermediate struct only. Better get rid of it.
 	int64_t rb, re;
 	int qb, qe, flag, qual;
 	// optional info
 	int score, sub;
 } bwahit_t;
 
-typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
+typedef struct { // This struct is only used for the convenience of API.
+	int rid;
+	int pos;
+	uint32_t is_rev:1, mapq:8, NM:23;
+	int n_cigar;
+	uint32_t *cigar;
+} mem_aln_t;
 
 #ifdef __cplusplus
 extern "C" {
@@ -112,6 +122,8 @@ extern "C" {
 	 */
 	mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
 
+	mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar);
+
 	/**
 	 * Infer the insert size distribution from interleaved alignment regions
 	 *
diff --git a/example.c b/example.c
new file mode 100644
index 0000000..31bb231
--- /dev/null
+++ b/example.c
@@ -0,0 +1,60 @@
+#include <stdio.h>
+#include <zlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include "bwamem.h"
+#include "kseq.h" // for the FASTA/Q parser
+KSEQ_DECLARE(gzFile)
+
+int main(int argc, char *argv[])
+{
+	bwaidx_t *idx;
+	gzFile fp;
+	kseq_t *ks;
+	mem_opt_t *opt;
+
+	if (argc < 3) {
+		fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
+		return 1;
+	}
+
+	idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
+	if (NULL == idx) {
+		fprintf(stderr, "Index load failed.\n");
+		exit(EXIT_FAILURE);
+	}
+	fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+	if (NULL == fp) {
+		fprintf(stderr, "Couldn't open %s : %s\n",
+				strcmp(argv[2], "-") ? argv[2] : "stdin",
+				errno ? strerror(errno) : "Out of memory");
+		exit(EXIT_FAILURE);
+	}
+	ks = kseq_init(fp); // initialize the FASTA/Q parser
+	opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values
+
+	while (kseq_read(ks) >= 0) { // read one sequence
+		mem_alnreg_v ar;
+		int i, k;
+		ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
+		for (i = 0; i < ar.n; ++i) { // traverse each hit
+			mem_aln_t a;
+			if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
+			a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
+			// print alignment
+			err_printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq);
+			for (k = 0; k < a.n_cigar; ++k) // print CIGAR
+				err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
+			err_printf("\t%d\n", a.NM); // print edit distance
+			free(a.cigar); // don't forget to deallocate CIGAR
+		}
+		free(ar.a); // and deallocate the hit list
+	}
+
+	free(opt);
+	kseq_destroy(ks);
+	gzclose(fp);
+	bwa_idx_destroy(idx);
+	return 0;
+}
diff --git a/fastmap.c b/fastmap.c
index ec517e2..0d87007 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -27,13 +27,15 @@ int main_mem(int argc, char *argv[])
 	void *ko = 0, *ko2 = 0;
 
 	opt = mem_opt_init();
-	while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) {
+	while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) {
 		if (c == 'k') opt->min_seed_len = atoi(optarg);
 		else if (c == 'w') opt->w = atoi(optarg);
 		else if (c == 'A') opt->a = atoi(optarg);
 		else if (c == 'B') opt->b = atoi(optarg);
 		else if (c == 'O') opt->q = atoi(optarg);
 		else if (c == 'E') opt->r = atoi(optarg);
+		else if (c == 'L') opt->pen_clip = atoi(optarg);
+		else if (c == 'U') opt->pen_unpaired = atoi(optarg);
 		else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
 		else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
 		else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
@@ -57,13 +59,15 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -k INT     minimum seed length [%d]\n", opt->min_seed_len);
 		fprintf(stderr, "       -w INT     band width for banded alignment [%d]\n", opt->w);
 		fprintf(stderr, "       -r FLOAT   look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
-		fprintf(stderr, "       -s INT     look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
+//		fprintf(stderr, "       -s INT     look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
 		fprintf(stderr, "       -c INT     skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
 		fprintf(stderr, "       -P         skip pairing; perform mate SW only\n");
 		fprintf(stderr, "       -A INT     score for a sequence match [%d]\n", opt->a);
 		fprintf(stderr, "       -B INT     penalty for a mismatch [%d]\n", opt->b);
 		fprintf(stderr, "       -O INT     gap open penalty [%d]\n", opt->q);
 		fprintf(stderr, "       -E INT     gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
+		fprintf(stderr, "       -L INT     penalty for clipping [%d]\n", opt->pen_clip);
+		fprintf(stderr, "       -U INT     penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
 		fprintf(stderr, "\nInput/output options:\n\n");
 		fprintf(stderr, "       -p         first query file consists of interleaved paired-end sequences\n");
 		fprintf(stderr, "       -R STR     read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
@@ -73,6 +77,7 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -C         append FASTA/FASTQ comment to SAM output\n");
 		fprintf(stderr, "       -H         hard clipping\n");
 		fprintf(stderr, "       -M         mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
+		fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
 		fprintf(stderr, "\n");
 		free(opt);
 		return 1;
diff --git a/ksw.c b/ksw.c
index 32b8a3a..3c279d7 100644
--- a/ksw.c
+++ b/ksw.c
@@ -360,11 +360,11 @@ typedef struct {
 	int32_t h, e;
 } eh_t;
 
-int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle)
+int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore)
 {
 	eh_t *eh; // score array
 	int8_t *qp; // query profile
-	int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
+	int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore;
 	if (h0 < 0) h0 = 0;
 	// allocate memory
 	qp = xmalloc(qlen * m);
@@ -386,7 +386,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 	max_gap = max_gap > 1? max_gap : 1;
 	w = w < max_gap? w : max_gap;
 	// DP loop
-	max = h0, max_i = max_j = -1;
+	max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
 	beg = 0, end = qlen;
 	for (i = 0; LIKELY(i < tlen); ++i) {
 		int f = 0, h1, m = 0, mj = -1;
@@ -422,6 +422,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 			f = f > h? f : h;   // computed F(i,j+1)
 		}
 		eh[end].h = h1; eh[end].e = 0;
+		if (j == qlen) {
+			max_ie = gscore > h1? max_ie : i;
+			gscore = gscore > h1? gscore : h1;
+		}
 		if (m == 0) break;
 		if (m > max) max = m, max_i = i, max_j = mj;
 		// update beg and end for the next round
@@ -434,6 +438,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
 	free(eh); free(qp);
 	if (_qle) *_qle = max_j + 1;
 	if (_tle) *_tle = max_i + 1;
+	if (_gtle) *_gtle = max_ie + 1;
+	if (_gscore) *_gscore = gscore;
 	return max;
 }
 
diff --git a/ksw.h b/ksw.h
index 5162dc0..d2975de 100644
--- a/ksw.h
+++ b/ksw.h
@@ -30,7 +30,7 @@ extern "C" {
 	 * @param tlen    length of the target sequence
 	 * @param target  target sequence
 	 * @param m       number of residue types
-	 * @param mat     m*m scoring matrix in one-dimention array
+	 * @param mat     m*m scoring matrix in one-dimension array
 	 * @param gapo    gap open penalty; a gap of length l cost "-(gapo+l*gape)"
 	 * @param gape    gap extension penalty
 	 * @param xtra    extra information (see below)
@@ -62,8 +62,47 @@ extern "C" {
 	 */
 	kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
 
-	int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle);
-	int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar);
+	/**
+	 * Banded global alignment
+	 *
+	 * @param qlen    query length
+	 * @param query   query sequence with 0 <= query[i] < m
+	 * @param tlen    target length
+	 * @param target  target sequence with 0 <= target[i] < m
+	 * @param m       number of residue types
+	 * @param mat     m*m scoring mattrix in one-dimension array
+	 * @param gapo    gap open penalty; a gap of length l cost "-(gapo+l*gape)"
+	 * @param gape    gap extension penalty
+	 * @param w       band width
+	 * @param n_cigar (out) number of CIGAR elements
+	 * @param cigar   (out) BAM-encoded CIGAR; caller need to deallocate with free()
+	 *
+	 * @return        score of the alignment
+	 */
+	int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
+
+	/**
+	 * Extend alignment
+	 *
+	 * The routine aligns $query and $target, assuming their upstream sequences,
+	 * which are not provided, have been aligned with score $h0. In return,
+	 * region [0,*qle) on the query and [0,*tle) on the target sequences are
+	 * aligned together. If *gscore>=0, *gscore keeps the best score such that
+	 * the entire query sequence is aligned; *gtle keeps the position on the
+	 * target where *gscore is achieved. Returning *gscore and *gtle helps the
+	 * caller to decide whether an end-to-end hit or a partial hit is preferred.
+	 *
+	 * The first 9 parameters are identical to those in ksw_global()
+	 *
+	 * @param h0      alignment score of upstream sequences
+	 * @param _qle    (out) length of the query in the alignment
+	 * @param _tle    (out) length of the target in the alignment
+	 * @param _gtle   (out) length of the target if query is fully aligned
+	 * @param _gscore (out) score of the best end-to-end alignment; negative if not found
+	 *
+	 * @return        best semi-local alignment score
+	 */
+	int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore);
 
 #ifdef __cplusplus
 }
diff --git a/main.c b/main.c
index 764c0d2..db25f82 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.6.2-r301-beta"
+#define PACKAGE_VERSION "0.7.0-r313"
 #endif
 
 static int usage()