2018-02-09 23:00:35 +08:00
#!/usr/bin/env k8
/ * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* * * * * Library functions * * * * *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * /
/ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Command line option parsing *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * /
var getopt = function ( args , ostr ) {
var oli ; // option letter list index
if ( typeof ( getopt . place ) == 'undefined' )
getopt . ind = 0 , getopt . arg = null , getopt . place = - 1 ;
if ( getopt . place == - 1 ) { // update scanning pointer
if ( getopt . ind >= args . length || args [ getopt . ind ] . charAt ( getopt . place = 0 ) != '-' ) {
getopt . place = - 1 ;
return null ;
}
if ( getopt . place + 1 < args [ getopt . ind ] . length && args [ getopt . ind ] . charAt ( ++ getopt . place ) == '-' ) { // found "--"
++ getopt . ind ;
getopt . place = - 1 ;
return null ;
}
}
var optopt = args [ getopt . ind ] . charAt ( getopt . place ++ ) ; // character checked for validity
if ( optopt == ':' || ( oli = ostr . indexOf ( optopt ) ) < 0 ) {
if ( optopt == '-' ) return null ; // if the user didn't specify '-' as an option, assume it means null.
if ( getopt . place < 0 ) ++ getopt . ind ;
return '?' ;
}
if ( oli + 1 >= ostr . length || ostr . charAt ( ++ oli ) != ':' ) { // don't need argument
getopt . arg = null ;
if ( getopt . place < 0 || getopt . place >= args [ getopt . ind ] . length ) ++ getopt . ind , getopt . place = - 1 ;
} else { // need an argument
if ( getopt . place >= 0 && getopt . place < args [ getopt . ind ] . length )
getopt . arg = args [ getopt . ind ] . substr ( getopt . place ) ;
else if ( args . length <= ++ getopt . ind ) { // no arg
getopt . place = - 1 ;
if ( ostr . length > 0 && ostr . charAt ( 0 ) == ':' ) return ':' ;
return '?' ;
} else getopt . arg = args [ getopt . ind ] ; // white space
getopt . place = - 1 ;
++ getopt . ind ;
}
return optopt ;
}
/ * * * * * * * * * * * * * * * * * * * * * * *
* Interval operations *
* * * * * * * * * * * * * * * * * * * * * * * /
Interval = { } ;
Interval . sort = function ( a )
{
if ( typeof a [ 0 ] == 'number' )
a . sort ( function ( x , y ) { return x - y } ) ;
else a . sort ( function ( x , y ) { return x [ 0 ] != y [ 0 ] ? x [ 0 ] - y [ 0 ] : x [ 1 ] - y [ 1 ] } ) ;
}
Interval . merge = function ( a , sorted )
{
if ( typeof sorted == 'undefined' ) sorted = true ;
if ( ! sorted ) Interval . sort ( a ) ;
var k = 0 ;
for ( var i = 1 ; i < a . length ; ++ i ) {
if ( a [ k ] [ 1 ] >= a [ i ] [ 0 ] )
a [ k ] [ 1 ] = a [ k ] [ 1 ] > a [ i ] [ 1 ] ? a [ k ] [ 1 ] : a [ i ] [ 1 ] ;
else a [ ++ k ] = a [ i ] . slice ( 0 ) ;
}
a . length = k + 1 ;
}
Interval . index _end = function ( a , sorted )
{
if ( a . length == 0 ) return ;
if ( typeof sorted == 'undefined' ) sorted = true ;
if ( ! sorted ) Interval . sort ( a ) ;
a [ 0 ] . push ( 0 ) ;
var k = 0 , k _en = a [ 0 ] [ 1 ] ;
for ( var i = 1 ; i < a . length ; ++ i ) {
if ( k _en <= a [ i ] [ 0 ] ) {
for ( ++ k ; k < i ; ++ k )
if ( a [ k ] [ 1 ] > a [ i ] [ 0 ] )
break ;
k _en = a [ k ] [ 1 ] ;
}
a [ i ] . push ( k ) ;
}
}
Interval . find _intv = function ( a , x )
{
var left = - 1 , right = a . length ;
if ( typeof a [ 0 ] == 'number' ) {
while ( right - left > 1 ) {
var mid = left + ( ( right - left ) >> 1 ) ;
if ( a [ mid ] > x ) right = mid ;
else if ( a [ mid ] < x ) left = mid ;
else return mid ;
}
} else {
while ( right - left > 1 ) {
var mid = left + ( ( right - left ) >> 1 ) ;
if ( a [ mid ] [ 0 ] > x ) right = mid ;
else if ( a [ mid ] [ 0 ] < x ) left = mid ;
else return mid ;
}
}
return left ;
}
Interval . find _ovlp = function ( a , st , en )
{
if ( a . length == 0 || st >= en ) return [ ] ;
var l = Interval . find _intv ( a , st ) ;
var k = l < 0 ? 0 : a [ l ] [ a [ l ] . length - 1 ] ;
var b = [ ] ;
for ( var i = k ; i < a . length ; ++ i ) {
if ( a [ i ] [ 0 ] >= en ) break ;
else if ( st < a [ i ] [ 1 ] )
b . push ( a [ i ] ) ;
}
return b ;
}
/ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Reverse and reverse complement *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * /
Bytes . prototype . reverse = function ( )
{
for ( var i = 0 ; i < this . length >> 1 ; ++ i ) {
var tmp = this [ i ] ;
this [ i ] = this [ this . length - i - 1 ] ;
this [ this . length - i - 1 ] = tmp ;
}
}
// reverse complement a DNA string
Bytes . prototype . revcomp = function ( )
{
if ( Bytes . rctab == null ) {
var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn' ;
var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn' ;
Bytes . rctab = [ ] ;
for ( var i = 0 ; i < 256 ; ++ i ) Bytes . rctab [ i ] = 0 ;
for ( var i = 0 ; i < s1 . length ; ++ i )
Bytes . rctab [ s1 . charCodeAt ( i ) ] = s2 . charCodeAt ( i ) ;
}
for ( var i = 0 ; i < this . length >> 1 ; ++ i ) {
var tmp = this [ this . length - i - 1 ] ;
this [ this . length - i - 1 ] = Bytes . rctab [ this [ i ] ] ;
this [ i ] = Bytes . rctab [ tmp ] ;
}
if ( this . length & 1 )
this [ this . length >> 1 ] = Bytes . rctab [ this [ this . length >> 1 ] ] ;
}
/ * * * * * * * * * * * * * * * * * * * *
* * * * * paftools * * * * *
* * * * * * * * * * * * * * * * * * * * /
// variant calling
function paf _call ( args )
{
var re _cs = /([:=*+-])(\d+|[A-Za-z]+)/g ;
var c , min _cov _len = 10000 , min _var _len = 50000 , gap _thres = 50 , min _mapq = 5 ;
while ( ( c = getopt ( args , "l:L:g:q:B:" ) ) != null ) {
if ( c == 'l' ) min _cov _len = parseInt ( getopt . arg ) ;
else if ( c == 'L' ) min _var _len = parseInt ( getopt . arg ) ;
else if ( c == 'g' ) gap _thres = parseInt ( getopt . arg ) ;
else if ( c == 'q' ) min _mapq = parseInt ( getopt . arg ) ;
}
if ( args . length == getopt . ind ) {
print ( "Usage: sort -k6,6 -k8,8n <with-cs.paf> | paftools.js call [options] -" ) ;
print ( "Options:" ) ;
print ( " -l INT min alignment length to compute coverage [" + min _cov _len + "]" ) ;
print ( " -L INT min alignment length to call variants [" + min _var _len + "]" ) ;
print ( " -q INT min mapping quality [" + min _mapq + "]" ) ;
print ( " -g INT short/long gap threshold (for statistics only) [" + gap _thres + "]" ) ;
exit ( 1 ) ;
}
var file = args [ getopt . ind ] == '-' ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var buf = new Bytes ( ) ;
var tot _len = 0 , n _sub = [ 0 , 0 , 0 ] , n _ins = [ 0 , 0 , 0 , 0 ] , n _del = [ 0 , 0 , 0 , 0 ] ;
function count _var ( o )
{
if ( o [ 3 ] > 1 ) return ;
if ( o [ 5 ] == '-' && o [ 6 ] == '-' ) return ;
if ( o [ 5 ] == '-' ) { // insertion
var l = o [ 6 ] . length ;
if ( l == 1 ) ++ n _ins [ 0 ] ;
else if ( l == 2 ) ++ n _ins [ 1 ] ;
else if ( l < gap _thres ) ++ n _ins [ 2 ] ;
else ++ n _ins [ 3 ] ;
} else if ( o [ 6 ] == '-' ) { // deletion
var l = o [ 5 ] . length ;
if ( l == 1 ) ++ n _del [ 0 ] ;
else if ( l == 2 ) ++ n _del [ 1 ] ;
else if ( l < gap _thres ) ++ n _del [ 2 ] ;
else ++ n _del [ 3 ] ;
} else {
++ n _sub [ 0 ] ;
var s = o [ 5 ] + o [ 6 ] ;
if ( s == 'ag' || s == 'ga' || s == 'ct' || s == 'tc' )
++ n _sub [ 1 ] ;
else ++ n _sub [ 2 ] ;
}
}
var a = [ ] , out = [ ] ;
var c1 _ctg = null , c1 _start = 0 , c1 _end = 0 , c1 _counted = false , c1 _len = 0 ;
while ( file . readline ( buf ) >= 0 ) {
var line = buf . toString ( ) ;
if ( ! /\ts2:i:/ . test ( line ) ) continue ; // skip secondary alignments
var m , t = line . split ( "\t" , 12 ) ;
for ( var i = 6 ; i <= 11 ; ++ i )
t [ i ] = parseInt ( t [ i ] ) ;
if ( t [ 10 ] < min _cov _len || t [ 11 ] < min _mapq ) continue ;
for ( var i = 1 ; i <= 3 ; ++ i )
t [ i ] = parseInt ( t [ i ] ) ;
var ctg = t [ 5 ] , x = t [ 7 ] , end = t [ 8 ] ;
var query = t [ 0 ] , rev = ( t [ 4 ] == '-' ) , y = rev ? t [ 3 ] : t [ 2 ] ;
// compute regions covered by 1 contig
if ( ctg != c1 _ctg || x >= c1 _end ) {
if ( c1 _counted && c1 _end > c1 _start ) {
c1 _len += c1 _end - c1 _start ;
print ( 'R' , c1 _ctg , c1 _start , c1 _end ) ;
}
c1 _ctg = ctg , c1 _start = x , c1 _end = end ;
c1 _counted = ( t [ 10 ] >= min _var _len ) ;
} else if ( end > c1 _end ) { // overlap
if ( c1 _counted && x > c1 _start ) {
c1 _len += x - c1 _start ;
print ( 'R' , c1 _ctg , c1 _start , x ) ;
}
c1 _start = c1 _end , c1 _end = end ;
c1 _counted = ( t [ 10 ] >= min _var _len ) ;
} else { // contained
if ( c1 _counted && x > c1 _start ) {
c1 _len += x - c1 _start ;
print ( 'R' , c1 _ctg , c1 _start , x ) ;
}
c1 _start = end ;
}
// output variants ahead of this alignment
while ( out . length ) {
if ( out [ 0 ] [ 0 ] != ctg || out [ 0 ] [ 2 ] <= x ) {
count _var ( out [ 0 ] ) ;
print ( 'V' , out [ 0 ] . join ( "\t" ) ) ;
out . shift ( ) ;
} else break ;
}
// update coverage
for ( var i = 0 ; i < out . length ; ++ i )
if ( out [ i ] [ 1 ] >= x && out [ i ] [ 2 ] <= end )
++ out [ i ] [ 3 ] ;
// drop alignments that don't overlap with the current one
var k = 0 ;
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ 0 ] [ 0 ] == ctg && a [ 0 ] [ 2 ] > x )
a [ k ++ ] = a [ i ] ;
a . length = k ;
// core loop
if ( t [ 10 ] >= min _var _len ) {
if ( ( m = /\tcs:Z:(\S+)/ . exec ( line ) ) == null ) continue ; // no cs tag
var cs = m [ 1 ] ;
var blen = 0 , n _diff = 0 ;
tot _len += t [ 10 ] ;
while ( ( m = re _cs . exec ( cs ) ) != null ) {
var cov = 1 ;
if ( m [ 1 ] == '*' || m [ 1 ] == '+' || m [ 1 ] == '-' )
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ 0 ] [ 2 ] > x ) ++ cov ;
var qs , qe ;
if ( m [ 1 ] == '=' || m [ 1 ] == ':' ) {
var l = m [ 1 ] == '=' ? m [ 2 ] . length : parseInt ( m [ 2 ] ) ;
if ( rev ) y -= l ;
else y += l ;
x += l , blen += l ;
} else if ( m [ 1 ] == '*' ) {
if ( rev ) qs = y - 1 , qe = y , -- y ;
else qs = y , qe = y + 1 , ++ y ;
out . push ( [ t [ 5 ] , x , x + 1 , cov , t [ 11 ] , m [ 2 ] . charAt ( 0 ) , m [ 2 ] . charAt ( 1 ) , query , qs , qe , rev ? '-' : '+' ] ) ;
++ x , ++ blen , ++ n _diff ;
} else if ( m [ 1 ] == '+' ) {
var l = m [ 2 ] . length ;
if ( rev ) qs = y - l , qe = y , y -= l ;
else qs = y , qe = y + l , y += l ;
out . push ( [ t [ 5 ] , x , x , cov , t [ 11 ] , '-' , m [ 2 ] , query , qs , qe , rev ? '-' : '+' ] ) ;
++ blen , ++ n _diff ;
} else if ( m [ 1 ] == '-' ) {
var l = m [ 2 ] . length ;
out . push ( [ t [ 5 ] , x , x + l , cov , t [ 11 ] , m [ 2 ] , '-' , query , y , y , rev ? '-' : '+' ] ) ;
x += l , ++ blen , ++ n _diff ;
}
}
}
a . push ( [ t [ 5 ] , t [ 7 ] , t [ 8 ] ] ) ;
}
if ( c1 _counted && c1 _end > c1 _start ) {
c1 _len += c1 _end - c1 _start ;
print ( 'R' , c1 _ctg , c1 _start , c1 _end ) ;
}
while ( out . length ) {
count _var ( out [ 0 ] ) ;
print ( 'V' , out [ 0 ] . join ( "\t" ) ) ;
out . shift ( ) ;
}
//warn(tot_len + " alignment columns considered in calling");
warn ( c1 _len + " reference bases covered by exactly one contig" ) ;
warn ( n _sub [ 0 ] + " substitutions; ts/tv = " + ( n _sub [ 1 ] / n _sub [ 2 ] ) . toFixed ( 3 ) ) ;
warn ( n _del [ 0 ] + " 1bp deletions" ) ;
warn ( n _ins [ 0 ] + " 1bp insertions" ) ;
warn ( n _del [ 1 ] + " 2bp deletions" ) ;
warn ( n _ins [ 1 ] + " 2bp insertions" ) ;
warn ( n _del [ 2 ] + " [3," + gap _thres + ") deletions" ) ;
warn ( n _ins [ 2 ] + " [3," + gap _thres + ") insertions" ) ;
warn ( n _del [ 3 ] + " >=" + gap _thres + " deletions" ) ;
warn ( n _ins [ 3 ] + " >=" + gap _thres + " insertions" ) ;
buf . destroy ( ) ;
file . close ( ) ;
}
2018-02-09 23:23:30 +08:00
/ * * * * * * * * * * * * * * * * * * * * * * * * * *
* * * Conversion related * * *
* * * * * * * * * * * * * * * * * * * * * * * * * * /
function paf _view ( args )
{
var c , line _len = 80 , fmt = "aln" ;
while ( ( c = getopt ( args , "f:l:" ) ) != null ) {
if ( c == 'f' ) {
fmt = getopt . arg ;
if ( fmt != "aln" && fmt != "lastz-cigar" && fmt != "maf" )
throw Error ( "format must be one of aln, lastz-cigar and maf" ) ;
} else if ( c == 'l' ) line _len = parseInt ( getopt . arg ) ;
}
if ( line _len == 0 ) line _len = 0x7fffffff ;
if ( getopt . ind == args . length ) {
print ( "Usage: paftools.js view [options] <in.paf>" ) ;
print ( "Options:" ) ;
print ( " -f STR output format: aln (BLAST-like), maf or lastz-cigar [aln]" ) ;
print ( " -l INT line length in BLAST-like output [80]" ) ;
exit ( 1 ) ;
}
function padding _str ( x , len , right )
{
var s = x . toString ( ) ;
if ( s . length < len ) {
if ( right ) s += Array ( len - s . length + 1 ) . join ( " " ) ;
else s = Array ( len - s . length + 1 ) . join ( " " ) + s ;
}
return s ;
}
function update _aln ( s _ref , s _qry , s _mid , type , seq , slen )
{
var l = type == '*' ? 1 : seq . length ;
if ( type == '=' || type == ':' ) {
s _ref . set ( seq ) ;
s _qry . set ( seq ) ;
s _mid . set ( Array ( l + 1 ) . join ( "|" ) ) ;
slen [ 0 ] += l , slen [ 1 ] += l ;
} else if ( type == '*' ) {
s _ref . set ( seq . charAt ( 0 ) ) ;
s _qry . set ( seq . charAt ( 1 ) ) ;
s _mid . set ( ' ' ) ;
slen [ 0 ] += 1 , slen [ 1 ] += 1 ;
} else if ( type == '+' ) {
s _ref . set ( Array ( l + 1 ) . join ( "-" ) ) ;
s _qry . set ( seq ) ;
s _mid . set ( Array ( l + 1 ) . join ( " " ) ) ;
slen [ 1 ] += l ;
} else if ( type == '-' ) {
s _ref . set ( seq ) ;
s _qry . set ( Array ( l + 1 ) . join ( "-" ) ) ;
s _mid . set ( Array ( l + 1 ) . join ( " " ) ) ;
slen [ 0 ] += l ;
}
}
function print _aln ( rs , qs , strand , slen , elen , s _ref , s _qry , s _mid )
{
print ( [ "Ref+:" , padding _str ( rs + slen [ 0 ] + 1 , 10 , false ) , s _ref . toString ( ) , padding _str ( rs + elen [ 0 ] , 10 , true ) ] . join ( " " ) ) ;
print ( " " + s _mid . toString ( ) ) ;
var st , en ;
if ( strand == '+' ) st = qs + slen [ 1 ] + 1 , en = qs + elen [ 1 ] ;
else st = qs - slen [ 1 ] , en = qs - elen [ 1 ] + 1 ;
print ( [ "Qry" + strand + ":" , padding _str ( st , 10 , false ) , s _qry . toString ( ) , padding _str ( en , 10 , true ) ] . join ( " " ) ) ;
}
var s _ref = new Bytes ( ) , s _qry = new Bytes ( ) , s _mid = new Bytes ( ) ; // these are used to show padded alignment
var re _cs = /([:=\-\+\*])(\d+|[A-Za-z]+)/g ;
var re _cg = /(\d+)([MIDNSH])/g ;
var buf = new Bytes ( ) ;
var file = args [ getopt . ind ] == "-" ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var lineno = 0 ;
if ( fmt == "maf" ) print ( "##maf version=1\n" ) ;
while ( file . readline ( buf ) >= 0 ) {
var m , line = buf . toString ( ) ;
var t = line . split ( "\t" , 12 ) ;
++ lineno ;
s _ref . length = s _qry . length = s _mid . length = 0 ;
var slen = [ 0 , 0 ] , elen = [ 0 , 0 ] ;
if ( fmt == "lastz-cigar" ) { // LASTZ-cigar output
var cg = ( m = /\tcg:Z:(\S+)/ . exec ( line ) ) != null ? m [ 1 ] : null ;
if ( cg == null ) {
warn ( "WARNING: converting to LASTZ-cigar format requires the 'cg' tag, which is absent on line " + lineno ) ;
continue ;
}
var score = ( m = /\tAS:i:(\d+)/ . exec ( line ) ) != null ? m [ 1 ] : 0 ;
var out = [ 'cigar:' , t [ 0 ] , t [ 2 ] , t [ 3 ] , t [ 4 ] , t [ 5 ] , t [ 7 ] , t [ 8 ] , '+' , score ] ;
while ( ( m = re _cg . exec ( cg ) ) != null )
out . push ( m [ 2 ] , m [ 1 ] ) ;
print ( out . join ( " " ) ) ;
} else if ( fmt == "maf" ) { // MAF output
var cs = ( m = /\tcs:Z:(\S+)/ . exec ( line ) ) != null ? m [ 1 ] : null ;
if ( cs == null ) {
warn ( "WARNING: converting to MAF requires the 'cs' tag, which is absent on line " + lineno ) ;
continue ;
}
while ( ( m = re _cs . exec ( cs ) ) != null ) {
if ( m [ 1 ] == ':' )
throw Error ( "converting to MAF only works with 'minimap2 --cs=long'" ) ;
update _aln ( s _ref , s _qry , s _mid , m [ 1 ] , m [ 2 ] , elen ) ;
}
var score = ( m = /\tAS:i:(\d+)/ . exec ( line ) ) != null ? parseInt ( m [ 1 ] ) : 0 ;
var len = t [ 0 ] . length > t [ 5 ] . length ? t [ 0 ] . length : t [ 5 ] . length ;
print ( "a " + score ) ;
print ( [ "s" , padding _str ( t [ 5 ] , len , true ) , padding _str ( t [ 7 ] , 10 , false ) , padding _str ( parseInt ( t [ 8 ] ) - parseInt ( t [ 7 ] ) , 10 , false ) ,
"+" , padding _str ( t [ 6 ] , 10 , false ) , s _ref . toString ( ) ] . join ( " " ) ) ;
var qs , qe , ql = parseInt ( t [ 1 ] ) ;
if ( t [ 4 ] == '+' ) {
qs = parseInt ( t [ 2 ] ) ;
qe = parseInt ( t [ 3 ] ) ;
} else {
qs = ql - parseInt ( t [ 3 ] ) ;
qe = ql - parseInt ( t [ 2 ] ) ;
}
print ( [ "s" , padding _str ( t [ 0 ] , len , true ) , padding _str ( qs , 10 , false ) , padding _str ( qe - qs , 10 , false ) ,
t [ 4 ] , padding _str ( ql , 10 , false ) , s _qry . toString ( ) ] . join ( " " ) ) ;
print ( "" ) ;
} else { // BLAST-like output
var cs = ( m = /\tcs:Z:(\S+)/ . exec ( line ) ) != null ? m [ 1 ] : null ;
if ( cs == null ) {
warn ( "WARNING: converting to BLAST-like alignment requires the 'cs' tag, which is absent on line " + lineno ) ;
continue ;
}
line = line . replace ( /\tc[sg]:Z:\S+/g , "" ) ; // get rid of cs or cg tags
print ( '>' + line ) ;
var rs = parseInt ( t [ 7 ] ) , qs = t [ 4 ] == '+' ? parseInt ( t [ 2 ] ) : parseInt ( t [ 3 ] ) ;
var n _blocks = 0 ;
while ( ( m = re _cs . exec ( cs ) ) != null ) {
if ( m [ 1 ] == ':' ) m [ 2 ] = Array ( parseInt ( m [ 2 ] ) + 1 ) . join ( "=" ) ;
var start = 0 , rest = m [ 1 ] == '*' ? 1 : m [ 2 ] . length ;
while ( rest > 0 ) {
var l _proc ;
if ( s _ref . length + rest >= line _len ) {
l _proc = line _len - s _ref . length ;
update _aln ( s _ref , s _qry , s _mid , m [ 1 ] , m [ 1 ] == '*' ? m [ 2 ] : m [ 2 ] . substr ( start , l _proc ) , elen ) ;
if ( n _blocks > 0 ) print ( "" ) ;
print _aln ( rs , qs , t [ 4 ] , slen , elen , s _ref , s _qry , s _mid ) ;
++ n _blocks ;
s _ref . length = s _qry . length = s _mid . length = 0 ;
slen [ 0 ] = elen [ 0 ] , slen [ 1 ] = elen [ 1 ] ;
} else {
l _proc = rest ;
update _aln ( s _ref , s _qry , s _mid , m [ 1 ] , m [ 1 ] == '*' ? m [ 2 ] : m [ 2 ] . substr ( start , l _proc ) , elen ) ;
}
rest -= l _proc , start += l _proc ;
}
}
if ( s _ref . length > 0 ) {
if ( n _blocks > 0 ) print ( "" ) ;
print _aln ( rs , qs , t [ 4 ] , slen , elen , s _ref , s _qry , s _mid ) ;
++ n _blocks ;
}
print ( "//" ) ;
}
}
file . close ( ) ;
buf . destroy ( ) ;
s _ref . destroy ( ) ; s _qry . destroy ( ) ; s _mid . destroy ( ) ;
}
function paf _sam2paf ( args )
{
var c , pri _only = false ;
while ( ( c = getopt ( args , "p" ) ) != null )
if ( c == 'p' ) pri _only = true ;
var file = args . length == getopt . ind || args [ getopt . ind ] == "-" ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var buf = new Bytes ( ) ;
var re = /(\d+)([MIDSHNX=])/g ;
var len = { } , lineno = 0 ;
while ( file . readline ( buf ) >= 0 ) {
var m , n _cigar = 0 , line = buf . toString ( ) ;
++ lineno ;
if ( line . charAt ( 0 ) == '@' ) {
if ( /^@SQ/ . test ( line ) ) {
var name = ( m = /\tSN:(\S+)/ . exec ( line ) ) != null ? m [ 1 ] : null ;
var l = ( m = /\tLN:(\d+)/ . exec ( line ) ) != null ? parseInt ( m [ 1 ] ) : null ;
if ( name != null && l != null ) len [ name ] = l ;
}
continue ;
}
var t = line . split ( "\t" ) ;
var flag = parseInt ( t [ 1 ] ) ;
if ( t [ 9 ] != '*' && t [ 10 ] != '*' && t [ 9 ] . length != t [ 10 ] . length ) throw Error ( "ERROR at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t [ 9 ] . length + " != " + t [ 10 ] . length ) ;
if ( t [ 2 ] == '*' || ( flag & 4 ) ) continue ;
if ( pri _only && ( flag & 0x100 ) ) continue ;
var tlen = len [ t [ 2 ] ] ;
if ( tlen == null ) throw Error ( "ERROR at line " + lineno + ": can't find the length of contig " + t [ 2 ] ) ;
var nn = ( m = /\tnn:i:(\d+)/ . exec ( line ) ) != null ? parseInt ( m [ 1 ] ) : 0 ;
var NM = ( m = /\tNM:i:(\d+)/ . exec ( line ) ) != null ? parseInt ( m [ 1 ] ) : null ;
var have _NM = NM == null ? false : true ;
NM += nn ;
var clip = [ 0 , 0 ] , I = [ 0 , 0 ] , D = [ 0 , 0 ] , M = 0 , N = 0 , ql = 0 , tl = 0 , mm = 0 , ext _cigar = false ;
while ( ( m = re . exec ( t [ 5 ] ) ) != null ) {
var l = parseInt ( m [ 1 ] ) ;
if ( m [ 2 ] == 'M' ) M += l , ql += l , tl += l , ext _cigar = false ;
else if ( m [ 2 ] == 'I' ) ++ I [ 0 ] , I [ 1 ] += l , ql += l ;
else if ( m [ 2 ] == 'D' ) ++ D [ 0 ] , D [ 1 ] += l , tl += l ;
else if ( m [ 2 ] == 'N' ) N += l , tl += l ;
else if ( m [ 2 ] == 'S' ) clip [ M == 0 ? 0 : 1 ] = l , ql += l ;
else if ( m [ 2 ] == 'H' ) clip [ M == 0 ? 0 : 1 ] = l ;
else if ( m [ 2 ] == '=' ) M += l , ql += l , tl += l , ext _cigar = true ;
else if ( m [ 2 ] == 'X' ) M += l , ql += l , tl += l , mm += l , ext _cigar = true ;
++ n _cigar ;
}
if ( n _cigar > 65535 )
warn ( "WARNING at line " + lineno + ": " + n _cigar + " CIGAR operations" ) ;
if ( tl + parseInt ( t [ 3 ] ) - 1 > tlen ) {
warn ( "WARNING at line " + lineno + ": alignment end position larger than ref length; skipped" ) ;
continue ;
}
if ( t [ 9 ] != '*' && t [ 9 ] . length != ql ) {
warn ( "WARNING at line " + lineno + ": SEQ length inconsistent with CIGAR (" + t [ 9 ] . length + " != " + ql + "); skipped" ) ;
continue ;
}
if ( ! have _NM || ext _cigar ) NM = I [ 1 ] + D [ 1 ] + mm ;
if ( NM < I [ 1 ] + D [ 1 ] + mm ) {
warn ( "WARNING at line " + lineno + ": NM is less than the total number of gaps (" + NM + " < " + ( I [ 1 ] + D [ 1 ] + mm ) + ")" ) ;
NM = I [ 1 ] + D [ 1 ] + mm ;
}
var extra = [ "mm:i:" + ( NM - I [ 1 ] - D [ 1 ] ) , "io:i:" + I [ 0 ] , "in:i:" + I [ 1 ] , "do:i:" + D [ 0 ] , "dn:i:" + D [ 1 ] ] ;
var match = M - ( NM - I [ 1 ] - D [ 1 ] ) ;
var blen = M + I [ 1 ] + D [ 1 ] ;
var qlen = M + I [ 1 ] + clip [ 0 ] + clip [ 1 ] ;
var qs , qe ;
if ( flag & 16 ) qs = clip [ 1 ] , qe = qlen - clip [ 0 ] ;
else qs = clip [ 0 ] , qe = qlen - clip [ 1 ] ;
var ts = parseInt ( t [ 3 ] ) - 1 , te = ts + M + D [ 1 ] + N ;
var qname = t [ 0 ] ;
if ( ( flag & 1 ) && ( flag & 0x40 ) ) qname += '/1' ;
if ( ( flag & 1 ) && ( flag & 0x80 ) ) qname += '/2' ;
var a = [ qname , qlen , qs , qe , flag & 16 ? '-' : '+' , t [ 2 ] , tlen , ts , te , match , blen , t [ 4 ] ] ;
print ( a . join ( "\t" ) , extra . join ( "\t" ) ) ;
}
buf . destroy ( ) ;
file . close ( ) ;
}
function paf _splice2bed ( args )
{
var colors = [ "0,128,255" , "255,0,0" , "0,192,0" ] ;
function print _lines ( a , fmt )
{
if ( a . length == 0 ) return ;
if ( fmt == "bed" ) {
var n _pri = 0 ;
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ i ] [ 8 ] == 0 ) ++ n _pri ;
if ( n _pri > 1 ) {
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ i ] [ 8 ] == 0 ) a [ i ] [ 8 ] = 1 ;
} else if ( n _pri == 0 ) {
warn ( "Warning: " + a [ 0 ] [ 3 ] + " doesn't have a primary alignment" ) ;
}
for ( var i = 0 ; i < a . length ; ++ i ) {
a [ i ] [ 8 ] = colors [ a [ i ] [ 8 ] ] ;
print ( a [ i ] . join ( "\t" ) ) ;
}
}
a . length = 0 ;
}
var re = /(\d+)([MIDNSH])/g ;
var c , fmt = "bed" , fn _name _conv = null ;
while ( ( c = getopt ( args , "f:n:" ) ) != null ) {
if ( c == 'f' ) fmt = getopt . arg ;
else if ( c == 'n' ) fn _name _conv = getopt . arg ;
}
if ( getopt . ind == args . length ) {
warn ( "Usage: paftools.js splice2bed <in.paf>|<in.sam>" ) ;
exit ( 1 ) ;
}
var conv = null ;
if ( fn _name _conv != null ) {
conv = new Map ( ) ;
var file = new File ( fn _name _conv ) ;
var buf = new Bytes ( ) ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
conv . put ( t [ 0 ] , t [ 1 ] ) ;
}
buf . destroy ( ) ;
file . close ( ) ;
}
var file = args [ getopt . ind ] == '-' ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var buf = new Bytes ( ) ;
var a = [ ] ;
while ( file . readline ( buf ) >= 0 ) {
var line = buf . toString ( ) ;
if ( line . charAt ( 0 ) == '@' ) continue ; // skip SAM header lines
var t = line . split ( "\t" ) ;
var is _pri = false , cigar = null , a1 ;
var qname = conv != null ? conv . get ( t [ 0 ] ) : null ;
if ( qname != null ) t [ 0 ] = qname ;
if ( t . length >= 10 && t [ 4 ] != '+' && t [ 4 ] != '-' && /^\d+/ . test ( t [ 1 ] ) ) { // SAM
var flag = parseInt ( t [ 1 ] ) ;
if ( flag & 1 ) t [ 0 ] += '/' + ( flag >> 6 & 3 ) ;
}
if ( a . length && a [ 0 ] [ 3 ] != t [ 0 ] ) {
print _lines ( a , fmt ) ;
a = [ ] ;
}
if ( t . length >= 12 && ( t [ 4 ] == '+' || t [ 4 ] == '-' ) ) { // PAF
for ( var i = 12 ; i < t . length ; ++ i ) {
if ( t [ i ] . substr ( 0 , 5 ) == 'cg:Z:' ) {
cigar = t [ i ] . substr ( 5 ) ;
} else if ( t [ i ] . substr ( 0 , 5 ) == 's2:i:' ) {
is _pri = true ;
}
}
a1 = [ t [ 5 ] , t [ 7 ] , t [ 8 ] , t [ 0 ] , Math . floor ( t [ 9 ] / t [ 10 ] * 1000 ) , t [ 4 ] ] ;
} else if ( t . length >= 10 ) { // SAM
var flag = parseInt ( t [ 1 ] ) ;
if ( ( flag & 4 ) || a [ 2 ] == '*' ) continue ;
cigar = t [ 5 ] ;
is _pri = ( flag & 0x100 ) ? false : true ;
a1 = [ t [ 2 ] , parseInt ( t [ 3 ] ) - 1 , null , t [ 0 ] , 1000 , ( flag & 16 ) ? '-' : '+' ] ;
} else {
throw Error ( "unrecognized input format" ) ;
}
if ( cigar == null ) throw Error ( "missing CIGAR" ) ;
var m , x0 = 0 , x = 0 , bs = [ ] , bl = [ ] ;
while ( ( m = re . exec ( cigar ) ) != null ) {
if ( m [ 2 ] == 'M' || m [ 2 ] == 'D' ) {
x += parseInt ( m [ 1 ] ) ;
} else if ( m [ 2 ] == 'N' ) {
bs . push ( x0 ) ;
bl . push ( x - x0 ) ;
x += parseInt ( m [ 1 ] ) ;
x0 = x ;
}
}
bs . push ( x0 ) ;
bl . push ( x - x0 ) ;
// write the BED12 line
if ( a1 [ 2 ] == null ) a1 [ 2 ] = a1 [ 1 ] + x ;
a1 . push ( a1 [ 1 ] , a1 [ 2 ] ) ; // thick start/end is the same as start/end
a1 . push ( is _pri ? 0 : 2 , bs . length , bl . join ( "," ) + "," , bs . join ( "," ) + "," ) ;
a . push ( a1 ) ;
}
print _lines ( a , fmt ) ;
buf . destroy ( ) ;
file . close ( ) ;
if ( conv != null ) conv . destroy ( ) ;
}
2018-02-09 23:00:35 +08:00
/ * * * * * * * * * * * * * * * * * * * * * * * * * *
* * * Evaluation related * * *
* * * * * * * * * * * * * * * * * * * * * * * * * * /
// evaluate mapping accuracy
function paf _mapeval ( args )
{
var c , max _mapq = 60 , mode = 0 , err _out _q = 256 , print _err = false , ovlp _ratio = 0.1 , cap _short _mapq = false ;
while ( ( c = getopt ( args , "Q:r:m:c" ) ) != null ) {
if ( c == 'Q' ) err _out _q = parseInt ( getopt . arg ) , print _err = true ;
else if ( c == 'r' ) ovlp _ratio = parseFloat ( getopt . arg ) ;
else if ( c == 'm' ) mode = parseInt ( getopt . arg ) ;
else if ( c == 'c' ) cap _short _mapq = true ;
}
if ( args . length == getopt . ind ) {
warn ( "Usage: paftools.js mapeval [options] <in.paf>|<in.sam>" ) ;
warn ( "Options:" ) ;
warn ( " -r FLOAT mapping correct if overlap_length/union_length>FLOAT [" + ovlp _ratio + "]" ) ;
warn ( " -Q INT print wrong mappings with mapQ>INT [don't print]" ) ;
warn ( " -m INT 0: eval the longest aln only; 1: first aln only; 2: all primary aln [0]" ) ;
exit ( 1 ) ;
}
var file = args [ getopt . ind ] == '-' ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var buf = new Bytes ( ) ;
var tot = [ ] , err = [ ] ;
for ( var q = 0 ; q <= max _mapq ; ++ q )
tot [ q ] = err [ q ] = 0 ;
function is _correct ( s , b )
{
if ( s [ 0 ] != b [ 0 ] || s [ 3 ] != b [ 3 ] ) return false ;
var o , l ;
if ( s [ 1 ] < b [ 1 ] ) {
if ( s [ 2 ] <= b [ 1 ] ) return false ;
o = ( s [ 2 ] < b [ 2 ] ? s [ 2 ] : b [ 2 ] ) - b [ 1 ] ;
l = ( s [ 2 ] > b [ 2 ] ? s [ 2 ] : b [ 2 ] ) - s [ 1 ] ;
} else {
if ( b [ 2 ] <= s [ 1 ] ) return false ;
o = ( s [ 2 ] < b [ 2 ] ? s [ 2 ] : b [ 2 ] ) - s [ 1 ] ;
l = ( s [ 2 ] > b [ 2 ] ? s [ 2 ] : b [ 2 ] ) - b [ 1 ] ;
}
return o / l > ovlp _ratio ? true : false ;
}
function count _err ( qname , a , tot , err , mode )
{
if ( a . length == 0 ) return ;
var m , s ;
if ( ( m = /^(\S+)!(\S+)!(\d+)!(\d+)!([\+\-])$/ . exec ( qname ) ) != null ) { // pbsim single-end reads
s = [ m [ 1 ] , m [ 2 ] , parseInt ( m [ 3 ] ) , parseInt ( m [ 4 ] ) , m [ 5 ] ] ;
} else if ( ( m = /^(\S+)!(\S+)!(\d+)_(\d+)!(\d+)_(\d+)!([\+\-])([\+\-])\/([12])$/ . exec ( qname ) ) != null ) { // mason2 paired-end reads
if ( m [ 9 ] == '1' ) {
s = [ m [ 1 ] , m [ 2 ] , parseInt ( m [ 3 ] ) , parseInt ( m [ 5 ] ) , m [ 7 ] ] ;
} else {
s = [ m [ 1 ] , m [ 2 ] , parseInt ( m [ 4 ] ) , parseInt ( m [ 6 ] ) , m [ 8 ] ] ;
}
} else throw Error ( "Failed to parse simulated read names '" + qname + "'" ) ;
s . shift ( ) ; // skip the orginal read name
if ( mode == 0 || mode == 1 ) { // longest only or first only
var max _i = 0 ;
if ( mode == 0 ) { // longest only
var max = 0 ;
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ i ] [ 5 ] > max )
max = a [ i ] [ 5 ] , max _i = i ;
}
var mapq = a [ max _i ] [ 4 ] ;
++ tot [ mapq ] ;
if ( ! is _correct ( s , a [ max _i ] ) ) {
if ( mapq >= err _out _q )
print ( 'E' , qname , a [ max _i ] . join ( "\t" ) ) ;
++ err [ mapq ] ;
}
} else if ( mode == 2 ) { // all primary mode
var max _err _mapq = - 1 , max _mapq = 0 , max _err _i = - 1 ;
if ( cap _short _mapq ) {
var max = 0 , max _q = 0 ;
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ i ] [ 5 ] > max )
max = a [ i ] [ 5 ] , max _q = a [ i ] [ 4 ] ;
for ( var i = 0 ; i < a . length ; ++ i )
a [ i ] [ 4 ] = max _q < a [ i ] [ 4 ] ? max _q : a [ i ] [ 4 ] ;
}
for ( var i = 0 ; i < a . length ; ++ i ) {
max _mapq = max _mapq > a [ i ] [ 4 ] ? max _mapq : a [ i ] [ 4 ] ;
if ( ! is _correct ( s , a [ i ] ) )
if ( a [ i ] [ 4 ] > max _err _mapq )
max _err _mapq = a [ i ] [ 4 ] , max _err _i = i ;
}
if ( max _err _mapq >= 0 ) {
++ tot [ max _err _mapq ] , ++ err [ max _err _mapq ] ;
if ( max _err _mapq >= err _out _q )
print ( 'E' , qname , a [ max _err _i ] . join ( "\t" ) ) ;
} else ++ tot [ max _mapq ] ;
}
}
var lineno = 0 , last = null , a = [ ] , n _unmapped = null ;
var re _cigar = /(\d+)([MIDSHN])/g ;
while ( file . readline ( buf ) >= 0 ) {
var m , line = buf . toString ( ) ;
++ lineno ;
if ( line [ 0 ] != '@' ) {
var t = line . split ( "\t" ) ;
if ( t [ 4 ] == '+' || t [ 4 ] == '-' ) { // PAF
if ( last != t [ 0 ] ) {
if ( last != null ) count _err ( last , a , tot , err , mode ) ;
a = [ ] , last = t [ 0 ] ;
}
if ( /\ts1:i:\d+/ . test ( line ) && ! /\ts2:i:\d+/ . test ( line ) ) // secondary alignment in minimap2 PAF
continue ;
var mapq = parseInt ( t [ 11 ] ) ;
if ( mapq > max _mapq ) mapq = max _mapq ;
a . push ( [ t [ 5 ] , parseInt ( t [ 7 ] ) , parseInt ( t [ 8 ] ) , t [ 4 ] , mapq , parseInt ( t [ 9 ] ) ] ) ;
} else { // SAM
var flag = parseInt ( t [ 1 ] ) ;
var read _no = flag >> 6 & 0x3 ;
var qname = t [ 0 ] ;
if ( ! /\/[12]$/ . test ( qname ) )
qname = read _no == 1 || read _no == 2 ? t [ 0 ] + '/' + read _no : t [ 0 ] ;
if ( last != qname ) {
if ( last != null ) count _err ( last , a , tot , err , mode ) ;
a = [ ] , last = qname ;
}
if ( flag & 0x100 ) continue ; // secondary alignment
if ( ( flag & 0x4 ) || t [ 2 ] == '*' ) { // unmapped
if ( n _unmapped == null ) n _unmapped = 0 ;
++ n _unmapped ;
continue ;
}
var mapq = parseInt ( t [ 4 ] ) ;
if ( mapq > max _mapq ) mapq = max _mapq ;
var pos = parseInt ( t [ 3 ] ) - 1 , pos _end = pos ;
var n _gap = 0 , mlen = 0 ;
while ( ( m = re _cigar . exec ( t [ 5 ] ) ) != null ) {
var len = parseInt ( m [ 1 ] ) ;
if ( m [ 2 ] == 'M' ) pos _end += len , mlen += len ;
else if ( m [ 2 ] == 'I' ) n _gap += len ;
else if ( m [ 2 ] == 'D' ) n _gap += len , pos _end += len ;
}
var score = pos _end - pos ;
if ( ( m = /\tNM:i:(\d+)/ . exec ( line ) ) != null ) {
var NM = parseInt ( m [ 1 ] ) ;
if ( NM >= n _gap ) score = mlen - ( NM - n _gap ) ;
}
a . push ( [ t [ 2 ] , pos , pos _end , ( flag & 16 ) ? '-' : '+' , mapq , score ] ) ;
}
}
}
if ( last != null ) count _err ( last , a , tot , err , mode ) ;
buf . destroy ( ) ;
file . close ( ) ;
var sum _tot = 0 , sum _err = 0 , q _out = - 1 , sum _tot2 = 0 , sum _err2 = 0 ;
for ( var q = max _mapq ; q >= 0 ; -- q ) {
if ( tot [ q ] == 0 ) continue ;
if ( q _out < 0 || err [ q ] > 0 ) {
if ( q _out >= 0 ) print ( 'Q' , q _out , sum _tot , sum _err , ( sum _err2 / sum _tot2 ) . toFixed ( 9 ) , sum _tot2 ) ;
sum _tot = sum _err = 0 , q _out = q ;
}
sum _tot += tot [ q ] , sum _err += err [ q ] ;
sum _tot2 += tot [ q ] , sum _err2 += err [ q ] ;
}
print ( 'Q' , q _out , sum _tot , sum _err , ( sum _err2 / sum _tot2 ) . toFixed ( 9 ) , sum _tot2 ) ;
if ( n _unmapped != null ) print ( 'U' , n _unmapped ) ;
}
// convert mason2 SAM to FASTQ
function paf _mason2fq ( args )
{
if ( args . length == 0 ) {
print ( "Usage: paftools.js mason2fq <mason.sam>" ) ;
exit ( 1 ) ;
}
function print _se ( a )
{
print ( '@' + a . slice ( 0 , 5 ) . join ( "!" ) + " " + a [ 8 ] ) ;
print ( a [ 5 ] ) ;
print ( "+" ) ;
print ( a [ 6 ] ) ;
}
var buf = new Bytes ( ) , buf2 = new Bytes ( ) ;
var file = new File ( args [ 0 ] ) ;
var re = /(\d+)([MIDSHN])/g ;
var last = null ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
if ( t [ 0 ] . charAt ( 0 ) == '@' ) continue ;
var m , l _ref = 0 ;
while ( ( m = re . exec ( t [ 5 ] ) ) != null )
if ( m [ 2 ] == 'D' || m [ 2 ] == 'M' || m [ 2 ] == 'N' )
l _ref += parseInt ( m [ 1 ] ) ;
var flag = parseInt ( t [ 1 ] ) ;
var rev = ! ! ( flag & 16 ) ;
var seq , qual ;
if ( rev ) {
buf2 . length = 0 ;
buf2 . set ( t [ 9 ] , 0 ) ;
buf2 . revcomp ( ) ;
seq = buf2 . toString ( ) ;
buf2 . set ( t [ 10 ] , 0 ) ;
buf2 . reverse ( ) ;
qual = buf2 . toString ( ) ;
} else seq = t [ 9 ] , qual = t [ 10 ] ;
var qname = t [ 0 ] ;
qname = qname . replace ( /^simulated./ , "" ) ;
var chr = t [ 2 ] ;
var pos = parseInt ( t [ 3 ] ) - 1 ;
var strand = ( flag & 16 ) ? '-' : '+' ;
var read _no = flag & 0xc0 ;
if ( read _no == 0x40 ) read _no = 1 ;
else if ( read _no == 0x80 ) read _no = 2 ;
else read _no = 0 ;
var err = 0 , snp = 0 , indel = 0 ;
for ( var i = 11 ; i < t . length ; ++ i ) {
if ( ( m = /^XE:i:(\d+)/ . exec ( t [ i ] ) ) != null ) err = m [ 1 ] ;
else if ( ( m = /^XS:i:(\d+)/ . exec ( t [ i ] ) ) != null ) snp = m [ 1 ] ;
else if ( ( m = /^XI:i:(\d+)/ . exec ( t [ i ] ) ) != null ) indel = m [ 1 ] ;
}
var comment = [ err , snp , indel ] . join ( ":" ) ;
if ( last == null ) {
last = [ qname , chr , pos , pos + l _ref , strand , seq , qual , read _no , comment ] ;
} else if ( last [ 0 ] != qname ) {
print _se ( last ) ;
last = [ qname , chr , pos , pos + l _ref , strand , seq , qual , read _no , comment ] ;
} else {
if ( read _no == 2 ) { // last[] is the first read
if ( last [ 7 ] != 1 ) throw Error ( "ERROR: can't find read1" ) ;
var name = [ qname , chr , last [ 2 ] + "_" + pos , last [ 3 ] + "_" + ( pos + l _ref ) , last [ 4 ] + strand ] . join ( "!" ) ;
print ( '@' + name + '/1' + ' ' + last [ 8 ] ) ; print ( last [ 5 ] ) ; print ( "+" ) ; print ( last [ 6 ] ) ;
print ( '@' + name + '/2' + ' ' + comment ) ; print ( seq ) ; print ( "+" ) ; print ( qual ) ;
} else {
if ( last [ 7 ] != 2 ) throw Error ( "ERROR: can't find read2" ) ;
var name = [ qname , chr , pos + "_" + last [ 2 ] , ( pos + l _ref ) + "_" + last [ 3 ] , strand + last [ 4 ] ] . join ( "!" ) ;
print ( '@' + name + '/1' + ' ' + comment ) ; print ( seq ) ; print ( "+" ) ; print ( qual ) ;
print ( '@' + name + '/2' + ' ' + last [ 8 ] ) ; print ( last [ 5 ] ) ; print ( "+" ) ; print ( last [ 6 ] ) ;
}
last = null ;
}
}
if ( last != null ) print _se ( last ) ;
file . close ( ) ;
buf . destroy ( ) ;
buf2 . destroy ( ) ;
}
// convert pbsim MAF to FASTQ
function paf _pbsim2fq ( args )
{
if ( args . length < 2 ) {
print ( "Usage: paftools.js pbsim2fq <ref.fa.fai> <pbsim1.maf> [[pbsim2.maf] ...]" ) ;
exit ( 1 ) ;
}
var file , buf = new Bytes ( ) , buf2 = new Bytes ( ) ;
file = new File ( args [ 0 ] ) ;
var chr _list = [ ] ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( /\s+/ ) ;
chr _list . push ( t [ 0 ] ) ;
}
file . close ( ) ;
for ( var k = 1 ; k < args . length ; ++ k ) {
var fn = args [ k ] ;
file = new File ( fn ) ;
var state = 0 , reg ;
while ( file . readline ( buf ) >= 0 ) {
var line = buf . toString ( ) ;
if ( state == 0 && line . charAt ( 0 ) == 'a' ) {
state = 1 ;
} else if ( state == 1 && line . charAt ( 0 ) == 's' ) {
var t = line . split ( /\s+/ ) ;
var st = parseInt ( t [ 2 ] ) ;
reg = [ st , st + parseInt ( t [ 3 ] ) ] ;
state = 2 ;
} else if ( state == 2 && line . charAt ( 0 ) == 's' ) {
var m , t = line . split ( /\s+/ ) ;
if ( ( m = /S(\d+)_\d+/ . exec ( t [ 1 ] ) ) == null ) throw Error ( "Failed to parse the read name" ) ;
var chr _id = parseInt ( m [ 1 ] ) - 1 ;
if ( chr _id >= chr _list . length ) throw Error ( "Index outside the chr list" ) ;
var name = [ t [ 1 ] , chr _list [ chr _id ] , reg [ 0 ] , reg [ 1 ] , t [ 4 ] ] . join ( "!" ) ;
var seq = t [ 6 ] . replace ( /\-/g , "" ) ;
if ( seq . length != parseInt ( t [ 5 ] ) ) throw Error ( "Inconsistent read length" ) ;
if ( seq . indexOf ( "NN" ) < 0 ) {
if ( t [ 4 ] == '-' ) {
buf2 . set ( seq , 0 ) ;
buf2 . length = seq . length ;
buf2 . revcomp ( ) ;
seq = buf2 . toString ( ) ;
}
print ( ">" + name ) ;
print ( seq ) ;
}
state = 0 ;
}
}
file . close ( ) ;
}
buf . destroy ( ) ;
buf2 . destroy ( ) ;
}
2018-02-09 23:23:30 +08:00
// evaluate overlap sensitivity
2018-02-09 23:00:35 +08:00
function paf _ov _eval ( args )
{
var c , min _ovlp = 2000 , min _frac = 0.95 , min _mapq = 10 ;
while ( ( c = getopt ( args , "q:l:f:" ) ) != null ) {
if ( c == 'q' ) min _mapq = parseInt ( getopt . arg ) ;
else if ( c == 'l' ) min _ovlp = parseInt ( getopt . arg ) ;
else if ( c == 'f' ) min _frac = parseFloat ( getopt . arg ) ;
}
if ( args . length - getopt . ind < 2 ) {
print ( "Usage: sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval [options] - <ovlp.paf>" ) ;
print ( "Options:" ) ;
print ( " -l INT min overlap length [2000]" ) ;
print ( " -q INT min mapping quality [10]" ) ;
print ( " -f FLOAT min fraction of mapped length [0.95]" ) ;
exit ( 1 ) ;
}
var buf = new Bytes ( ) ;
var file = args [ getopt . ind ] == '-' ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var a = [ ] , h = { } ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
var is _pri = false ;
if ( parseInt ( t [ 11 ] ) < min _mapq ) continue ;
for ( var i = 12 ; i < t . length ; ++ i )
if ( t [ i ] == 'tp:A:P' )
is _pri = true ;
if ( ! is _pri ) continue ;
for ( var i = 1 ; i <= 3 ; ++ i )
t [ i ] = parseInt ( t [ i ] ) ;
for ( var i = 6 ; i <= 8 ; ++ i )
t [ i ] = parseInt ( t [ i ] ) ;
if ( t [ 3 ] - t [ 2 ] < min _ovlp || t [ 8 ] - t [ 7 ] < min _ovlp || ( t [ 3 ] - t [ 2 ] ) / t [ 1 ] < min _frac )
continue ;
var ctg = t [ 5 ] , st = t [ 7 ] , en = t [ 8 ] ;
while ( a . length > 0 ) {
if ( a [ 0 ] [ 0 ] == ctg && a [ 0 ] [ 2 ] > st )
break ;
else a . shift ( ) ;
}
for ( var j = 0 ; j < a . length ; ++ j ) {
if ( a [ j ] [ 3 ] == t [ 0 ] ) continue ;
var len = ( en > a [ j ] [ 2 ] ? a [ j ] [ 2 ] : en ) - st ;
if ( len >= min _ovlp ) {
var key = a [ j ] [ 3 ] < t [ 0 ] ? a [ j ] [ 3 ] + "\t" + t [ 0 ] : t [ 0 ] + "\t" + a [ j ] [ 3 ] ;
h [ key ] = len ;
}
}
a . push ( [ ctg , st , en , t [ 0 ] ] ) ;
}
file . close ( ) ;
file = new File ( args [ getopt . ind + 1 ] ) ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
var key = t [ 0 ] < t [ 5 ] ? t [ 0 ] + "\t" + t [ 5 ] : t [ 5 ] + "\t" + t [ 0 ] ;
if ( h [ key ] > 0 ) h [ key ] = - h [ key ] ;
}
file . close ( ) ;
buf . destroy ( ) ;
var n _ovlp = 0 , n _missing = 0 ;
for ( var key in h ) {
++ n _ovlp ;
if ( h [ key ] > 0 ) ++ n _missing ;
}
print ( n _ovlp + " overlaps inferred from the reference mapping" ) ;
print ( n _missing + " missed by the read overlapper" ) ;
print ( ( 100 * ( 1 - n _missing / n _ovlp ) ) . toFixed ( 2 ) + "% sensitivity" ) ;
}
/ * * * * * * * * * * * * * * * * * * * * * * * * *
* * * * * main function * * * * *
* * * * * * * * * * * * * * * * * * * * * * * * * /
function main ( args )
{
if ( args . length == 0 ) {
print ( "Usage: paftools.js <command> [arguments]" ) ;
print ( "Commands:" ) ;
2018-02-09 23:23:30 +08:00
print ( " view convert PAF to BLAST-like (for eyeballing) or MAF" ) ;
print ( " sam2paf convert SAM to PAF" ) ;
print ( " splice2bed convert spliced alignment in PAF/SAM to BED12" ) ;
print ( "" ) ;
2018-02-09 23:00:35 +08:00
print ( " call call variants from asm-to-ref alignment with the cs tag" ) ;
print ( "" ) ;
print ( " mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ" ) ;
print ( " mason2fq convert mason2-simulated SAM to FASTQ" ) ;
print ( " pbsim2fq convert PBSIM-simulated MAF to FASTQ" ) ;
print ( " ov-eval evaluate read overlap sensitivity using read-to-ref mapping" ) ;
exit ( 1 ) ;
}
var cmd = args . shift ( ) ;
2018-02-09 23:23:30 +08:00
if ( cmd == 'view' ) paf _view ( args ) ;
else if ( cmd == 'sam2paf' ) paf _sam2paf ( args ) ;
else if ( cmd == 'splice2bed' ) paf _splice2bed ( args ) ;
else if ( cmd == 'call' ) paf _call ( args ) ;
2018-02-09 23:00:35 +08:00
else if ( cmd == 'mapeval' ) paf _mapeval ( args ) ;
else if ( cmd == 'mason2fq' ) paf _mason2fq ( args ) ;
else if ( cmd == 'pbsim2fq' ) paf _pbsim2fq ( args ) ;
else if ( cmd == 'ov-eval' ) paf _ov _eval ( args ) ;
else throw Error ( "unrecognized command: " + cmd ) ;
}
main ( arguments ) ;