2018-02-09 23:00:35 +08:00
#!/usr/bin/env k8
/ * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* * * * * Library functions * * * * *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * /
/ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Command line option parsing *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * /
var getopt = function ( args , ostr ) {
var oli ; // option letter list index
if ( typeof ( getopt . place ) == 'undefined' )
getopt . ind = 0 , getopt . arg = null , getopt . place = - 1 ;
if ( getopt . place == - 1 ) { // update scanning pointer
if ( getopt . ind >= args . length || args [ getopt . ind ] . charAt ( getopt . place = 0 ) != '-' ) {
getopt . place = - 1 ;
return null ;
}
if ( getopt . place + 1 < args [ getopt . ind ] . length && args [ getopt . ind ] . charAt ( ++ getopt . place ) == '-' ) { // found "--"
++ getopt . ind ;
getopt . place = - 1 ;
return null ;
}
}
var optopt = args [ getopt . ind ] . charAt ( getopt . place ++ ) ; // character checked for validity
if ( optopt == ':' || ( oli = ostr . indexOf ( optopt ) ) < 0 ) {
if ( optopt == '-' ) return null ; // if the user didn't specify '-' as an option, assume it means null.
if ( getopt . place < 0 ) ++ getopt . ind ;
return '?' ;
}
if ( oli + 1 >= ostr . length || ostr . charAt ( ++ oli ) != ':' ) { // don't need argument
getopt . arg = null ;
if ( getopt . place < 0 || getopt . place >= args [ getopt . ind ] . length ) ++ getopt . ind , getopt . place = - 1 ;
} else { // need an argument
if ( getopt . place >= 0 && getopt . place < args [ getopt . ind ] . length )
getopt . arg = args [ getopt . ind ] . substr ( getopt . place ) ;
else if ( args . length <= ++ getopt . ind ) { // no arg
getopt . place = - 1 ;
if ( ostr . length > 0 && ostr . charAt ( 0 ) == ':' ) return ':' ;
return '?' ;
} else getopt . arg = args [ getopt . ind ] ; // white space
getopt . place = - 1 ;
++ getopt . ind ;
}
return optopt ;
}
/ * * * * * * * * * * * * * * * * * * * * * * *
* Interval operations *
* * * * * * * * * * * * * * * * * * * * * * * /
Interval = { } ;
Interval . sort = function ( a )
{
if ( typeof a [ 0 ] == 'number' )
a . sort ( function ( x , y ) { return x - y } ) ;
else a . sort ( function ( x , y ) { return x [ 0 ] != y [ 0 ] ? x [ 0 ] - y [ 0 ] : x [ 1 ] - y [ 1 ] } ) ;
}
Interval . merge = function ( a , sorted )
{
if ( typeof sorted == 'undefined' ) sorted = true ;
if ( ! sorted ) Interval . sort ( a ) ;
var k = 0 ;
for ( var i = 1 ; i < a . length ; ++ i ) {
if ( a [ k ] [ 1 ] >= a [ i ] [ 0 ] )
a [ k ] [ 1 ] = a [ k ] [ 1 ] > a [ i ] [ 1 ] ? a [ k ] [ 1 ] : a [ i ] [ 1 ] ;
else a [ ++ k ] = a [ i ] . slice ( 0 ) ;
}
a . length = k + 1 ;
}
Interval . index _end = function ( a , sorted )
{
if ( a . length == 0 ) return ;
if ( typeof sorted == 'undefined' ) sorted = true ;
if ( ! sorted ) Interval . sort ( a ) ;
a [ 0 ] . push ( 0 ) ;
var k = 0 , k _en = a [ 0 ] [ 1 ] ;
for ( var i = 1 ; i < a . length ; ++ i ) {
if ( k _en <= a [ i ] [ 0 ] ) {
for ( ++ k ; k < i ; ++ k )
if ( a [ k ] [ 1 ] > a [ i ] [ 0 ] )
break ;
k _en = a [ k ] [ 1 ] ;
}
a [ i ] . push ( k ) ;
}
}
Interval . find _intv = function ( a , x )
{
var left = - 1 , right = a . length ;
if ( typeof a [ 0 ] == 'number' ) {
while ( right - left > 1 ) {
var mid = left + ( ( right - left ) >> 1 ) ;
if ( a [ mid ] > x ) right = mid ;
else if ( a [ mid ] < x ) left = mid ;
else return mid ;
}
} else {
while ( right - left > 1 ) {
var mid = left + ( ( right - left ) >> 1 ) ;
if ( a [ mid ] [ 0 ] > x ) right = mid ;
else if ( a [ mid ] [ 0 ] < x ) left = mid ;
else return mid ;
}
}
return left ;
}
Interval . find _ovlp = function ( a , st , en )
{
if ( a . length == 0 || st >= en ) return [ ] ;
var l = Interval . find _intv ( a , st ) ;
var k = l < 0 ? 0 : a [ l ] [ a [ l ] . length - 1 ] ;
var b = [ ] ;
for ( var i = k ; i < a . length ; ++ i ) {
if ( a [ i ] [ 0 ] >= en ) break ;
else if ( st < a [ i ] [ 1 ] )
b . push ( a [ i ] ) ;
}
return b ;
}
/ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Reverse and reverse complement *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * /
Bytes . prototype . reverse = function ( )
{
for ( var i = 0 ; i < this . length >> 1 ; ++ i ) {
var tmp = this [ i ] ;
this [ i ] = this [ this . length - i - 1 ] ;
this [ this . length - i - 1 ] = tmp ;
}
}
// reverse complement a DNA string
Bytes . prototype . revcomp = function ( )
{
if ( Bytes . rctab == null ) {
var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn' ;
var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn' ;
Bytes . rctab = [ ] ;
for ( var i = 0 ; i < 256 ; ++ i ) Bytes . rctab [ i ] = 0 ;
for ( var i = 0 ; i < s1 . length ; ++ i )
Bytes . rctab [ s1 . charCodeAt ( i ) ] = s2 . charCodeAt ( i ) ;
}
for ( var i = 0 ; i < this . length >> 1 ; ++ i ) {
var tmp = this [ this . length - i - 1 ] ;
this [ this . length - i - 1 ] = Bytes . rctab [ this [ i ] ] ;
this [ i ] = Bytes . rctab [ tmp ] ;
}
if ( this . length & 1 )
this [ this . length >> 1 ] = Bytes . rctab [ this [ this . length >> 1 ] ] ;
}
/ * * * * * * * * * * * * * * * * * * * *
* * * * * paftools * * * * *
* * * * * * * * * * * * * * * * * * * * /
// variant calling
function paf _call ( args )
{
var re _cs = /([:=*+-])(\d+|[A-Za-z]+)/g ;
var c , min _cov _len = 10000 , min _var _len = 50000 , gap _thres = 50 , min _mapq = 5 ;
while ( ( c = getopt ( args , "l:L:g:q:B:" ) ) != null ) {
if ( c == 'l' ) min _cov _len = parseInt ( getopt . arg ) ;
else if ( c == 'L' ) min _var _len = parseInt ( getopt . arg ) ;
else if ( c == 'g' ) gap _thres = parseInt ( getopt . arg ) ;
else if ( c == 'q' ) min _mapq = parseInt ( getopt . arg ) ;
}
if ( args . length == getopt . ind ) {
print ( "Usage: sort -k6,6 -k8,8n <with-cs.paf> | paftools.js call [options] -" ) ;
print ( "Options:" ) ;
print ( " -l INT min alignment length to compute coverage [" + min _cov _len + "]" ) ;
print ( " -L INT min alignment length to call variants [" + min _var _len + "]" ) ;
print ( " -q INT min mapping quality [" + min _mapq + "]" ) ;
print ( " -g INT short/long gap threshold (for statistics only) [" + gap _thres + "]" ) ;
exit ( 1 ) ;
}
var file = args [ getopt . ind ] == '-' ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var buf = new Bytes ( ) ;
var tot _len = 0 , n _sub = [ 0 , 0 , 0 ] , n _ins = [ 0 , 0 , 0 , 0 ] , n _del = [ 0 , 0 , 0 , 0 ] ;
function count _var ( o )
{
if ( o [ 3 ] > 1 ) return ;
if ( o [ 5 ] == '-' && o [ 6 ] == '-' ) return ;
if ( o [ 5 ] == '-' ) { // insertion
var l = o [ 6 ] . length ;
if ( l == 1 ) ++ n _ins [ 0 ] ;
else if ( l == 2 ) ++ n _ins [ 1 ] ;
else if ( l < gap _thres ) ++ n _ins [ 2 ] ;
else ++ n _ins [ 3 ] ;
} else if ( o [ 6 ] == '-' ) { // deletion
var l = o [ 5 ] . length ;
if ( l == 1 ) ++ n _del [ 0 ] ;
else if ( l == 2 ) ++ n _del [ 1 ] ;
else if ( l < gap _thres ) ++ n _del [ 2 ] ;
else ++ n _del [ 3 ] ;
} else {
++ n _sub [ 0 ] ;
var s = o [ 5 ] + o [ 6 ] ;
if ( s == 'ag' || s == 'ga' || s == 'ct' || s == 'tc' )
++ n _sub [ 1 ] ;
else ++ n _sub [ 2 ] ;
}
}
var a = [ ] , out = [ ] ;
var c1 _ctg = null , c1 _start = 0 , c1 _end = 0 , c1 _counted = false , c1 _len = 0 ;
while ( file . readline ( buf ) >= 0 ) {
var line = buf . toString ( ) ;
if ( ! /\ts2:i:/ . test ( line ) ) continue ; // skip secondary alignments
var m , t = line . split ( "\t" , 12 ) ;
for ( var i = 6 ; i <= 11 ; ++ i )
t [ i ] = parseInt ( t [ i ] ) ;
if ( t [ 10 ] < min _cov _len || t [ 11 ] < min _mapq ) continue ;
for ( var i = 1 ; i <= 3 ; ++ i )
t [ i ] = parseInt ( t [ i ] ) ;
var ctg = t [ 5 ] , x = t [ 7 ] , end = t [ 8 ] ;
var query = t [ 0 ] , rev = ( t [ 4 ] == '-' ) , y = rev ? t [ 3 ] : t [ 2 ] ;
// compute regions covered by 1 contig
if ( ctg != c1 _ctg || x >= c1 _end ) {
if ( c1 _counted && c1 _end > c1 _start ) {
c1 _len += c1 _end - c1 _start ;
print ( 'R' , c1 _ctg , c1 _start , c1 _end ) ;
}
c1 _ctg = ctg , c1 _start = x , c1 _end = end ;
c1 _counted = ( t [ 10 ] >= min _var _len ) ;
} else if ( end > c1 _end ) { // overlap
if ( c1 _counted && x > c1 _start ) {
c1 _len += x - c1 _start ;
print ( 'R' , c1 _ctg , c1 _start , x ) ;
}
c1 _start = c1 _end , c1 _end = end ;
c1 _counted = ( t [ 10 ] >= min _var _len ) ;
} else { // contained
if ( c1 _counted && x > c1 _start ) {
c1 _len += x - c1 _start ;
print ( 'R' , c1 _ctg , c1 _start , x ) ;
}
c1 _start = end ;
}
// output variants ahead of this alignment
while ( out . length ) {
if ( out [ 0 ] [ 0 ] != ctg || out [ 0 ] [ 2 ] <= x ) {
count _var ( out [ 0 ] ) ;
print ( 'V' , out [ 0 ] . join ( "\t" ) ) ;
out . shift ( ) ;
} else break ;
}
// update coverage
for ( var i = 0 ; i < out . length ; ++ i )
if ( out [ i ] [ 1 ] >= x && out [ i ] [ 2 ] <= end )
++ out [ i ] [ 3 ] ;
// drop alignments that don't overlap with the current one
var k = 0 ;
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ 0 ] [ 0 ] == ctg && a [ 0 ] [ 2 ] > x )
a [ k ++ ] = a [ i ] ;
a . length = k ;
// core loop
if ( t [ 10 ] >= min _var _len ) {
if ( ( m = /\tcs:Z:(\S+)/ . exec ( line ) ) == null ) continue ; // no cs tag
var cs = m [ 1 ] ;
var blen = 0 , n _diff = 0 ;
tot _len += t [ 10 ] ;
while ( ( m = re _cs . exec ( cs ) ) != null ) {
var cov = 1 ;
if ( m [ 1 ] == '*' || m [ 1 ] == '+' || m [ 1 ] == '-' )
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ 0 ] [ 2 ] > x ) ++ cov ;
var qs , qe ;
if ( m [ 1 ] == '=' || m [ 1 ] == ':' ) {
var l = m [ 1 ] == '=' ? m [ 2 ] . length : parseInt ( m [ 2 ] ) ;
if ( rev ) y -= l ;
else y += l ;
x += l , blen += l ;
} else if ( m [ 1 ] == '*' ) {
if ( rev ) qs = y - 1 , qe = y , -- y ;
else qs = y , qe = y + 1 , ++ y ;
out . push ( [ t [ 5 ] , x , x + 1 , cov , t [ 11 ] , m [ 2 ] . charAt ( 0 ) , m [ 2 ] . charAt ( 1 ) , query , qs , qe , rev ? '-' : '+' ] ) ;
++ x , ++ blen , ++ n _diff ;
} else if ( m [ 1 ] == '+' ) {
var l = m [ 2 ] . length ;
if ( rev ) qs = y - l , qe = y , y -= l ;
else qs = y , qe = y + l , y += l ;
out . push ( [ t [ 5 ] , x , x , cov , t [ 11 ] , '-' , m [ 2 ] , query , qs , qe , rev ? '-' : '+' ] ) ;
++ blen , ++ n _diff ;
} else if ( m [ 1 ] == '-' ) {
var l = m [ 2 ] . length ;
out . push ( [ t [ 5 ] , x , x + l , cov , t [ 11 ] , m [ 2 ] , '-' , query , y , y , rev ? '-' : '+' ] ) ;
x += l , ++ blen , ++ n _diff ;
}
}
}
a . push ( [ t [ 5 ] , t [ 7 ] , t [ 8 ] ] ) ;
}
if ( c1 _counted && c1 _end > c1 _start ) {
c1 _len += c1 _end - c1 _start ;
print ( 'R' , c1 _ctg , c1 _start , c1 _end ) ;
}
while ( out . length ) {
count _var ( out [ 0 ] ) ;
print ( 'V' , out [ 0 ] . join ( "\t" ) ) ;
out . shift ( ) ;
}
//warn(tot_len + " alignment columns considered in calling");
warn ( c1 _len + " reference bases covered by exactly one contig" ) ;
warn ( n _sub [ 0 ] + " substitutions; ts/tv = " + ( n _sub [ 1 ] / n _sub [ 2 ] ) . toFixed ( 3 ) ) ;
warn ( n _del [ 0 ] + " 1bp deletions" ) ;
warn ( n _ins [ 0 ] + " 1bp insertions" ) ;
warn ( n _del [ 1 ] + " 2bp deletions" ) ;
warn ( n _ins [ 1 ] + " 2bp insertions" ) ;
warn ( n _del [ 2 ] + " [3," + gap _thres + ") deletions" ) ;
warn ( n _ins [ 2 ] + " [3," + gap _thres + ") insertions" ) ;
warn ( n _del [ 3 ] + " >=" + gap _thres + " deletions" ) ;
warn ( n _ins [ 3 ] + " >=" + gap _thres + " insertions" ) ;
buf . destroy ( ) ;
file . close ( ) ;
}
2018-02-09 23:47:12 +08:00
function paf _stat ( args )
{
var c , gap _out _len = null ;
while ( ( c = getopt ( args , "l:" ) ) != null )
if ( c == 'l' ) gap _out _len = parseInt ( getopt . arg ) ;
if ( getopt . ind == args . length ) {
print ( "Usage: paftools.js stat [-l gapOutLen] <in.sam>|<in.paf>" ) ;
exit ( 1 ) ;
}
var buf = new Bytes ( ) ;
var file = new File ( args [ getopt . ind ] ) ;
var re = /(\d+)([MIDSHNX=])/g ;
var lineno = 0 , n _pri = 0 , n _2nd = 0 , n _seq = 0 , n _cigar _64k = 0 , l _tot = 0 , l _cov = 0 ;
var n _gap = [ [ 0 , 0 , 0 , 0 , 0 , 0 ] , [ 0 , 0 , 0 , 0 , 0 , 0 ] ] ;
function cov _len ( regs )
{
regs . sort ( function ( a , b ) { return a [ 0 ] - b [ 0 ] } ) ;
var st = regs [ 0 ] [ 0 ] , en = regs [ 0 ] [ 1 ] , l = 0 ;
for ( var i = 1 ; i < regs . length ; ++ i ) {
if ( regs [ i ] [ 0 ] < en )
en = en > regs [ i ] [ 1 ] ? en : regs [ i ] [ 1 ] ;
else l += en - st , st = regs [ i ] [ 0 ] , en = regs [ i ] [ 1 ] ;
}
l += en - st ;
return l ;
}
var last = null , last _qlen = null , regs = [ ] ;
while ( file . readline ( buf ) >= 0 ) {
var line = buf . toString ( ) ;
++ lineno ;
if ( line . charAt ( 0 ) != '@' ) {
var t = line . split ( "\t" , 12 ) ;
var m , rs , cigar = null , is _pri = false , is _sam = false , is _rev = false , tname = null ;
var atlen = null , aqlen , qs , qe , mapq , ori _qlen ;
if ( t [ 4 ] == '+' || t [ 4 ] == '-' ) { // PAF
if ( ! /\ts2:i:\d+/ . test ( line ) ) {
++ n _2nd ;
continue ;
}
if ( ( m = /\tcg:Z:(\S+)/ . exec ( line ) ) != null )
cigar = m [ 1 ] ;
if ( cigar == null ) {
warn ( "WARNING: no CIGAR at line " + lineno ) ;
continue ;
}
tname = t [ 5 ] ;
qs = parseInt ( t [ 2 ] ) , qe = parseInt ( t [ 3 ] ) ;
aqlen = qe - qs ;
is _rev = t [ 4 ] == '+' ? false : true ;
rs = parseInt ( t [ 7 ] ) ;
atlen = parseInt ( t [ 8 ] ) - rs ;
mapq = parseInt ( t [ 11 ] ) ;
ori _qlen = parseInt ( t [ 1 ] ) ;
} else { // SAM
var flag = parseInt ( t [ 1 ] ) ;
if ( ( flag & 4 ) || t [ 2 ] == '*' || t [ 5 ] == '*' ) continue ;
if ( flag & 0x100 ) {
++ n _2nd ;
continue ;
}
cigar = t [ 5 ] ;
tname = t [ 2 ] ;
rs = parseInt ( t [ 3 ] ) - 1 ;
mapq = parseInt ( t [ 4 ] ) ;
aqlen = t [ 9 ] . length ;
is _sam = true ;
is _rev = ! ! ( flag & 0x10 ) ;
}
++ n _pri ;
if ( last != t [ 0 ] ) {
if ( last != null ) {
l _tot += last _qlen ;
l _cov += cov _len ( regs ) ;
}
regs = [ ] ;
++ n _seq , last = t [ 0 ] ;
}
var M = 0 , tl = 0 , ql = 0 , clip = [ 0 , 0 ] , n _cigar = 0 , sclip = 0 ;
while ( ( m = re . exec ( cigar ) ) != null ) {
var l = parseInt ( m [ 1 ] ) ;
++ n _cigar ;
if ( m [ 2 ] == 'M' || m [ 2 ] == '=' || m [ 2 ] == 'X' ) {
tl += l , ql += l , M += l ;
} else if ( m [ 2 ] == 'I' || m [ 2 ] == 'D' ) {
var type ;
if ( l < 50 ) type = 0 ;
else if ( l < 100 ) type = 1 ;
else if ( l < 300 ) type = 2 ;
else if ( l < 400 ) type = 3 ;
else if ( l < 1000 ) type = 4 ;
else type = 5 ;
if ( m [ 2 ] == 'I' ) ql += l , ++ n _gap [ 0 ] [ type ] ;
else tl += l , ++ n _gap [ 1 ] [ type ] ;
if ( gap _out _len != null && l >= gap _out _len )
print ( t [ 0 ] , ql , is _rev ? '-' : '+' , tname , rs + tl , m [ 2 ] , l ) ;
} else if ( m [ 2 ] == 'N' ) {
tl += l ;
} else if ( m [ 2 ] == 'S' ) {
clip [ M == 0 ? 0 : 1 ] = l , sclip += l ;
} else if ( m [ 2 ] == 'H' ) {
clip [ M == 0 ? 0 : 1 ] = l ;
}
}
if ( n _cigar > 65535 ) ++ n _cigar _64k ;
if ( ql + sclip != aqlen )
warn ( "WARNING: aligned query length is inconsistent with CIGAR at line " + lineno + " (" + ( ql + sclip ) + " != " + aqlen + ")" ) ;
if ( atlen != null && atlen != tl )
warn ( "WARNING: aligned reference length is inconsistent with CIGAR at line " + lineno ) ;
if ( is _sam ) {
qs = clip [ is _rev ? 1 : 0 ] , qe = qs + ql ;
ori _qlen = clip [ 0 ] + ql + clip [ 1 ] ;
}
regs . push ( [ qs , qe ] ) ;
last _qlen = ori _qlen ;
}
}
l _tot += last _qlen ;
l _cov += cov _len ( regs ) ;
file . close ( ) ;
buf . destroy ( ) ;
if ( gap _out _len == null ) {
print ( "Number of mapped sequences: " + n _seq ) ;
print ( "Number of primary alignments: " + n _pri ) ;
print ( "Number of secondary alignments: " + n _2nd ) ;
print ( "Number of primary alignments with >65535 CIGAR operations: " + n _cigar _64k ) ;
print ( "Number of bases in mapped sequences: " + l _tot ) ;
print ( "Number of mapped bases: " + l _cov ) ;
print ( "Number of insertions in [0,50): " + n _gap [ 0 ] [ 0 ] ) ;
print ( "Number of insertions in [50,100): " + n _gap [ 0 ] [ 1 ] ) ;
print ( "Number of insertions in [100,300): " + n _gap [ 0 ] [ 2 ] ) ;
print ( "Number of insertions in [300,400): " + n _gap [ 0 ] [ 3 ] ) ;
print ( "Number of insertions in [400,1000): " + n _gap [ 0 ] [ 4 ] ) ;
print ( "Number of insertions in [1000,inf): " + n _gap [ 0 ] [ 5 ] ) ;
print ( "Number of deletions in [0,50): " + n _gap [ 1 ] [ 0 ] ) ;
print ( "Number of deletions in [50,100): " + n _gap [ 1 ] [ 1 ] ) ;
print ( "Number of deletions in [100,300): " + n _gap [ 1 ] [ 2 ] ) ;
print ( "Number of deletions in [300,400): " + n _gap [ 1 ] [ 3 ] ) ;
print ( "Number of deletions in [400,1000): " + n _gap [ 1 ] [ 4 ] ) ;
print ( "Number of deletions in [1000,inf): " + n _gap [ 1 ] [ 5 ] ) ;
}
}
2018-02-09 23:23:30 +08:00
/ * * * * * * * * * * * * * * * * * * * * * * * * * *
* * * Conversion related * * *
* * * * * * * * * * * * * * * * * * * * * * * * * * /
function paf _view ( args )
{
var c , line _len = 80 , fmt = "aln" ;
while ( ( c = getopt ( args , "f:l:" ) ) != null ) {
if ( c == 'f' ) {
fmt = getopt . arg ;
if ( fmt != "aln" && fmt != "lastz-cigar" && fmt != "maf" )
throw Error ( "format must be one of aln, lastz-cigar and maf" ) ;
} else if ( c == 'l' ) line _len = parseInt ( getopt . arg ) ;
}
if ( line _len == 0 ) line _len = 0x7fffffff ;
if ( getopt . ind == args . length ) {
print ( "Usage: paftools.js view [options] <in.paf>" ) ;
print ( "Options:" ) ;
print ( " -f STR output format: aln (BLAST-like), maf or lastz-cigar [aln]" ) ;
print ( " -l INT line length in BLAST-like output [80]" ) ;
exit ( 1 ) ;
}
function padding _str ( x , len , right )
{
var s = x . toString ( ) ;
if ( s . length < len ) {
if ( right ) s += Array ( len - s . length + 1 ) . join ( " " ) ;
else s = Array ( len - s . length + 1 ) . join ( " " ) + s ;
}
return s ;
}
function update _aln ( s _ref , s _qry , s _mid , type , seq , slen )
{
var l = type == '*' ? 1 : seq . length ;
if ( type == '=' || type == ':' ) {
s _ref . set ( seq ) ;
s _qry . set ( seq ) ;
s _mid . set ( Array ( l + 1 ) . join ( "|" ) ) ;
slen [ 0 ] += l , slen [ 1 ] += l ;
} else if ( type == '*' ) {
s _ref . set ( seq . charAt ( 0 ) ) ;
s _qry . set ( seq . charAt ( 1 ) ) ;
s _mid . set ( ' ' ) ;
slen [ 0 ] += 1 , slen [ 1 ] += 1 ;
} else if ( type == '+' ) {
s _ref . set ( Array ( l + 1 ) . join ( "-" ) ) ;
s _qry . set ( seq ) ;
s _mid . set ( Array ( l + 1 ) . join ( " " ) ) ;
slen [ 1 ] += l ;
} else if ( type == '-' ) {
s _ref . set ( seq ) ;
s _qry . set ( Array ( l + 1 ) . join ( "-" ) ) ;
s _mid . set ( Array ( l + 1 ) . join ( " " ) ) ;
slen [ 0 ] += l ;
}
}
function print _aln ( rs , qs , strand , slen , elen , s _ref , s _qry , s _mid )
{
print ( [ "Ref+:" , padding _str ( rs + slen [ 0 ] + 1 , 10 , false ) , s _ref . toString ( ) , padding _str ( rs + elen [ 0 ] , 10 , true ) ] . join ( " " ) ) ;
print ( " " + s _mid . toString ( ) ) ;
var st , en ;
if ( strand == '+' ) st = qs + slen [ 1 ] + 1 , en = qs + elen [ 1 ] ;
else st = qs - slen [ 1 ] , en = qs - elen [ 1 ] + 1 ;
print ( [ "Qry" + strand + ":" , padding _str ( st , 10 , false ) , s _qry . toString ( ) , padding _str ( en , 10 , true ) ] . join ( " " ) ) ;
}
var s _ref = new Bytes ( ) , s _qry = new Bytes ( ) , s _mid = new Bytes ( ) ; // these are used to show padded alignment
var re _cs = /([:=\-\+\*])(\d+|[A-Za-z]+)/g ;
var re _cg = /(\d+)([MIDNSH])/g ;
var buf = new Bytes ( ) ;
var file = args [ getopt . ind ] == "-" ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var lineno = 0 ;
if ( fmt == "maf" ) print ( "##maf version=1\n" ) ;
while ( file . readline ( buf ) >= 0 ) {
var m , line = buf . toString ( ) ;
var t = line . split ( "\t" , 12 ) ;
++ lineno ;
s _ref . length = s _qry . length = s _mid . length = 0 ;
var slen = [ 0 , 0 ] , elen = [ 0 , 0 ] ;
if ( fmt == "lastz-cigar" ) { // LASTZ-cigar output
var cg = ( m = /\tcg:Z:(\S+)/ . exec ( line ) ) != null ? m [ 1 ] : null ;
if ( cg == null ) {
warn ( "WARNING: converting to LASTZ-cigar format requires the 'cg' tag, which is absent on line " + lineno ) ;
continue ;
}
var score = ( m = /\tAS:i:(\d+)/ . exec ( line ) ) != null ? m [ 1 ] : 0 ;
var out = [ 'cigar:' , t [ 0 ] , t [ 2 ] , t [ 3 ] , t [ 4 ] , t [ 5 ] , t [ 7 ] , t [ 8 ] , '+' , score ] ;
while ( ( m = re _cg . exec ( cg ) ) != null )
out . push ( m [ 2 ] , m [ 1 ] ) ;
print ( out . join ( " " ) ) ;
} else if ( fmt == "maf" ) { // MAF output
var cs = ( m = /\tcs:Z:(\S+)/ . exec ( line ) ) != null ? m [ 1 ] : null ;
if ( cs == null ) {
warn ( "WARNING: converting to MAF requires the 'cs' tag, which is absent on line " + lineno ) ;
continue ;
}
while ( ( m = re _cs . exec ( cs ) ) != null ) {
if ( m [ 1 ] == ':' )
throw Error ( "converting to MAF only works with 'minimap2 --cs=long'" ) ;
update _aln ( s _ref , s _qry , s _mid , m [ 1 ] , m [ 2 ] , elen ) ;
}
var score = ( m = /\tAS:i:(\d+)/ . exec ( line ) ) != null ? parseInt ( m [ 1 ] ) : 0 ;
var len = t [ 0 ] . length > t [ 5 ] . length ? t [ 0 ] . length : t [ 5 ] . length ;
print ( "a " + score ) ;
print ( [ "s" , padding _str ( t [ 5 ] , len , true ) , padding _str ( t [ 7 ] , 10 , false ) , padding _str ( parseInt ( t [ 8 ] ) - parseInt ( t [ 7 ] ) , 10 , false ) ,
"+" , padding _str ( t [ 6 ] , 10 , false ) , s _ref . toString ( ) ] . join ( " " ) ) ;
var qs , qe , ql = parseInt ( t [ 1 ] ) ;
if ( t [ 4 ] == '+' ) {
qs = parseInt ( t [ 2 ] ) ;
qe = parseInt ( t [ 3 ] ) ;
} else {
qs = ql - parseInt ( t [ 3 ] ) ;
qe = ql - parseInt ( t [ 2 ] ) ;
}
print ( [ "s" , padding _str ( t [ 0 ] , len , true ) , padding _str ( qs , 10 , false ) , padding _str ( qe - qs , 10 , false ) ,
t [ 4 ] , padding _str ( ql , 10 , false ) , s _qry . toString ( ) ] . join ( " " ) ) ;
print ( "" ) ;
} else { // BLAST-like output
var cs = ( m = /\tcs:Z:(\S+)/ . exec ( line ) ) != null ? m [ 1 ] : null ;
if ( cs == null ) {
warn ( "WARNING: converting to BLAST-like alignment requires the 'cs' tag, which is absent on line " + lineno ) ;
continue ;
}
line = line . replace ( /\tc[sg]:Z:\S+/g , "" ) ; // get rid of cs or cg tags
print ( '>' + line ) ;
var rs = parseInt ( t [ 7 ] ) , qs = t [ 4 ] == '+' ? parseInt ( t [ 2 ] ) : parseInt ( t [ 3 ] ) ;
var n _blocks = 0 ;
while ( ( m = re _cs . exec ( cs ) ) != null ) {
if ( m [ 1 ] == ':' ) m [ 2 ] = Array ( parseInt ( m [ 2 ] ) + 1 ) . join ( "=" ) ;
var start = 0 , rest = m [ 1 ] == '*' ? 1 : m [ 2 ] . length ;
while ( rest > 0 ) {
var l _proc ;
if ( s _ref . length + rest >= line _len ) {
l _proc = line _len - s _ref . length ;
update _aln ( s _ref , s _qry , s _mid , m [ 1 ] , m [ 1 ] == '*' ? m [ 2 ] : m [ 2 ] . substr ( start , l _proc ) , elen ) ;
if ( n _blocks > 0 ) print ( "" ) ;
print _aln ( rs , qs , t [ 4 ] , slen , elen , s _ref , s _qry , s _mid ) ;
++ n _blocks ;
s _ref . length = s _qry . length = s _mid . length = 0 ;
slen [ 0 ] = elen [ 0 ] , slen [ 1 ] = elen [ 1 ] ;
} else {
l _proc = rest ;
update _aln ( s _ref , s _qry , s _mid , m [ 1 ] , m [ 1 ] == '*' ? m [ 2 ] : m [ 2 ] . substr ( start , l _proc ) , elen ) ;
}
rest -= l _proc , start += l _proc ;
}
}
if ( s _ref . length > 0 ) {
if ( n _blocks > 0 ) print ( "" ) ;
print _aln ( rs , qs , t [ 4 ] , slen , elen , s _ref , s _qry , s _mid ) ;
++ n _blocks ;
}
print ( "//" ) ;
}
}
file . close ( ) ;
buf . destroy ( ) ;
s _ref . destroy ( ) ; s _qry . destroy ( ) ; s _mid . destroy ( ) ;
}
2018-02-09 23:47:12 +08:00
function paf _gff2bed ( args )
{
var c , fn _ucsc _fai = null , is _short = false ;
while ( ( c = getopt ( args , "u:s" ) ) != null ) {
if ( c == 'u' ) fn _ucsc _fai = getopt . arg ;
else if ( c == 's' ) is _short = true ;
}
if ( getopt . ind == args . length ) {
print ( "Usage: paftools.js gff2bed [-u ucsc-genome.fa.fai] <in.gff>" ) ;
exit ( 1 ) ;
}
var ens2ucsc = { } ;
if ( fn _ucsc _fai != null ) {
var buf = new Bytes ( ) ;
var file = new File ( fn _ucsc _fai ) ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
var s = t [ 0 ] ;
if ( /_(random|alt|decoy)$/ . test ( s ) ) {
s = s . replace ( /_(random|alt|decoy)$/ , '' ) ;
s = s . replace ( /^chr\S+_/ , '' ) ;
} else {
s = s . replace ( /^chrUn_/ , '' ) ;
}
s = s . replace ( /v(\d+)/ , ".$1" ) ;
if ( s != t [ 0 ] ) ens2ucsc [ s ] = t [ 0 ] ;
}
file . close ( ) ;
buf . destroy ( ) ;
}
var colors = {
'protein_coding' : '0,128,255' ,
'lincRNA' : '0,192,0' ,
'snRNA' : '0,192,0' ,
'miRNA' : '0,192,0' ,
'misc_RNA' : '0,192,0'
} ;
function print _bed12 ( exons , cds _st , cds _en , is _short )
{
if ( exons . length == 0 ) return ;
var name = is _short ? exons [ 0 ] [ 7 ] + "|" + exons [ 0 ] [ 5 ] : exons [ 0 ] . slice ( 4 , 7 ) . join ( "|" ) ;
var a = exons . sort ( function ( a , b ) { return a [ 1 ] - b [ 1 ] } ) ;
var sizes = [ ] , starts = [ ] , st , en ;
st = a [ 0 ] [ 1 ] ;
en = a [ a . length - 1 ] [ 2 ] ;
if ( cds _st == 1 << 30 ) cds _st = st ;
if ( cds _en == 0 ) cds _en = en ;
if ( cds _st < st || cds _en > en )
throw Error ( "inconsistent thick start or end for transcript " + a [ 0 ] [ 4 ] ) ;
for ( var i = 0 ; i < a . length ; ++ i ) {
sizes . push ( a [ i ] [ 2 ] - a [ i ] [ 1 ] ) ;
starts . push ( a [ i ] [ 1 ] - st ) ;
}
var color = colors [ a [ 0 ] [ 5 ] ] ;
if ( color == null ) color = '196,196,196' ;
print ( a [ 0 ] [ 0 ] , st , en , name , 1000 , a [ 0 ] [ 3 ] , cds _st , cds _en , color , a . length , sizes . join ( "," ) + "," , starts . join ( "," ) + "," ) ;
}
var re _gtf = /(transcript_id|transcript_type|transcript_biotype|gene_name|transcript_name) "([^"]+)";/g ;
var re _gff3 = /(transcript_id|transcript_type|transcript_biotype|gene_name|transcript_name)=([^;]+)/g ;
var buf = new Bytes ( ) ;
var file = args [ getopt . ind ] == '-' ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var exons = [ ] , cds _st = 1 << 30 , cds _en = 0 , last _id = null ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
if ( t [ 0 ] . charAt ( 0 ) == '#' ) continue ;
if ( t [ 2 ] != "CDS" && t [ 2 ] != "exon" ) continue ;
t [ 3 ] = parseInt ( t [ 3 ] ) - 1 ;
t [ 4 ] = parseInt ( t [ 4 ] ) ;
var id = null , type = "" , gname = "N/A" , biotype = "" , m , tname = "N/A" ;
while ( ( m = re _gtf . exec ( t [ 8 ] ) ) != null ) {
if ( m [ 1 ] == "transcript_id" ) id = m [ 2 ] ;
else if ( m [ 1 ] == "transcript_type" ) type = m [ 2 ] ;
else if ( m [ 1 ] == "transcript_biotype" ) biotype = m [ 2 ] ;
else if ( m [ 1 ] == "gene_name" ) name = m [ 2 ] ;
else if ( m [ 1 ] == "transcript_name" ) tname = m [ 2 ] ;
}
while ( ( m = re _gff3 . exec ( t [ 8 ] ) ) != null ) {
if ( m [ 1 ] == "transcript_id" ) id = m [ 2 ] ;
else if ( m [ 1 ] == "transcript_type" ) type = m [ 2 ] ;
else if ( m [ 1 ] == "transcript_biotype" ) biotype = m [ 2 ] ;
else if ( m [ 1 ] == "gene_name" ) name = m [ 2 ] ;
else if ( m [ 1 ] == "transcript_name" ) tname = m [ 2 ] ;
}
if ( type == "" && biotype != "" ) type = biotype ;
if ( id == null ) throw Error ( "No transcript_id" ) ;
if ( id != last _id ) {
print _bed12 ( exons , cds _st , cds _en , is _short ) ;
exons = [ ] , cds _st = 1 << 30 , cds _en = 0 ;
last _id = id ;
}
if ( t [ 2 ] == "CDS" ) {
cds _st = cds _st < t [ 3 ] ? cds _st : t [ 3 ] ;
cds _en = cds _en > t [ 4 ] ? cds _en : t [ 4 ] ;
} else if ( t [ 2 ] == "exon" ) {
if ( fn _ucsc _fai != null ) {
if ( ens2ucsc [ t [ 0 ] ] != null )
t [ 0 ] = ens2ucsc [ t [ 0 ] ] ;
else if ( /^[A-Z]+\d+\.\d+$/ . test ( t [ 0 ] ) )
t [ 0 ] = t [ 0 ] . replace ( /([A-Z]+\d+)\.(\d+)/ , "chrUn_$1v$2" ) ;
}
exons . push ( [ t [ 0 ] , t [ 3 ] , t [ 4 ] , t [ 6 ] , id , type , name , tname ] ) ;
}
}
if ( last _id != null )
print _bed12 ( exons , cds _st , cds _en , is _short ) ;
file . close ( ) ;
buf . destroy ( ) ;
}
2018-02-09 23:23:30 +08:00
function paf _sam2paf ( args )
{
var c , pri _only = false ;
while ( ( c = getopt ( args , "p" ) ) != null )
if ( c == 'p' ) pri _only = true ;
var file = args . length == getopt . ind || args [ getopt . ind ] == "-" ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var buf = new Bytes ( ) ;
var re = /(\d+)([MIDSHNX=])/g ;
var len = { } , lineno = 0 ;
while ( file . readline ( buf ) >= 0 ) {
var m , n _cigar = 0 , line = buf . toString ( ) ;
++ lineno ;
if ( line . charAt ( 0 ) == '@' ) {
if ( /^@SQ/ . test ( line ) ) {
var name = ( m = /\tSN:(\S+)/ . exec ( line ) ) != null ? m [ 1 ] : null ;
var l = ( m = /\tLN:(\d+)/ . exec ( line ) ) != null ? parseInt ( m [ 1 ] ) : null ;
if ( name != null && l != null ) len [ name ] = l ;
}
continue ;
}
var t = line . split ( "\t" ) ;
var flag = parseInt ( t [ 1 ] ) ;
if ( t [ 9 ] != '*' && t [ 10 ] != '*' && t [ 9 ] . length != t [ 10 ] . length ) throw Error ( "ERROR at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t [ 9 ] . length + " != " + t [ 10 ] . length ) ;
if ( t [ 2 ] == '*' || ( flag & 4 ) ) continue ;
if ( pri _only && ( flag & 0x100 ) ) continue ;
var tlen = len [ t [ 2 ] ] ;
if ( tlen == null ) throw Error ( "ERROR at line " + lineno + ": can't find the length of contig " + t [ 2 ] ) ;
var nn = ( m = /\tnn:i:(\d+)/ . exec ( line ) ) != null ? parseInt ( m [ 1 ] ) : 0 ;
var NM = ( m = /\tNM:i:(\d+)/ . exec ( line ) ) != null ? parseInt ( m [ 1 ] ) : null ;
var have _NM = NM == null ? false : true ;
NM += nn ;
var clip = [ 0 , 0 ] , I = [ 0 , 0 ] , D = [ 0 , 0 ] , M = 0 , N = 0 , ql = 0 , tl = 0 , mm = 0 , ext _cigar = false ;
while ( ( m = re . exec ( t [ 5 ] ) ) != null ) {
var l = parseInt ( m [ 1 ] ) ;
if ( m [ 2 ] == 'M' ) M += l , ql += l , tl += l , ext _cigar = false ;
else if ( m [ 2 ] == 'I' ) ++ I [ 0 ] , I [ 1 ] += l , ql += l ;
else if ( m [ 2 ] == 'D' ) ++ D [ 0 ] , D [ 1 ] += l , tl += l ;
else if ( m [ 2 ] == 'N' ) N += l , tl += l ;
else if ( m [ 2 ] == 'S' ) clip [ M == 0 ? 0 : 1 ] = l , ql += l ;
else if ( m [ 2 ] == 'H' ) clip [ M == 0 ? 0 : 1 ] = l ;
else if ( m [ 2 ] == '=' ) M += l , ql += l , tl += l , ext _cigar = true ;
else if ( m [ 2 ] == 'X' ) M += l , ql += l , tl += l , mm += l , ext _cigar = true ;
++ n _cigar ;
}
if ( n _cigar > 65535 )
warn ( "WARNING at line " + lineno + ": " + n _cigar + " CIGAR operations" ) ;
if ( tl + parseInt ( t [ 3 ] ) - 1 > tlen ) {
warn ( "WARNING at line " + lineno + ": alignment end position larger than ref length; skipped" ) ;
continue ;
}
if ( t [ 9 ] != '*' && t [ 9 ] . length != ql ) {
warn ( "WARNING at line " + lineno + ": SEQ length inconsistent with CIGAR (" + t [ 9 ] . length + " != " + ql + "); skipped" ) ;
continue ;
}
if ( ! have _NM || ext _cigar ) NM = I [ 1 ] + D [ 1 ] + mm ;
if ( NM < I [ 1 ] + D [ 1 ] + mm ) {
warn ( "WARNING at line " + lineno + ": NM is less than the total number of gaps (" + NM + " < " + ( I [ 1 ] + D [ 1 ] + mm ) + ")" ) ;
NM = I [ 1 ] + D [ 1 ] + mm ;
}
var extra = [ "mm:i:" + ( NM - I [ 1 ] - D [ 1 ] ) , "io:i:" + I [ 0 ] , "in:i:" + I [ 1 ] , "do:i:" + D [ 0 ] , "dn:i:" + D [ 1 ] ] ;
var match = M - ( NM - I [ 1 ] - D [ 1 ] ) ;
var blen = M + I [ 1 ] + D [ 1 ] ;
var qlen = M + I [ 1 ] + clip [ 0 ] + clip [ 1 ] ;
var qs , qe ;
if ( flag & 16 ) qs = clip [ 1 ] , qe = qlen - clip [ 0 ] ;
else qs = clip [ 0 ] , qe = qlen - clip [ 1 ] ;
var ts = parseInt ( t [ 3 ] ) - 1 , te = ts + M + D [ 1 ] + N ;
var qname = t [ 0 ] ;
if ( ( flag & 1 ) && ( flag & 0x40 ) ) qname += '/1' ;
if ( ( flag & 1 ) && ( flag & 0x80 ) ) qname += '/2' ;
var a = [ qname , qlen , qs , qe , flag & 16 ? '-' : '+' , t [ 2 ] , tlen , ts , te , match , blen , t [ 4 ] ] ;
print ( a . join ( "\t" ) , extra . join ( "\t" ) ) ;
}
buf . destroy ( ) ;
file . close ( ) ;
}
2018-02-10 01:14:28 +08:00
function paf _delta2paf ( args )
{
if ( args . length == 0 ) {
print ( "Usage: paftools.js delta2paf <in.delta>" ) ;
exit ( 1 ) ;
}
var buf = new Bytes ( ) ;
var file = args [ 0 ] == '-' ? new File ( ) : new File ( args [ 0 ] ) ;
var rname , qname , rlen , qlen , qs , qe , rs , re , strand , NM , cigar , x , y , seen _gt = false ;
while ( file . readline ( buf ) >= 0 ) {
var m , line = buf . toString ( ) ;
if ( ( m = /^>(\S+)\s+(\S+)\s+(\d+)\s+(\d+)/ . exec ( line ) ) != null ) {
rname = m [ 1 ] , qname = m [ 2 ] , rlen = parseInt ( m [ 3 ] ) , qlen = parseInt ( m [ 4 ] ) ;
seen _gt = true ;
continue ;
}
if ( ! seen _gt ) continue ;
var t = line . split ( " " ) ;
if ( t . length == 7 ) {
for ( var i = 0 ; i < 5 ; ++ i )
t [ i ] = parseInt ( t [ i ] ) ;
strand = ( ( t [ 0 ] < t [ 1 ] && t [ 2 ] < t [ 3 ] ) || ( t [ 0 ] > t [ 1 ] && t [ 2 ] > t [ 3 ] ) ) ? 1 : - 1 ;
rs = ( t [ 0 ] < t [ 1 ] ? t [ 0 ] : t [ 1 ] ) - 1 ;
re = t [ 1 ] > t [ 0 ] ? t [ 1 ] : t [ 0 ] ;
qs = ( t [ 2 ] < t [ 3 ] ? t [ 2 ] : t [ 3 ] ) - 1 ;
qe = t [ 3 ] > t [ 2 ] ? t [ 3 ] : t [ 2 ] ;
x = y = 0 ;
NM = parseInt ( t [ 4 ] ) ;
cigar = [ ] ;
} else if ( t . length == 1 ) {
var d = parseInt ( t [ 0 ] ) ;
if ( d == 0 ) {
var blen = 0 , cigar _str = [ ] ;
if ( re - rs - x != qe - qs - y ) throw Error ( "inconsisnt alignment" ) ;
cigar . push ( ( re - rs - x ) << 4 ) ;
for ( var i = 0 ; i < cigar . length ; ++ i ) {
blen += cigar [ i ] >> 4 ;
cigar _str . push ( ( cigar [ i ] >> 4 ) + "MID" . charAt ( cigar [ i ] & 0xf ) ) ;
}
print ( [ qname , qlen , qs , qe , strand > 0 ? '+' : '-' , rname , rlen , rs , re , blen - NM , blen , 0 , "NM:i:" + NM , "cg:Z:" + cigar _str . join ( "" ) ] . join ( "\t" ) ) ;
} else if ( d > 0 ) {
var l = d - 1 ;
x += l + 1 , y += l ;
if ( l > 0 ) cigar . push ( l << 4 ) ;
if ( cigar . length > 0 && ( cigar [ cigar . length - 1 ] & 0xf ) == 2 )
cigar [ cigar . length - 1 ] += 1 << 4 ;
else cigar . push ( 1 << 4 | 2 ) ; // deletion
} else {
var l = - d - 1 ;
x += l , y += l + 1 ;
if ( l > 0 ) cigar . push ( l << 4 ) ;
if ( cigar . length > 0 && ( cigar [ cigar . length - 1 ] & 0xf ) == 1 )
cigar [ cigar . length - 1 ] += 1 << 4 ;
else cigar . push ( 1 << 4 | 1 ) ; // insertion
}
}
}
file . close ( ) ;
buf . destroy ( ) ;
}
2018-02-09 23:23:30 +08:00
function paf _splice2bed ( args )
{
var colors = [ "0,128,255" , "255,0,0" , "0,192,0" ] ;
function print _lines ( a , fmt )
{
if ( a . length == 0 ) return ;
if ( fmt == "bed" ) {
var n _pri = 0 ;
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ i ] [ 8 ] == 0 ) ++ n _pri ;
if ( n _pri > 1 ) {
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ i ] [ 8 ] == 0 ) a [ i ] [ 8 ] = 1 ;
} else if ( n _pri == 0 ) {
warn ( "Warning: " + a [ 0 ] [ 3 ] + " doesn't have a primary alignment" ) ;
}
for ( var i = 0 ; i < a . length ; ++ i ) {
a [ i ] [ 8 ] = colors [ a [ i ] [ 8 ] ] ;
print ( a [ i ] . join ( "\t" ) ) ;
}
}
a . length = 0 ;
}
var re = /(\d+)([MIDNSH])/g ;
var c , fmt = "bed" , fn _name _conv = null ;
while ( ( c = getopt ( args , "f:n:" ) ) != null ) {
if ( c == 'f' ) fmt = getopt . arg ;
else if ( c == 'n' ) fn _name _conv = getopt . arg ;
}
if ( getopt . ind == args . length ) {
warn ( "Usage: paftools.js splice2bed <in.paf>|<in.sam>" ) ;
exit ( 1 ) ;
}
var conv = null ;
if ( fn _name _conv != null ) {
conv = new Map ( ) ;
var file = new File ( fn _name _conv ) ;
var buf = new Bytes ( ) ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
conv . put ( t [ 0 ] , t [ 1 ] ) ;
}
buf . destroy ( ) ;
file . close ( ) ;
}
var file = args [ getopt . ind ] == '-' ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var buf = new Bytes ( ) ;
var a = [ ] ;
while ( file . readline ( buf ) >= 0 ) {
var line = buf . toString ( ) ;
if ( line . charAt ( 0 ) == '@' ) continue ; // skip SAM header lines
var t = line . split ( "\t" ) ;
var is _pri = false , cigar = null , a1 ;
var qname = conv != null ? conv . get ( t [ 0 ] ) : null ;
if ( qname != null ) t [ 0 ] = qname ;
if ( t . length >= 10 && t [ 4 ] != '+' && t [ 4 ] != '-' && /^\d+/ . test ( t [ 1 ] ) ) { // SAM
var flag = parseInt ( t [ 1 ] ) ;
if ( flag & 1 ) t [ 0 ] += '/' + ( flag >> 6 & 3 ) ;
}
if ( a . length && a [ 0 ] [ 3 ] != t [ 0 ] ) {
print _lines ( a , fmt ) ;
a = [ ] ;
}
if ( t . length >= 12 && ( t [ 4 ] == '+' || t [ 4 ] == '-' ) ) { // PAF
for ( var i = 12 ; i < t . length ; ++ i ) {
if ( t [ i ] . substr ( 0 , 5 ) == 'cg:Z:' ) {
cigar = t [ i ] . substr ( 5 ) ;
} else if ( t [ i ] . substr ( 0 , 5 ) == 's2:i:' ) {
is _pri = true ;
}
}
a1 = [ t [ 5 ] , t [ 7 ] , t [ 8 ] , t [ 0 ] , Math . floor ( t [ 9 ] / t [ 10 ] * 1000 ) , t [ 4 ] ] ;
} else if ( t . length >= 10 ) { // SAM
var flag = parseInt ( t [ 1 ] ) ;
if ( ( flag & 4 ) || a [ 2 ] == '*' ) continue ;
cigar = t [ 5 ] ;
is _pri = ( flag & 0x100 ) ? false : true ;
a1 = [ t [ 2 ] , parseInt ( t [ 3 ] ) - 1 , null , t [ 0 ] , 1000 , ( flag & 16 ) ? '-' : '+' ] ;
} else {
throw Error ( "unrecognized input format" ) ;
}
if ( cigar == null ) throw Error ( "missing CIGAR" ) ;
var m , x0 = 0 , x = 0 , bs = [ ] , bl = [ ] ;
while ( ( m = re . exec ( cigar ) ) != null ) {
if ( m [ 2 ] == 'M' || m [ 2 ] == 'D' ) {
x += parseInt ( m [ 1 ] ) ;
} else if ( m [ 2 ] == 'N' ) {
bs . push ( x0 ) ;
bl . push ( x - x0 ) ;
x += parseInt ( m [ 1 ] ) ;
x0 = x ;
}
}
bs . push ( x0 ) ;
bl . push ( x - x0 ) ;
// write the BED12 line
if ( a1 [ 2 ] == null ) a1 [ 2 ] = a1 [ 1 ] + x ;
a1 . push ( a1 [ 1 ] , a1 [ 2 ] ) ; // thick start/end is the same as start/end
a1 . push ( is _pri ? 0 : 2 , bs . length , bl . join ( "," ) + "," , bs . join ( "," ) + "," ) ;
a . push ( a1 ) ;
}
print _lines ( a , fmt ) ;
buf . destroy ( ) ;
file . close ( ) ;
if ( conv != null ) conv . destroy ( ) ;
}
2018-02-09 23:00:35 +08:00
/ * * * * * * * * * * * * * * * * * * * * * * * * * *
* * * Evaluation related * * *
* * * * * * * * * * * * * * * * * * * * * * * * * * /
// evaluate mapping accuracy
function paf _mapeval ( args )
{
var c , max _mapq = 60 , mode = 0 , err _out _q = 256 , print _err = false , ovlp _ratio = 0.1 , cap _short _mapq = false ;
while ( ( c = getopt ( args , "Q:r:m:c" ) ) != null ) {
if ( c == 'Q' ) err _out _q = parseInt ( getopt . arg ) , print _err = true ;
else if ( c == 'r' ) ovlp _ratio = parseFloat ( getopt . arg ) ;
else if ( c == 'm' ) mode = parseInt ( getopt . arg ) ;
else if ( c == 'c' ) cap _short _mapq = true ;
}
if ( args . length == getopt . ind ) {
warn ( "Usage: paftools.js mapeval [options] <in.paf>|<in.sam>" ) ;
warn ( "Options:" ) ;
warn ( " -r FLOAT mapping correct if overlap_length/union_length>FLOAT [" + ovlp _ratio + "]" ) ;
warn ( " -Q INT print wrong mappings with mapQ>INT [don't print]" ) ;
warn ( " -m INT 0: eval the longest aln only; 1: first aln only; 2: all primary aln [0]" ) ;
exit ( 1 ) ;
}
var file = args [ getopt . ind ] == '-' ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var buf = new Bytes ( ) ;
var tot = [ ] , err = [ ] ;
for ( var q = 0 ; q <= max _mapq ; ++ q )
tot [ q ] = err [ q ] = 0 ;
function is _correct ( s , b )
{
if ( s [ 0 ] != b [ 0 ] || s [ 3 ] != b [ 3 ] ) return false ;
var o , l ;
if ( s [ 1 ] < b [ 1 ] ) {
if ( s [ 2 ] <= b [ 1 ] ) return false ;
o = ( s [ 2 ] < b [ 2 ] ? s [ 2 ] : b [ 2 ] ) - b [ 1 ] ;
l = ( s [ 2 ] > b [ 2 ] ? s [ 2 ] : b [ 2 ] ) - s [ 1 ] ;
} else {
if ( b [ 2 ] <= s [ 1 ] ) return false ;
o = ( s [ 2 ] < b [ 2 ] ? s [ 2 ] : b [ 2 ] ) - s [ 1 ] ;
l = ( s [ 2 ] > b [ 2 ] ? s [ 2 ] : b [ 2 ] ) - b [ 1 ] ;
}
return o / l > ovlp _ratio ? true : false ;
}
function count _err ( qname , a , tot , err , mode )
{
if ( a . length == 0 ) return ;
var m , s ;
if ( ( m = /^(\S+)!(\S+)!(\d+)!(\d+)!([\+\-])$/ . exec ( qname ) ) != null ) { // pbsim single-end reads
s = [ m [ 1 ] , m [ 2 ] , parseInt ( m [ 3 ] ) , parseInt ( m [ 4 ] ) , m [ 5 ] ] ;
} else if ( ( m = /^(\S+)!(\S+)!(\d+)_(\d+)!(\d+)_(\d+)!([\+\-])([\+\-])\/([12])$/ . exec ( qname ) ) != null ) { // mason2 paired-end reads
if ( m [ 9 ] == '1' ) {
s = [ m [ 1 ] , m [ 2 ] , parseInt ( m [ 3 ] ) , parseInt ( m [ 5 ] ) , m [ 7 ] ] ;
} else {
s = [ m [ 1 ] , m [ 2 ] , parseInt ( m [ 4 ] ) , parseInt ( m [ 6 ] ) , m [ 8 ] ] ;
}
} else throw Error ( "Failed to parse simulated read names '" + qname + "'" ) ;
s . shift ( ) ; // skip the orginal read name
if ( mode == 0 || mode == 1 ) { // longest only or first only
var max _i = 0 ;
if ( mode == 0 ) { // longest only
var max = 0 ;
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ i ] [ 5 ] > max )
max = a [ i ] [ 5 ] , max _i = i ;
}
var mapq = a [ max _i ] [ 4 ] ;
++ tot [ mapq ] ;
if ( ! is _correct ( s , a [ max _i ] ) ) {
if ( mapq >= err _out _q )
print ( 'E' , qname , a [ max _i ] . join ( "\t" ) ) ;
++ err [ mapq ] ;
}
} else if ( mode == 2 ) { // all primary mode
var max _err _mapq = - 1 , max _mapq = 0 , max _err _i = - 1 ;
if ( cap _short _mapq ) {
var max = 0 , max _q = 0 ;
for ( var i = 0 ; i < a . length ; ++ i )
if ( a [ i ] [ 5 ] > max )
max = a [ i ] [ 5 ] , max _q = a [ i ] [ 4 ] ;
for ( var i = 0 ; i < a . length ; ++ i )
a [ i ] [ 4 ] = max _q < a [ i ] [ 4 ] ? max _q : a [ i ] [ 4 ] ;
}
for ( var i = 0 ; i < a . length ; ++ i ) {
max _mapq = max _mapq > a [ i ] [ 4 ] ? max _mapq : a [ i ] [ 4 ] ;
if ( ! is _correct ( s , a [ i ] ) )
if ( a [ i ] [ 4 ] > max _err _mapq )
max _err _mapq = a [ i ] [ 4 ] , max _err _i = i ;
}
if ( max _err _mapq >= 0 ) {
++ tot [ max _err _mapq ] , ++ err [ max _err _mapq ] ;
if ( max _err _mapq >= err _out _q )
print ( 'E' , qname , a [ max _err _i ] . join ( "\t" ) ) ;
} else ++ tot [ max _mapq ] ;
}
}
var lineno = 0 , last = null , a = [ ] , n _unmapped = null ;
var re _cigar = /(\d+)([MIDSHN])/g ;
while ( file . readline ( buf ) >= 0 ) {
var m , line = buf . toString ( ) ;
++ lineno ;
if ( line [ 0 ] != '@' ) {
var t = line . split ( "\t" ) ;
if ( t [ 4 ] == '+' || t [ 4 ] == '-' ) { // PAF
if ( last != t [ 0 ] ) {
if ( last != null ) count _err ( last , a , tot , err , mode ) ;
a = [ ] , last = t [ 0 ] ;
}
if ( /\ts1:i:\d+/ . test ( line ) && ! /\ts2:i:\d+/ . test ( line ) ) // secondary alignment in minimap2 PAF
continue ;
var mapq = parseInt ( t [ 11 ] ) ;
if ( mapq > max _mapq ) mapq = max _mapq ;
a . push ( [ t [ 5 ] , parseInt ( t [ 7 ] ) , parseInt ( t [ 8 ] ) , t [ 4 ] , mapq , parseInt ( t [ 9 ] ) ] ) ;
} else { // SAM
var flag = parseInt ( t [ 1 ] ) ;
var read _no = flag >> 6 & 0x3 ;
var qname = t [ 0 ] ;
if ( ! /\/[12]$/ . test ( qname ) )
qname = read _no == 1 || read _no == 2 ? t [ 0 ] + '/' + read _no : t [ 0 ] ;
if ( last != qname ) {
if ( last != null ) count _err ( last , a , tot , err , mode ) ;
a = [ ] , last = qname ;
}
if ( flag & 0x100 ) continue ; // secondary alignment
if ( ( flag & 0x4 ) || t [ 2 ] == '*' ) { // unmapped
if ( n _unmapped == null ) n _unmapped = 0 ;
++ n _unmapped ;
continue ;
}
var mapq = parseInt ( t [ 4 ] ) ;
if ( mapq > max _mapq ) mapq = max _mapq ;
var pos = parseInt ( t [ 3 ] ) - 1 , pos _end = pos ;
var n _gap = 0 , mlen = 0 ;
while ( ( m = re _cigar . exec ( t [ 5 ] ) ) != null ) {
var len = parseInt ( m [ 1 ] ) ;
if ( m [ 2 ] == 'M' ) pos _end += len , mlen += len ;
else if ( m [ 2 ] == 'I' ) n _gap += len ;
else if ( m [ 2 ] == 'D' ) n _gap += len , pos _end += len ;
}
var score = pos _end - pos ;
if ( ( m = /\tNM:i:(\d+)/ . exec ( line ) ) != null ) {
var NM = parseInt ( m [ 1 ] ) ;
if ( NM >= n _gap ) score = mlen - ( NM - n _gap ) ;
}
a . push ( [ t [ 2 ] , pos , pos _end , ( flag & 16 ) ? '-' : '+' , mapq , score ] ) ;
}
}
}
if ( last != null ) count _err ( last , a , tot , err , mode ) ;
buf . destroy ( ) ;
file . close ( ) ;
var sum _tot = 0 , sum _err = 0 , q _out = - 1 , sum _tot2 = 0 , sum _err2 = 0 ;
for ( var q = max _mapq ; q >= 0 ; -- q ) {
if ( tot [ q ] == 0 ) continue ;
if ( q _out < 0 || err [ q ] > 0 ) {
if ( q _out >= 0 ) print ( 'Q' , q _out , sum _tot , sum _err , ( sum _err2 / sum _tot2 ) . toFixed ( 9 ) , sum _tot2 ) ;
sum _tot = sum _err = 0 , q _out = q ;
}
sum _tot += tot [ q ] , sum _err += err [ q ] ;
sum _tot2 += tot [ q ] , sum _err2 += err [ q ] ;
}
print ( 'Q' , q _out , sum _tot , sum _err , ( sum _err2 / sum _tot2 ) . toFixed ( 9 ) , sum _tot2 ) ;
if ( n _unmapped != null ) print ( 'U' , n _unmapped ) ;
}
// convert mason2 SAM to FASTQ
function paf _mason2fq ( args )
{
if ( args . length == 0 ) {
print ( "Usage: paftools.js mason2fq <mason.sam>" ) ;
exit ( 1 ) ;
}
function print _se ( a )
{
print ( '@' + a . slice ( 0 , 5 ) . join ( "!" ) + " " + a [ 8 ] ) ;
print ( a [ 5 ] ) ;
print ( "+" ) ;
print ( a [ 6 ] ) ;
}
var buf = new Bytes ( ) , buf2 = new Bytes ( ) ;
var file = new File ( args [ 0 ] ) ;
var re = /(\d+)([MIDSHN])/g ;
var last = null ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
if ( t [ 0 ] . charAt ( 0 ) == '@' ) continue ;
var m , l _ref = 0 ;
while ( ( m = re . exec ( t [ 5 ] ) ) != null )
if ( m [ 2 ] == 'D' || m [ 2 ] == 'M' || m [ 2 ] == 'N' )
l _ref += parseInt ( m [ 1 ] ) ;
var flag = parseInt ( t [ 1 ] ) ;
var rev = ! ! ( flag & 16 ) ;
var seq , qual ;
if ( rev ) {
buf2 . length = 0 ;
buf2 . set ( t [ 9 ] , 0 ) ;
buf2 . revcomp ( ) ;
seq = buf2 . toString ( ) ;
buf2 . set ( t [ 10 ] , 0 ) ;
buf2 . reverse ( ) ;
qual = buf2 . toString ( ) ;
} else seq = t [ 9 ] , qual = t [ 10 ] ;
var qname = t [ 0 ] ;
qname = qname . replace ( /^simulated./ , "" ) ;
var chr = t [ 2 ] ;
var pos = parseInt ( t [ 3 ] ) - 1 ;
var strand = ( flag & 16 ) ? '-' : '+' ;
var read _no = flag & 0xc0 ;
if ( read _no == 0x40 ) read _no = 1 ;
else if ( read _no == 0x80 ) read _no = 2 ;
else read _no = 0 ;
var err = 0 , snp = 0 , indel = 0 ;
for ( var i = 11 ; i < t . length ; ++ i ) {
if ( ( m = /^XE:i:(\d+)/ . exec ( t [ i ] ) ) != null ) err = m [ 1 ] ;
else if ( ( m = /^XS:i:(\d+)/ . exec ( t [ i ] ) ) != null ) snp = m [ 1 ] ;
else if ( ( m = /^XI:i:(\d+)/ . exec ( t [ i ] ) ) != null ) indel = m [ 1 ] ;
}
var comment = [ err , snp , indel ] . join ( ":" ) ;
if ( last == null ) {
last = [ qname , chr , pos , pos + l _ref , strand , seq , qual , read _no , comment ] ;
} else if ( last [ 0 ] != qname ) {
print _se ( last ) ;
last = [ qname , chr , pos , pos + l _ref , strand , seq , qual , read _no , comment ] ;
} else {
if ( read _no == 2 ) { // last[] is the first read
if ( last [ 7 ] != 1 ) throw Error ( "ERROR: can't find read1" ) ;
var name = [ qname , chr , last [ 2 ] + "_" + pos , last [ 3 ] + "_" + ( pos + l _ref ) , last [ 4 ] + strand ] . join ( "!" ) ;
print ( '@' + name + '/1' + ' ' + last [ 8 ] ) ; print ( last [ 5 ] ) ; print ( "+" ) ; print ( last [ 6 ] ) ;
print ( '@' + name + '/2' + ' ' + comment ) ; print ( seq ) ; print ( "+" ) ; print ( qual ) ;
} else {
if ( last [ 7 ] != 2 ) throw Error ( "ERROR: can't find read2" ) ;
var name = [ qname , chr , pos + "_" + last [ 2 ] , ( pos + l _ref ) + "_" + last [ 3 ] , strand + last [ 4 ] ] . join ( "!" ) ;
print ( '@' + name + '/1' + ' ' + comment ) ; print ( seq ) ; print ( "+" ) ; print ( qual ) ;
print ( '@' + name + '/2' + ' ' + last [ 8 ] ) ; print ( last [ 5 ] ) ; print ( "+" ) ; print ( last [ 6 ] ) ;
}
last = null ;
}
}
if ( last != null ) print _se ( last ) ;
file . close ( ) ;
buf . destroy ( ) ;
buf2 . destroy ( ) ;
}
// convert pbsim MAF to FASTQ
function paf _pbsim2fq ( args )
{
if ( args . length < 2 ) {
print ( "Usage: paftools.js pbsim2fq <ref.fa.fai> <pbsim1.maf> [[pbsim2.maf] ...]" ) ;
exit ( 1 ) ;
}
var file , buf = new Bytes ( ) , buf2 = new Bytes ( ) ;
file = new File ( args [ 0 ] ) ;
var chr _list = [ ] ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( /\s+/ ) ;
chr _list . push ( t [ 0 ] ) ;
}
file . close ( ) ;
for ( var k = 1 ; k < args . length ; ++ k ) {
var fn = args [ k ] ;
file = new File ( fn ) ;
var state = 0 , reg ;
while ( file . readline ( buf ) >= 0 ) {
var line = buf . toString ( ) ;
if ( state == 0 && line . charAt ( 0 ) == 'a' ) {
state = 1 ;
} else if ( state == 1 && line . charAt ( 0 ) == 's' ) {
var t = line . split ( /\s+/ ) ;
var st = parseInt ( t [ 2 ] ) ;
reg = [ st , st + parseInt ( t [ 3 ] ) ] ;
state = 2 ;
} else if ( state == 2 && line . charAt ( 0 ) == 's' ) {
var m , t = line . split ( /\s+/ ) ;
if ( ( m = /S(\d+)_\d+/ . exec ( t [ 1 ] ) ) == null ) throw Error ( "Failed to parse the read name" ) ;
var chr _id = parseInt ( m [ 1 ] ) - 1 ;
if ( chr _id >= chr _list . length ) throw Error ( "Index outside the chr list" ) ;
var name = [ t [ 1 ] , chr _list [ chr _id ] , reg [ 0 ] , reg [ 1 ] , t [ 4 ] ] . join ( "!" ) ;
var seq = t [ 6 ] . replace ( /\-/g , "" ) ;
if ( seq . length != parseInt ( t [ 5 ] ) ) throw Error ( "Inconsistent read length" ) ;
if ( seq . indexOf ( "NN" ) < 0 ) {
if ( t [ 4 ] == '-' ) {
buf2 . set ( seq , 0 ) ;
buf2 . length = seq . length ;
buf2 . revcomp ( ) ;
seq = buf2 . toString ( ) ;
}
print ( ">" + name ) ;
print ( seq ) ;
}
state = 0 ;
}
}
file . close ( ) ;
}
buf . destroy ( ) ;
buf2 . destroy ( ) ;
}
2018-02-10 00:07:14 +08:00
function paf _junceval ( args ) // FIXME: the reported number of mapped reads is slightly off
{
var c , l _fuzzy = 0 , print _ovlp = false , print _err _only = false , first _only = false ;
while ( ( c = getopt ( args , "l:ep" ) ) != null ) {
if ( c == 'l' ) l _fuzzy = parseInt ( getopt . arg ) ;
else if ( c == 'e' ) print _err _only = print _ovlp = true ;
else if ( c == 'p' ) print _ovlp = true ;
}
if ( args . length - getopt . ind < 2 ) {
print ( "Usage: paftools.js junceval [options] <gene.gtf> <aln.sam>" ) ;
print ( "Options:" ) ;
print ( " -l INT tolerance of junction positions (0 for exact) [0]" ) ;
print ( " -p print overlapping introns" ) ;
print ( " -e print erroreous overlapping introns" ) ;
exit ( 1 ) ;
}
var file , buf = new Bytes ( ) ;
var tr = { } ;
file = new File ( args [ getopt . ind ] ) ;
while ( file . readline ( buf ) >= 0 ) {
var m , t = buf . toString ( ) . split ( "\t" ) ;
if ( t [ 0 ] . charAt ( 0 ) == '#' ) continue ;
if ( t [ 2 ] != 'exon' ) continue ;
var st = parseInt ( t [ 3 ] ) - 1 ;
var en = parseInt ( t [ 4 ] ) ;
if ( ( m = /transcript_id "(\S+)"/ . exec ( t [ 8 ] ) ) == null ) continue ;
var tid = m [ 1 ] ;
if ( tr [ tid ] == null ) tr [ tid ] = [ t [ 0 ] , t [ 6 ] , 0 , 0 , [ ] ] ;
tr [ tid ] [ 4 ] . push ( [ st , en ] ) ;
}
file . close ( ) ;
var anno = { } ;
for ( var tid in tr ) {
var t = tr [ tid ] ;
Interval . sort ( t [ 4 ] ) ;
t [ 2 ] = t [ 4 ] [ 0 ] [ 0 ] ;
t [ 3 ] = t [ 4 ] [ t [ 4 ] . length - 1 ] [ 1 ] ;
if ( anno [ t [ 0 ] ] == null ) anno [ t [ 0 ] ] = [ ] ;
var s = t [ 4 ] ;
for ( var i = 0 ; i < s . length - 1 ; ++ i ) {
if ( s [ i ] [ 1 ] >= s [ i + 1 ] [ 0 ] )
warn ( "WARNING: incorrect annotation for transcript " + tid + " (" + s [ i ] [ 1 ] + " >= " + s [ i + 1 ] [ 0 ] + ")" )
anno [ t [ 0 ] ] . push ( [ s [ i ] [ 1 ] , s [ i + 1 ] [ 0 ] ] ) ;
}
}
tr = null ;
for ( var chr in anno ) {
var e = anno [ chr ] ;
if ( e . length == 0 ) continue ;
Interval . sort ( e ) ;
var k = 0 ;
for ( var i = 1 ; i < e . length ; ++ i ) // dedup
if ( e [ i ] [ 0 ] != e [ k ] [ 0 ] || e [ i ] [ 1 ] != e [ k ] [ 1 ] )
e [ ++ k ] = e [ i ] . slice ( 0 ) ;
e . length = k + 1 ;
Interval . index _end ( e ) ;
}
var n _pri = 0 , n _unmapped = 0 , n _mapped = 0 ;
var n _sgl = 0 , n _splice = 0 , n _splice _hit = 0 , n _splice _novel = 0 ;
file = new File ( args [ getopt . ind + 1 ] ) ;
var last _qname = null ;
var re _cigar = /(\d+)([MIDNSHX=])/g ;
while ( file . readline ( buf ) >= 0 ) {
var m , t = buf . toString ( ) . split ( "\t" ) ;
if ( t [ 0 ] . charAt ( 0 ) == '@' ) continue ;
var flag = parseInt ( t [ 1 ] ) ;
if ( flag & 0x100 ) continue ;
if ( first _only && last _qname == t [ 0 ] ) continue ;
if ( t [ 2 ] == '*' ) {
++ n _unmapped ;
continue ;
} else {
++ n _pri ;
if ( last _qname != t [ 0 ] ) ++ n _mapped ;
}
var pos = parseInt ( t [ 3 ] ) - 1 , intron = [ ] ;
while ( ( m = re _cigar . exec ( t [ 5 ] ) ) != null ) {
var len = parseInt ( m [ 1 ] ) , op = m [ 2 ] ;
if ( op == 'N' ) {
intron . push ( [ pos , pos + len ] ) ;
pos += len ;
} else if ( op == 'M' || op == 'X' || op == '=' || op == 'D' ) pos += len ;
}
if ( intron . length == 0 ) {
++ n _sgl ;
continue ;
}
n _splice += intron . length ;
var chr = anno [ t [ 2 ] ] ;
if ( chr != null ) {
for ( var i = 0 ; i < intron . length ; ++ i ) {
var o = Interval . find _ovlp ( chr , intron [ i ] [ 0 ] , intron [ i ] [ 1 ] ) ;
if ( o . length > 0 ) {
var hit = false ;
for ( var j = 0 ; j < o . length ; ++ j ) {
var st _diff = intron [ i ] [ 0 ] - o [ j ] [ 0 ] ;
var en _diff = intron [ i ] [ 1 ] - o [ j ] [ 1 ] ;
if ( st _diff < 0 ) st _diff = - st _diff ;
if ( en _diff < 0 ) en _diff = - en _diff ;
if ( st _diff <= l _fuzzy && en _diff <= l _fuzzy )
++ n _splice _hit , hit = true ;
if ( hit ) break ;
}
if ( print _ovlp ) {
var type = hit ? 'C' : 'P' ;
if ( hit && print _err _only ) continue ;
var x = '[' ;
for ( var j = 0 ; j < o . length ; ++ j ) {
if ( j ) x += ', ' ;
x += '(' + o [ j ] [ 0 ] + "," + o [ j ] [ 1 ] + ')' ;
}
x += ']' ;
print ( type , t [ 0 ] , i + 1 , t [ 2 ] , intron [ i ] [ 0 ] , intron [ i ] [ 1 ] , x ) ;
}
} else {
++ n _splice _novel ;
if ( print _ovlp )
print ( 'N' , t [ 0 ] , i + 1 , t [ 2 ] , intron [ i ] [ 0 ] , intron [ i ] [ 1 ] ) ;
}
}
} else {
n _splice _novel += intron . length ;
}
last _qname = t [ 0 ] ;
}
file . close ( ) ;
buf . destroy ( ) ;
if ( ! print _ovlp ) {
print ( "# unmapped reads: " + n _unmapped ) ;
print ( "# mapped reads: " + n _mapped ) ;
print ( "# primary alignments: " + n _pri ) ;
print ( "# singletons: " + n _sgl ) ;
print ( "# predicted introns: " + n _splice ) ;
print ( "# non-overlapping introns: " + n _splice _novel ) ;
print ( "# correct introns: " + n _splice _hit + " (" + ( n _splice _hit / n _splice * 100 ) . toFixed ( 2 ) + "%)" ) ;
}
}
2018-02-09 23:23:30 +08:00
// evaluate overlap sensitivity
2018-02-09 23:00:35 +08:00
function paf _ov _eval ( args )
{
var c , min _ovlp = 2000 , min _frac = 0.95 , min _mapq = 10 ;
while ( ( c = getopt ( args , "q:l:f:" ) ) != null ) {
if ( c == 'q' ) min _mapq = parseInt ( getopt . arg ) ;
else if ( c == 'l' ) min _ovlp = parseInt ( getopt . arg ) ;
else if ( c == 'f' ) min _frac = parseFloat ( getopt . arg ) ;
}
if ( args . length - getopt . ind < 2 ) {
print ( "Usage: sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval [options] - <ovlp.paf>" ) ;
print ( "Options:" ) ;
print ( " -l INT min overlap length [2000]" ) ;
print ( " -q INT min mapping quality [10]" ) ;
print ( " -f FLOAT min fraction of mapped length [0.95]" ) ;
exit ( 1 ) ;
}
var buf = new Bytes ( ) ;
var file = args [ getopt . ind ] == '-' ? new File ( ) : new File ( args [ getopt . ind ] ) ;
var a = [ ] , h = { } ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
var is _pri = false ;
if ( parseInt ( t [ 11 ] ) < min _mapq ) continue ;
for ( var i = 12 ; i < t . length ; ++ i )
if ( t [ i ] == 'tp:A:P' )
is _pri = true ;
if ( ! is _pri ) continue ;
for ( var i = 1 ; i <= 3 ; ++ i )
t [ i ] = parseInt ( t [ i ] ) ;
for ( var i = 6 ; i <= 8 ; ++ i )
t [ i ] = parseInt ( t [ i ] ) ;
if ( t [ 3 ] - t [ 2 ] < min _ovlp || t [ 8 ] - t [ 7 ] < min _ovlp || ( t [ 3 ] - t [ 2 ] ) / t [ 1 ] < min _frac )
continue ;
var ctg = t [ 5 ] , st = t [ 7 ] , en = t [ 8 ] ;
while ( a . length > 0 ) {
if ( a [ 0 ] [ 0 ] == ctg && a [ 0 ] [ 2 ] > st )
break ;
else a . shift ( ) ;
}
for ( var j = 0 ; j < a . length ; ++ j ) {
if ( a [ j ] [ 3 ] == t [ 0 ] ) continue ;
var len = ( en > a [ j ] [ 2 ] ? a [ j ] [ 2 ] : en ) - st ;
if ( len >= min _ovlp ) {
var key = a [ j ] [ 3 ] < t [ 0 ] ? a [ j ] [ 3 ] + "\t" + t [ 0 ] : t [ 0 ] + "\t" + a [ j ] [ 3 ] ;
h [ key ] = len ;
}
}
a . push ( [ ctg , st , en , t [ 0 ] ] ) ;
}
file . close ( ) ;
file = new File ( args [ getopt . ind + 1 ] ) ;
while ( file . readline ( buf ) >= 0 ) {
var t = buf . toString ( ) . split ( "\t" ) ;
var key = t [ 0 ] < t [ 5 ] ? t [ 0 ] + "\t" + t [ 5 ] : t [ 5 ] + "\t" + t [ 0 ] ;
if ( h [ key ] > 0 ) h [ key ] = - h [ key ] ;
}
file . close ( ) ;
buf . destroy ( ) ;
var n _ovlp = 0 , n _missing = 0 ;
for ( var key in h ) {
++ n _ovlp ;
if ( h [ key ] > 0 ) ++ n _missing ;
}
print ( n _ovlp + " overlaps inferred from the reference mapping" ) ;
print ( n _missing + " missed by the read overlapper" ) ;
print ( ( 100 * ( 1 - n _missing / n _ovlp ) ) . toFixed ( 2 ) + "% sensitivity" ) ;
}
/ * * * * * * * * * * * * * * * * * * * * * * * * *
* * * * * main function * * * * *
* * * * * * * * * * * * * * * * * * * * * * * * * /
function main ( args )
{
if ( args . length == 0 ) {
print ( "Usage: paftools.js <command> [arguments]" ) ;
print ( "Commands:" ) ;
2018-02-09 23:23:30 +08:00
print ( " view convert PAF to BLAST-like (for eyeballing) or MAF" ) ;
print ( " splice2bed convert spliced alignment in PAF/SAM to BED12" ) ;
2018-02-10 01:14:28 +08:00
print ( " sam2paf convert SAM to PAF" ) ;
print ( " delta2paf convert MUMmer's delta to PAF" ) ;
2018-02-09 23:47:12 +08:00
print ( " gff2bed convert GTF/GFF3 to BED12" ) ;
2018-02-09 23:23:30 +08:00
print ( "" ) ;
2018-02-09 23:47:12 +08:00
print ( " stat collect basic mapping information in PAF/SAM" ) ;
2018-02-09 23:00:35 +08:00
print ( " call call variants from asm-to-ref alignment with the cs tag" ) ;
print ( "" ) ;
print ( " mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ" ) ;
print ( " mason2fq convert mason2-simulated SAM to FASTQ" ) ;
print ( " pbsim2fq convert PBSIM-simulated MAF to FASTQ" ) ;
2018-02-10 00:07:14 +08:00
print ( " junceval evaluate splice junction consistency with known annotations" ) ;
2018-02-09 23:00:35 +08:00
print ( " ov-eval evaluate read overlap sensitivity using read-to-ref mapping" ) ;
exit ( 1 ) ;
}
var cmd = args . shift ( ) ;
2018-02-09 23:23:30 +08:00
if ( cmd == 'view' ) paf _view ( args ) ;
else if ( cmd == 'sam2paf' ) paf _sam2paf ( args ) ;
2018-02-10 01:14:28 +08:00
else if ( cmd == 'delta2paf' ) paf _delta2paf ( args ) ;
2018-02-09 23:23:30 +08:00
else if ( cmd == 'splice2bed' ) paf _splice2bed ( args ) ;
2018-02-09 23:47:12 +08:00
else if ( cmd == 'gff2bed' ) paf _gff2bed ( args ) ;
else if ( cmd == 'stat' ) paf _stat ( args ) ;
2018-02-09 23:23:30 +08:00
else if ( cmd == 'call' ) paf _call ( args ) ;
2018-02-09 23:00:35 +08:00
else if ( cmd == 'mapeval' ) paf _mapeval ( args ) ;
else if ( cmd == 'mason2fq' ) paf _mason2fq ( args ) ;
else if ( cmd == 'pbsim2fq' ) paf _pbsim2fq ( args ) ;
2018-02-10 00:07:14 +08:00
else if ( cmd == 'junceval' ) paf _junceval ( args ) ;
2018-02-09 23:00:35 +08:00
else if ( cmd == 'ov-eval' ) paf _ov _eval ( args ) ;
else throw Error ( "unrecognized command: " + cmd ) ;
}
main ( arguments ) ;