sam2paf cleanup
This commit is contained in:
parent
19e05a099d
commit
dc61301d9f
|
|
@ -1031,8 +1031,12 @@ function paf_sam2paf(args)
|
|||
var c, pri_only = false;
|
||||
while ((c = getopt(args, "p")) != null)
|
||||
if (c == 'p') pri_only = true;
|
||||
if (args.length == getopt.ind) {
|
||||
print("Usage: paftools.js sam2paf [-p] <in.sam>");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
var file = args.length == getopt.ind || args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
|
||||
var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
|
||||
var buf = new Bytes();
|
||||
var re = /(\d+)([MIDSHNX=])/g;
|
||||
|
||||
|
|
@ -1050,31 +1054,40 @@ function paf_sam2paf(args)
|
|||
}
|
||||
var t = line.split("\t");
|
||||
var flag = parseInt(t[1]);
|
||||
if (t[9] != '*' && t[10] != '*' && t[9].length != t[10].length) throw Error("ERROR at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length);
|
||||
if (t[2] == '*' || (flag&4)) continue;
|
||||
if (t[9] != '*' && t[10] != '*' && t[9].length != t[10].length)
|
||||
throw Error("ERROR at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length);
|
||||
if (t[2] == '*' || (flag&4) || t[5] == '*') continue;
|
||||
if (pri_only && (flag&0x100)) continue;
|
||||
var tlen = len[t[2]];
|
||||
if (tlen == null) throw Error("ERROR at line " + lineno + ": can't find the length of contig " + t[2]);
|
||||
var nn = (m = /\tnn:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
|
||||
var NM = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : null;
|
||||
var have_NM = NM == null? false : true;
|
||||
NM += nn;
|
||||
var clip = [0, 0], I = [0, 0], D = [0, 0], M = 0, N = 0, ql = 0, tl = 0, mm = 0, ext_cigar = false;
|
||||
// find tags
|
||||
var nn = 0, NM = null, MD = null;
|
||||
for (var i = 11; i < t.length; ++i) {
|
||||
if (t[i].indexOf("NM:i:") == 0) NM = parseInt(t[i].substr(5));
|
||||
else if (t[i].indexOf("nn:i:") == 0) nn = parseInt(t[i].substr(5));
|
||||
else if (t[i].indexOf("MD:Z:") == 0) MD = t[i].substr(5);
|
||||
}
|
||||
// infer various lengths from CIGAR
|
||||
var clip = [0, 0], soft_clip = 0, I = [0, 0], D = [0, 0], M = 0, N = 0, mm = 0, have_M = false, have_ext = false;
|
||||
while ((m = re.exec(t[5])) != null) {
|
||||
var l = parseInt(m[1]);
|
||||
if (m[2] == 'M') M += l, ql += l, tl += l, ext_cigar = false;
|
||||
else if (m[2] == 'I') ++I[0], I[1] += l, ql += l;
|
||||
else if (m[2] == 'D') ++D[0], D[1] += l, tl += l;
|
||||
else if (m[2] == 'N') N += l, tl += l;
|
||||
else if (m[2] == 'S') clip[M == 0? 0 : 1] = l, ql += l;
|
||||
else if (m[2] == 'H') clip[M == 0? 0 : 1] = l;
|
||||
else if (m[2] == '=') M += l, ql += l, tl += l, ext_cigar = true;
|
||||
else if (m[2] == 'X') M += l, ql += l, tl += l, mm += l, ext_cigar = true;
|
||||
if (m[2] == 'M') M += l, have_M = true;
|
||||
else if (m[2] == 'I') ++I[0], I[1] += l;
|
||||
else if (m[2] == 'D') ++D[0], D[1] += l;
|
||||
else if (m[2] == 'N') N += l;
|
||||
else if (m[2] == 'S') clip[n_cigar == 0? 0 : 1] = l, soft_clip += l;
|
||||
else if (m[2] == 'H') clip[n_cigar == 0? 0 : 1] = l;
|
||||
else if (m[2] == '=') M += l, have_ext = true;
|
||||
else if (m[2] == 'X') M += l, mm += l, have_ext = true;
|
||||
++n_cigar;
|
||||
}
|
||||
var ql = M + I[1] + soft_clip;
|
||||
var tl = M + D[1] + N;
|
||||
var ts = parseInt(t[3]) - 1, te = ts + tl;
|
||||
// checking coordinate and length consistencies
|
||||
if (n_cigar > 65535)
|
||||
warn("WARNING at line " + lineno + ": " + n_cigar + " CIGAR operations");
|
||||
if (tl + parseInt(t[3]) - 1 > tlen) {
|
||||
if (te > tlen) {
|
||||
warn("WARNING at line " + lineno + ": alignment end position larger than ref length; skipped");
|
||||
continue;
|
||||
}
|
||||
|
|
@ -1082,24 +1095,38 @@ function paf_sam2paf(args)
|
|||
warn("WARNING at line " + lineno + ": SEQ length inconsistent with CIGAR (" + t[9].length + " != " + ql + "); skipped");
|
||||
continue;
|
||||
}
|
||||
if (!have_NM || ext_cigar) NM = I[1] + D[1] + mm;
|
||||
if (NM < I[1] + D[1] + mm) {
|
||||
warn("WARNING at line " + lineno + ": NM is less than the total number of gaps (" + NM + " < " + (I[1]+D[1]+mm) + ")");
|
||||
// compute matching length, block length and calibrate NM
|
||||
if (have_ext && !have_M) { // extended CIGAR
|
||||
if (NM != null && NM != I[1] + D[1] + mm)
|
||||
warn("WARNING at line " + lineno + ": NM is different from sum of gaps and mismatches");
|
||||
NM = I[1] + D[1] + mm;
|
||||
} else if (NM != null) { // standard CIGAR; NM present
|
||||
if (NM < I[1] + D[1]) {
|
||||
warn("WARNING at line " + lineno + ": NM is less than the total number of gaps (" + NM + " < " + (I[1]+D[1]) + ")");
|
||||
NM = I[1] + D[1];
|
||||
}
|
||||
mm = NM - (I[1] + D[1]);
|
||||
} else { // no way to compute mm
|
||||
warn("WARNING at line " + lineno + ": unable to find the number of mismatches; assuming zero");
|
||||
mm = 0;
|
||||
}
|
||||
var extra = ["mm:i:"+(NM-I[1]-D[1]), "io:i:"+I[0], "in:i:"+I[1], "do:i:"+D[0], "dn:i:"+D[1]];
|
||||
var match = M - (NM - I[1] - D[1]);
|
||||
var mlen = M - mm;
|
||||
var blen = M + I[1] + D[1];
|
||||
// find query name, start and end
|
||||
var qlen = M + I[1] + clip[0] + clip[1];
|
||||
var qs, qe;
|
||||
if (flag&16) qs = clip[1], qe = qlen - clip[0];
|
||||
else qs = clip[0], qe = qlen - clip[1];
|
||||
var ts = parseInt(t[3]) - 1, te = ts + M + D[1] + N;
|
||||
var qname = t[0];
|
||||
var qname = t[0], qs, qe;
|
||||
if ((flag&1) && (flag&0x40)) qname += '/1';
|
||||
if ((flag&1) && (flag&0x80)) qname += '/2';
|
||||
var a = [qname, qlen, qs, qe, flag&16? '-' : '+', t[2], tlen, ts, te, match, blen, t[4]];
|
||||
print(a.join("\t"), extra.join("\t"));
|
||||
if (flag&16) qs = clip[1], qe = qlen - clip[0];
|
||||
else qs = clip[0], qe = qlen - clip[1];
|
||||
// optional tags
|
||||
var type = flag&0x100? 'S' : 'P';
|
||||
var tags = ["tp:A:" + type];
|
||||
if (NM != null) tags.push("mm:i:"+mm);
|
||||
tags.push("gn:i:"+(I[1]+D[1]), "go:i:"+(I[0]+D[0]), "cg:Z:" + t[5].replace(/\d+[SH]/g, ''));
|
||||
// print out
|
||||
var a = [qname, qlen, qs, qe, flag&16? '-' : '+', t[2], tlen, ts, te, mlen, blen, t[4]];
|
||||
print(a.join("\t"), tags.join("\t"));
|
||||
}
|
||||
|
||||
buf.destroy();
|
||||
|
|
|
|||
Loading…
Reference in New Issue