mismatchCount now has a version that counts mismatches only along a part of the read (takes additional args start_on_read and length_on_read to specify the read's subsequence to be interrogated);
isMateUnmapped() convenience shortcut method added. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3931 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e6e8a20a1e
commit
012a7cf0a5
|
|
@ -64,6 +64,24 @@ public class AlignmentUtils {
|
||||||
return getMismatchCount(r, refSeq, refIndex).numMismatches;
|
return getMismatchCount(r, refSeq, refIndex).numMismatches;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Same as #numMismatches(SAMRecord, byte[], refIndex), but counts mismatches only along the partial stretch
|
||||||
|
* on the read of length <code>nReadBases</code> starting at (0-based) position <code>readIndex</code>.
|
||||||
|
* @param r Aligned read to count mismatches for
|
||||||
|
* @param refSeq Chunk of reference sequence that subsumes the alignment
|
||||||
|
* @param refIndex Zero-based position on <code>refSeq</code> where the alighnment for the whole read starts
|
||||||
|
* @param readIndex Zero-based position on the read, the mismatches will be counted only from this position on
|
||||||
|
* @param nReadBases Length of continuous stretch on the read, along which mismatches will be counted
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) {
|
||||||
|
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||||
|
return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).numMismatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) {
|
||||||
|
return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).mismatchQualities;
|
||||||
|
}
|
||||||
|
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) {
|
public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) {
|
||||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||||
|
|
@ -80,21 +98,31 @@ public class AlignmentUtils {
|
||||||
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||||
|
return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength());
|
||||||
|
}
|
||||||
|
|
||||||
// todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
|
// todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
|
||||||
// todo -- high performance implementation. We can do a lot better than this right now
|
// todo -- high performance implementation. We can do a lot better than this right now
|
||||||
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
|
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
|
||||||
MismatchCount mc = new MismatchCount();
|
MismatchCount mc = new MismatchCount();
|
||||||
|
|
||||||
int readIdx = 0;
|
int readIdx = 0;
|
||||||
|
int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count
|
||||||
byte[] readSeq = r.getReadBases();
|
byte[] readSeq = r.getReadBases();
|
||||||
Cigar c = r.getCigar();
|
Cigar c = r.getCigar();
|
||||||
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||||
|
|
||||||
|
if ( readIdx > endOnRead ) break;
|
||||||
|
|
||||||
CigarElement ce = c.getCigarElement(i);
|
CigarElement ce = c.getCigarElement(i);
|
||||||
switch ( ce.getOperator() ) {
|
switch ( ce.getOperator() ) {
|
||||||
case M:
|
case M:
|
||||||
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) {
|
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) {
|
||||||
if ( refIndex >= refSeq.length )
|
if ( refIndex >= refSeq.length )
|
||||||
continue;
|
continue;
|
||||||
|
if ( readIdx < startOnRead ) continue;
|
||||||
|
if ( readIdx > endOnRead ) break;
|
||||||
byte refChr = refSeq[refIndex];
|
byte refChr = refSeq[refIndex];
|
||||||
byte readChr = readSeq[readIdx];
|
byte readChr = readSeq[readIdx];
|
||||||
// Note: we need to count X/N's as mismatches because that's what SAM requires
|
// Note: we need to count X/N's as mismatches because that's what SAM requires
|
||||||
|
|
@ -337,6 +365,28 @@ public class AlignmentUtils {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format
|
||||||
|
* specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and
|
||||||
|
* alignment reference index/start of the mate.
|
||||||
|
* @param r sam record for the read
|
||||||
|
* @return true if read's mate is unmapped
|
||||||
|
*/
|
||||||
|
public static boolean isMateUnmapped(final SAMRecord r) {
|
||||||
|
if ( r.getMateUnmappedFlag() ) return true;
|
||||||
|
|
||||||
|
// our life would be so much easier if all sam files followed the specs. In reality,
|
||||||
|
// sam files (including those generated by maq or bwa) miss headers alltogether. When
|
||||||
|
// reading such a SAM file, reference name is set, but since there is no sequence dictionary,
|
||||||
|
// null is always returned for referenceIndex. Let's be paranoid here, and make sure that
|
||||||
|
// we do not call the read "unmapped" when it has only reference name set with ref. index missing
|
||||||
|
// or vice versa.
|
||||||
|
if ( ( r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
|
||||||
|
|| r.getMateReferenceName() != null && r.getMateReferenceName() != SAMRecord.NO_ALIGNMENT_REFERENCE_NAME )
|
||||||
|
&& r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from
|
/** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from
|
||||||
* cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
|
* cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
|
||||||
* qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
|
* qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue