From 012a7cf0a5ea3ef38e36021043668a959c85caa3 Mon Sep 17 00:00:00 2001 From: asivache Date: Wed, 4 Aug 2010 15:27:35 +0000 Subject: [PATCH] mismatchCount now has a version that counts mismatches only along a part of the read (takes additional args start_on_read and length_on_read to specify the read's subsequence to be interrogated); isMateUnmapped() convenience shortcut method added. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3931 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/sam/AlignmentUtils.java | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 3bd3b5350..754a66029 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -64,6 +64,24 @@ public class AlignmentUtils { return getMismatchCount(r, refSeq, refIndex).numMismatches; } + /** Same as #numMismatches(SAMRecord, byte[], refIndex), but counts mismatches only along the partial stretch + * on the read of length nReadBases starting at (0-based) position readIndex. + * @param r Aligned read to count mismatches for + * @param refSeq Chunk of reference sequence that subsumes the alignment + * @param refIndex Zero-based position on refSeq where the alighnment for the whole read starts + * @param readIndex Zero-based position on the read, the mismatches will be counted only from this position on + * @param nReadBases Length of continuous stretch on the read, along which mismatches will be counted + * @return + */ + public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) { + if ( r.getReadUnmappedFlag() ) return 1000000; + return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).numMismatches; + } + + public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) { + return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).mismatchQualities; + } + @Deprecated public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) { if ( r.getReadUnmappedFlag() ) return 1000000; @@ -80,21 +98,31 @@ public class AlignmentUtils { return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex); } + private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { + return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength()); + } + // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single // todo -- high performance implementation. We can do a lot better than this right now - private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { + private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { MismatchCount mc = new MismatchCount(); int readIdx = 0; + int endOnRead = startOnRead + nReadBases - 1; // index of the last base on read we want to count byte[] readSeq = r.getReadBases(); Cigar c = r.getCigar(); for (int i = 0 ; i < c.numCigarElements() ; i++) { + + if ( readIdx > endOnRead ) break; + CigarElement ce = c.getCigarElement(i); switch ( ce.getOperator() ) { case M: for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) { if ( refIndex >= refSeq.length ) continue; + if ( readIdx < startOnRead ) continue; + if ( readIdx > endOnRead ) break; byte refChr = refSeq[refIndex]; byte readChr = readSeq[readIdx]; // Note: we need to count X/N's as mismatches because that's what SAM requires @@ -337,6 +365,28 @@ public class AlignmentUtils { return true; } + /** + * Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format + * specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and + * alignment reference index/start of the mate. + * @param r sam record for the read + * @return true if read's mate is unmapped + */ + public static boolean isMateUnmapped(final SAMRecord r) { + if ( r.getMateUnmappedFlag() ) return true; + + // our life would be so much easier if all sam files followed the specs. In reality, + // sam files (including those generated by maq or bwa) miss headers alltogether. When + // reading such a SAM file, reference name is set, but since there is no sequence dictionary, + // null is always returned for referenceIndex. Let's be paranoid here, and make sure that + // we do not call the read "unmapped" when it has only reference name set with ref. index missing + // or vice versa. + if ( ( r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX + || r.getMateReferenceName() != null && r.getMateReferenceName() != SAMRecord.NO_ALIGNMENT_REFERENCE_NAME ) + && r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) return false ; + return true; + } + /** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array