From 49117819f5f36d553a71240f9acfa6f0d0759432 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 24 Mar 2010 15:21:58 +0000 Subject: [PATCH] For the cleaner to clean, it must beat the entropy produced by the aligner (and not just the raw reads). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3068 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/indels/IndelRealigner.java | 40 +++-- .../sting/utils/AlignmentUtils.java | 144 ++++-------------- 2 files changed, 49 insertions(+), 135 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 170524f5b..90c08f597 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -390,9 +390,8 @@ public class IndelRealigner extends ReadWalker { final ArrayList refReads = new ArrayList(); // reads that perfectly match ref final ArrayList altReads = new ArrayList(); // reads that don't perfectly match final LinkedList altAlignmentsToTest = new LinkedList(); // should we try to make an alt consensus from the read? - final ArrayList leftMovedIndels = new ArrayList(); final Set altConsenses = new LinkedHashSet(); // list of alt consenses - int totalMismatchSum = 0; + long totalAlignerMismatchSum = 0, totalRawMismatchSum = 0; // if there are any known indels for this region, get them for ( VariationRod knownIndel : knownIndelsToTry ) { @@ -419,29 +418,27 @@ public class IndelRealigner extends ReadWalker { final AlignedRead aRead = new AlignedRead(read); - // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); if ( numBlocks == 2 ) { - Cigar newCigar = indelRealignment(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0); - if ( aRead.setCigar(newCigar) ) { - leftMovedIndels.add(aRead); - } + aRead.setCigar(newCigar); } - final int mismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex, Integer.MAX_VALUE); + final int startOnRef = read.getAlignmentStart()-(int)leftmostIndex; + totalAlignerMismatchSum += AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef); + final int rawMismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE); // if ( debugOn ) System.out.println("mismatchScore="+mismatchScore); // if this doesn't match perfectly to the reference, let's try to clean it - if ( mismatchScore > 0 ) { + if ( rawMismatchScore > 0 ) { altReads.add(aRead); if ( !read.getDuplicateReadFlag() ) - totalMismatchSum += mismatchScore; - aRead.setMismatchScoreToReference(mismatchScore); + totalRawMismatchSum += rawMismatchScore; + aRead.setMismatchScoreToReference(rawMismatchScore); // if it has an indel, let's see if that's the best consensus if ( numBlocks == 2 ) { - Consensus c = createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getRead().getReadBases()); + Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getRead().getReadBases()); if ( c == null ) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName()); else altConsenses.add(c); @@ -543,10 +540,13 @@ public class IndelRealigner extends ReadWalker { } } - // if the best alternate consensus has a smaller sum of quality score mismatches (more than - // the LOD threshold), and it didn't just move around the mismatching columns, then clean! - final double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0); - if ( improvement >= LOD_THRESHOLD ) { + // if: + // 1) the best alternate consensus has a smaller sum of quality score mismatches than the aligned version of the reads, + // 2) beats the LOD threshold for the sum of quality score mismatches of the raw version of the reads, + // 3) didn't just move around the mismatching columns (i.e. it actually reduces entropy), + // then clean! + final double improvement = (bestConsensus == null ? -1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0); + if ( improvement >= LOD_THRESHOLD && bestConsensus.mismatchSum <= totalAlignerMismatchSum ) { bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference); @@ -584,7 +584,7 @@ public class IndelRealigner extends ReadWalker { for ( int i = 0; i < length; i++) str.append((char)bestConsensus.str[position+i]); } - str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n"); + str.append("\t" + (((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n"); try { indelOutput.write(str.toString()); indelOutput.flush(); @@ -623,10 +623,8 @@ public class IndelRealigner extends ReadWalker { } else if ( statsOutput != null ) { try { - statsOutput.write(readsToClean.getLocation().toString()); - statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD - statsOutput.write(Double.toString(improvement)); - statsOutput.write("\n"); + statsOutput.write(String.format("%s\tFAIL\t%.1f\t%d%n", + readsToClean.getLocation().toString(), improvement, bestConsensus.mismatchSum - totalAlignerMismatchSum)); statsOutput.flush(); } catch (Exception e) {} } diff --git a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java index e09bfc136..1944e3ce0 100644 --- a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java @@ -5,122 +5,17 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.util.StringUtil; -import net.sf.picard.reference.ReferenceSequence; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.pileup.*; -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Mar 25, 2009 - * Time: 12:15:38 AM - * To change this template use File | Settings | File Templates. - */ public class AlignmentUtils { - - /** Returns number of mismatches in the alignment r to the reference sequence - * refSeq. It is assumed that - * the alignment starts at (1-based) position r.getAlignmentStart() on the specified, and all single-base mismatches - * are counted in the alignment segments where both sequences are present. Insertions/deletions are skipped and do - * not contribute to the error count returned by this method. - * @param r aligned read - * @param refSeq reference sequence - * @return number of single-base mismatches in the aligned segments (gaps on either of the sequences are skipped) - */ - public static int numMismatches(SAMRecord r, ReferenceSequence refSeq) { - byte[] ref = refSeq.getBases(); - if ( r.getReadUnmappedFlag() ) return 1000000; - int i_ref = r.getAlignmentStart()-1; // position on the ref - int i_read = 0; // position on the read - int mm_count = 0; // number of mismatches - Cigar c = r.getCigar(); - for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { - CigarElement ce = c.getCigarElement(k); - switch( ce.getOperator() ) { - case M: - for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { - char refChr = (char)ref[i_ref]; - char readChr = (char)r.getReadBases()[i_read]; - if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || - BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) - continue; // do not count Ns/Xs/etc ? - if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) - mm_count++; - } - break; - case I: - case S: - i_read += ce.getLength(); - break; - case D: - case N: - i_ref += ce.getLength(); - break; - default: throw new RuntimeException("Unrecognized cigar element"); - } - - } - return mm_count; + private static class MismatchCount { + public int numMismatches = 0; + public long mismatchQualities = 0; } - /** - * mhanna - 11 May 2009 - stubbed out competing method that works with partial references. - * Computes number of mismatches in the read alignment to the refence ref - * specified in the record r. Indels are completely ignored by this method: - * only base mismatches in the alignment segments where both sequences are present are counted. - * @param r - * @return - */ - public static int numMismatches(SAMRecord r, char[] ref) { - if ( r.getReadUnmappedFlag() ) return 1000000; - int i_ref = 0; // position on the ref - int i_read = 0; // position on the read - int mm_count = 0; // number of mismatches - Cigar c = r.getCigar(); - for ( int k = 0 ; k < c.numCigarElements() ; k++ ) { - CigarElement ce = c.getCigarElement(k); - switch( ce.getOperator() ) { - case M: - for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) { - char refChr = ref[i_ref]; - char readChr = (char)r.getReadBases()[i_read]; - if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || - BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) - continue; // do not count Ns/Xs/etc ? - if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) - mm_count++; - } - break; - case I: - case S: - i_read += ce.getLength(); - break; - case D: - case N: - i_ref += ce.getLength(); - break; - default: throw new RuntimeException("Unrecognized cigar element: " + ce.getOperator()); - } - - } - return mm_count; - } - - // IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED - // TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO - // PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY... - /** See {@link #numMismatches(SAMRecord, ReferenceSequence)}. This method implements same functionality - * for reference sequence specified as conventional java string (of bases). By default, it is assumed that - * the alignment starts at (1-based) position r.getAlignmentStart() on the reference refSeq. - * See {@link #numMismatches(SAMRecord, byte[], int)} if this is not the case. - */ - public static int numMismatches(SAMRecord r, String refSeq ) { - if ( r.getReadUnmappedFlag() ) return 1000000; - return numMismatches(r, StringUtil.stringToBytes(refSeq), r.getAlignmentStart()-1); - } - /** Returns number of mismatches in the alignment r to the reference sequence * refSeq assuming the alignment starts at (ZERO-based) position refIndex on the * specified reference sequence; in other words, refIndex is used in place of alignment's own @@ -136,8 +31,27 @@ public class AlignmentUtils { * @return the number of mismatches */ public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) { + return getMismatchCount(r, refSeq, refIndex).numMismatches; + } + + public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) { + if ( r.getReadUnmappedFlag() ) return 1000000; + return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex); + } + + public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) { + return getMismatchCount(r, refSeq, refIndex).mismatchQualities; + } + + public static long mismatchingQualities(SAMRecord r, String refSeq, int refIndex ) { + if ( r.getReadUnmappedFlag() ) return 1000000; + return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex); + } + + private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { + MismatchCount mc = new MismatchCount(); + int readIdx = 0; - int mismatches = 0; byte[] readSeq = r.getReadBases(); Cigar c = r.getCigar(); for (int i = 0 ; i < c.numCigarElements() ; i++) { @@ -153,8 +67,10 @@ public class AlignmentUtils { //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) // continue; // do not count Ns/Xs/etc ? - if ( readChr != refChr ) - mismatches++; + if ( readChr != refChr ) { + mc.numMismatches++; + mc.mismatchQualities += r.getBaseQualities()[readIdx]; + } } break; case I: @@ -169,7 +85,7 @@ public class AlignmentUtils { } } - return mismatches; + return mc; } /** Returns the number of mismatches in the pileup within the given reference context. @@ -406,8 +322,8 @@ public class AlignmentUtils { * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and * alignment reference index/start. - * @param r - * @return + * @param r record + * @return true if read is unmapped */ public static boolean isReadUnmapped(final SAMRecord r) { if ( r.getReadUnmappedFlag() ) return true;