For the cleaner to clean, it must beat the entropy produced by the aligner (and not just the raw reads).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3068 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-03-24 15:21:58 +00:00
parent 60dfba997b
commit 49117819f5
2 changed files with 49 additions and 135 deletions

View File

@ -390,9 +390,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
final ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>(); // reads that perfectly match ref
final ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>(); // reads that don't perfectly match
final LinkedList<AlignedRead> altAlignmentsToTest = new LinkedList<AlignedRead>(); // should we try to make an alt consensus from the read?
final ArrayList<AlignedRead> leftMovedIndels = new ArrayList<AlignedRead>();
final Set<Consensus> altConsenses = new LinkedHashSet<Consensus>(); // list of alt consenses
int totalMismatchSum = 0;
long totalAlignerMismatchSum = 0, totalRawMismatchSum = 0;
// if there are any known indels for this region, get them
for ( VariationRod knownIndel : knownIndelsToTry ) {
@ -419,29 +418,27 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
final AlignedRead aRead = new AlignedRead(read);
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
if ( numBlocks == 2 ) {
Cigar newCigar = indelRealignment(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
if ( aRead.setCigar(newCigar) ) {
leftMovedIndels.add(aRead);
}
aRead.setCigar(newCigar);
}
final int mismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex, Integer.MAX_VALUE);
final int startOnRef = read.getAlignmentStart()-(int)leftmostIndex;
totalAlignerMismatchSum += AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef);
final int rawMismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE);
// if ( debugOn ) System.out.println("mismatchScore="+mismatchScore);
// if this doesn't match perfectly to the reference, let's try to clean it
if ( mismatchScore > 0 ) {
if ( rawMismatchScore > 0 ) {
altReads.add(aRead);
if ( !read.getDuplicateReadFlag() )
totalMismatchSum += mismatchScore;
aRead.setMismatchScoreToReference(mismatchScore);
totalRawMismatchSum += rawMismatchScore;
aRead.setMismatchScoreToReference(rawMismatchScore);
// if it has an indel, let's see if that's the best consensus
if ( numBlocks == 2 ) {
Consensus c = createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getRead().getReadBases());
Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getRead().getReadBases());
if ( c == null ) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName());
else
altConsenses.add(c);
@ -543,10 +540,13 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
}
}
// if the best alternate consensus has a smaller sum of quality score mismatches (more than
// the LOD threshold), and it didn't just move around the mismatching columns, then clean!
final double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0);
if ( improvement >= LOD_THRESHOLD ) {
// if:
// 1) the best alternate consensus has a smaller sum of quality score mismatches than the aligned version of the reads,
// 2) beats the LOD threshold for the sum of quality score mismatches of the raw version of the reads,
// 3) didn't just move around the mismatching columns (i.e. it actually reduces entropy),
// then clean!
final double improvement = (bestConsensus == null ? -1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0);
if ( improvement >= LOD_THRESHOLD && bestConsensus.mismatchSum <= totalAlignerMismatchSum ) {
bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);
@ -584,7 +584,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
for ( int i = 0; i < length; i++)
str.append((char)bestConsensus.str[position+i]);
}
str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
str.append("\t" + (((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
try {
indelOutput.write(str.toString());
indelOutput.flush();
@ -623,10 +623,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
} else if ( statsOutput != null ) {
try {
statsOutput.write(readsToClean.getLocation().toString());
statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD
statsOutput.write(Double.toString(improvement));
statsOutput.write("\n");
statsOutput.write(String.format("%s\tFAIL\t%.1f\t%d%n",
readsToClean.getLocation().toString(), improvement, bestConsensus.mismatchSum - totalAlignerMismatchSum));
statsOutput.flush();
} catch (Exception e) {}
}

View File

@ -5,122 +5,17 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.util.StringUtil;
import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.pileup.*;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Mar 25, 2009
* Time: 12:15:38 AM
* To change this template use File | Settings | File Templates.
*/
public class AlignmentUtils {
/** Returns number of mismatches in the alignment <code>r</code> to the reference sequence
* <code>refSeq</code>. It is assumed that
* the alignment starts at (1-based) position r.getAlignmentStart() on the specified, and all single-base mismatches
* are counted in the alignment segments where both sequences are present. Insertions/deletions are skipped and do
* not contribute to the error count returned by this method.
* @param r aligned read
* @param refSeq reference sequence
* @return number of single-base mismatches in the aligned segments (gaps on either of the sequences are skipped)
*/
public static int numMismatches(SAMRecord r, ReferenceSequence refSeq) {
byte[] ref = refSeq.getBases();
if ( r.getReadUnmappedFlag() ) return 1000000;
int i_ref = r.getAlignmentStart()-1; // position on the ref
int i_read = 0; // position on the read
int mm_count = 0; // number of mismatches
Cigar c = r.getCigar();
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
CigarElement ce = c.getCigarElement(k);
switch( ce.getOperator() ) {
case M:
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
char refChr = (char)ref[i_ref];
char readChr = (char)r.getReadBases()[i_read];
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
continue; // do not count Ns/Xs/etc ?
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
mm_count++;
}
break;
case I:
case S:
i_read += ce.getLength();
break;
case D:
case N:
i_ref += ce.getLength();
break;
default: throw new RuntimeException("Unrecognized cigar element");
}
}
return mm_count;
private static class MismatchCount {
public int numMismatches = 0;
public long mismatchQualities = 0;
}
/**
* mhanna - 11 May 2009 - stubbed out competing method that works with partial references.
* Computes number of mismatches in the read alignment to the refence <code>ref</code>
* specified in the record <code>r</code>. Indels are completely <i>ignored</i> by this method:
* only base mismatches in the alignment segments where both sequences are present are counted.
* @param r
* @return
*/
public static int numMismatches(SAMRecord r, char[] ref) {
if ( r.getReadUnmappedFlag() ) return 1000000;
int i_ref = 0; // position on the ref
int i_read = 0; // position on the read
int mm_count = 0; // number of mismatches
Cigar c = r.getCigar();
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
CigarElement ce = c.getCigarElement(k);
switch( ce.getOperator() ) {
case M:
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
char refChr = ref[i_ref];
char readChr = (char)r.getReadBases()[i_read];
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
continue; // do not count Ns/Xs/etc ?
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
mm_count++;
}
break;
case I:
case S:
i_read += ce.getLength();
break;
case D:
case N:
i_ref += ce.getLength();
break;
default: throw new RuntimeException("Unrecognized cigar element: " + ce.getOperator());
}
}
return mm_count;
}
// IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED
// TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO
// PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY...
/** See {@link #numMismatches(SAMRecord, ReferenceSequence)}. This method implements same functionality
* for reference sequence specified as conventional java string (of bases). By default, it is assumed that
* the alignment starts at (1-based) position r.getAlignmentStart() on the reference <code>refSeq</code>.
* See {@link #numMismatches(SAMRecord, byte[], int)} if this is not the case.
*/
public static int numMismatches(SAMRecord r, String refSeq ) {
if ( r.getReadUnmappedFlag() ) return 1000000;
return numMismatches(r, StringUtil.stringToBytes(refSeq), r.getAlignmentStart()-1);
}
/** Returns number of mismatches in the alignment <code>r</code> to the reference sequence
* <code>refSeq</code> assuming the alignment starts at (ZERO-based) position <code>refIndex</code> on the
* specified reference sequence; in other words, <code>refIndex</code> is used in place of alignment's own
@ -136,8 +31,27 @@ public class AlignmentUtils {
* @return the number of mismatches
*/
public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) {
return getMismatchCount(r, refSeq, refIndex).numMismatches;
}
public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) {
if ( r.getReadUnmappedFlag() ) return 1000000;
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
}
public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) {
return getMismatchCount(r, refSeq, refIndex).mismatchQualities;
}
public static long mismatchingQualities(SAMRecord r, String refSeq, int refIndex ) {
if ( r.getReadUnmappedFlag() ) return 1000000;
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
}
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
MismatchCount mc = new MismatchCount();
int readIdx = 0;
int mismatches = 0;
byte[] readSeq = r.getReadBases();
Cigar c = r.getCigar();
for (int i = 0 ; i < c.numCigarElements() ; i++) {
@ -153,8 +67,10 @@ public class AlignmentUtils {
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
// continue; // do not count Ns/Xs/etc ?
if ( readChr != refChr )
mismatches++;
if ( readChr != refChr ) {
mc.numMismatches++;
mc.mismatchQualities += r.getBaseQualities()[readIdx];
}
}
break;
case I:
@ -169,7 +85,7 @@ public class AlignmentUtils {
}
}
return mismatches;
return mc;
}
/** Returns the number of mismatches in the pileup within the given reference context.
@ -406,8 +322,8 @@ public class AlignmentUtils {
* Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
* specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
* alignment reference index/start.
* @param r
* @return
* @param r record
* @return true if read is unmapped
*/
public static boolean isReadUnmapped(final SAMRecord r) {
if ( r.getReadUnmappedFlag() ) return true;