For the cleaner to clean, it must beat the entropy produced by the aligner (and not just the raw reads).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3068 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
60dfba997b
commit
49117819f5
|
|
@ -390,9 +390,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
final ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>(); // reads that perfectly match ref
|
||||
final ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>(); // reads that don't perfectly match
|
||||
final LinkedList<AlignedRead> altAlignmentsToTest = new LinkedList<AlignedRead>(); // should we try to make an alt consensus from the read?
|
||||
final ArrayList<AlignedRead> leftMovedIndels = new ArrayList<AlignedRead>();
|
||||
final Set<Consensus> altConsenses = new LinkedHashSet<Consensus>(); // list of alt consenses
|
||||
int totalMismatchSum = 0;
|
||||
long totalAlignerMismatchSum = 0, totalRawMismatchSum = 0;
|
||||
|
||||
// if there are any known indels for this region, get them
|
||||
for ( VariationRod knownIndel : knownIndelsToTry ) {
|
||||
|
|
@ -419,29 +418,27 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
final AlignedRead aRead = new AlignedRead(read);
|
||||
|
||||
|
||||
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
|
||||
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
|
||||
if ( numBlocks == 2 ) {
|
||||
|
||||
Cigar newCigar = indelRealignment(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
|
||||
if ( aRead.setCigar(newCigar) ) {
|
||||
leftMovedIndels.add(aRead);
|
||||
}
|
||||
aRead.setCigar(newCigar);
|
||||
}
|
||||
|
||||
final int mismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex, Integer.MAX_VALUE);
|
||||
final int startOnRef = read.getAlignmentStart()-(int)leftmostIndex;
|
||||
totalAlignerMismatchSum += AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef);
|
||||
final int rawMismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE);
|
||||
// if ( debugOn ) System.out.println("mismatchScore="+mismatchScore);
|
||||
|
||||
// if this doesn't match perfectly to the reference, let's try to clean it
|
||||
if ( mismatchScore > 0 ) {
|
||||
if ( rawMismatchScore > 0 ) {
|
||||
altReads.add(aRead);
|
||||
if ( !read.getDuplicateReadFlag() )
|
||||
totalMismatchSum += mismatchScore;
|
||||
aRead.setMismatchScoreToReference(mismatchScore);
|
||||
totalRawMismatchSum += rawMismatchScore;
|
||||
aRead.setMismatchScoreToReference(rawMismatchScore);
|
||||
// if it has an indel, let's see if that's the best consensus
|
||||
if ( numBlocks == 2 ) {
|
||||
Consensus c = createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getRead().getReadBases());
|
||||
Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getRead().getReadBases());
|
||||
if ( c == null ) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName());
|
||||
else
|
||||
altConsenses.add(c);
|
||||
|
|
@ -543,10 +540,13 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
// if the best alternate consensus has a smaller sum of quality score mismatches (more than
|
||||
// the LOD threshold), and it didn't just move around the mismatching columns, then clean!
|
||||
final double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0);
|
||||
if ( improvement >= LOD_THRESHOLD ) {
|
||||
// if:
|
||||
// 1) the best alternate consensus has a smaller sum of quality score mismatches than the aligned version of the reads,
|
||||
// 2) beats the LOD threshold for the sum of quality score mismatches of the raw version of the reads,
|
||||
// 3) didn't just move around the mismatching columns (i.e. it actually reduces entropy),
|
||||
// then clean!
|
||||
final double improvement = (bestConsensus == null ? -1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0);
|
||||
if ( improvement >= LOD_THRESHOLD && bestConsensus.mismatchSum <= totalAlignerMismatchSum ) {
|
||||
|
||||
bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);
|
||||
|
||||
|
|
@ -584,7 +584,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
for ( int i = 0; i < length; i++)
|
||||
str.append((char)bestConsensus.str[position+i]);
|
||||
}
|
||||
str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
|
||||
str.append("\t" + (((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
|
||||
try {
|
||||
indelOutput.write(str.toString());
|
||||
indelOutput.flush();
|
||||
|
|
@ -623,10 +623,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
} else if ( statsOutput != null ) {
|
||||
try {
|
||||
statsOutput.write(readsToClean.getLocation().toString());
|
||||
statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD
|
||||
statsOutput.write(Double.toString(improvement));
|
||||
statsOutput.write("\n");
|
||||
statsOutput.write(String.format("%s\tFAIL\t%.1f\t%d%n",
|
||||
readsToClean.getLocation().toString(), improvement, bestConsensus.mismatchSum - totalAlignerMismatchSum));
|
||||
statsOutput.flush();
|
||||
} catch (Exception e) {}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,122 +5,17 @@ import net.sf.samtools.SAMRecord;
|
|||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Mar 25, 2009
|
||||
* Time: 12:15:38 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class AlignmentUtils {
|
||||
|
||||
|
||||
/** Returns number of mismatches in the alignment <code>r</code> to the reference sequence
|
||||
* <code>refSeq</code>. It is assumed that
|
||||
* the alignment starts at (1-based) position r.getAlignmentStart() on the specified, and all single-base mismatches
|
||||
* are counted in the alignment segments where both sequences are present. Insertions/deletions are skipped and do
|
||||
* not contribute to the error count returned by this method.
|
||||
* @param r aligned read
|
||||
* @param refSeq reference sequence
|
||||
* @return number of single-base mismatches in the aligned segments (gaps on either of the sequences are skipped)
|
||||
*/
|
||||
public static int numMismatches(SAMRecord r, ReferenceSequence refSeq) {
|
||||
byte[] ref = refSeq.getBases();
|
||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||
int i_ref = r.getAlignmentStart()-1; // position on the ref
|
||||
int i_read = 0; // position on the read
|
||||
int mm_count = 0; // number of mismatches
|
||||
Cigar c = r.getCigar();
|
||||
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
|
||||
CigarElement ce = c.getCigarElement(k);
|
||||
switch( ce.getOperator() ) {
|
||||
case M:
|
||||
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
|
||||
char refChr = (char)ref[i_ref];
|
||||
char readChr = (char)r.getReadBases()[i_read];
|
||||
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
||||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
||||
continue; // do not count Ns/Xs/etc ?
|
||||
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
|
||||
mm_count++;
|
||||
}
|
||||
break;
|
||||
case I:
|
||||
case S:
|
||||
i_read += ce.getLength();
|
||||
break;
|
||||
case D:
|
||||
case N:
|
||||
i_ref += ce.getLength();
|
||||
break;
|
||||
default: throw new RuntimeException("Unrecognized cigar element");
|
||||
}
|
||||
|
||||
}
|
||||
return mm_count;
|
||||
private static class MismatchCount {
|
||||
public int numMismatches = 0;
|
||||
public long mismatchQualities = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* mhanna - 11 May 2009 - stubbed out competing method that works with partial references.
|
||||
* Computes number of mismatches in the read alignment to the refence <code>ref</code>
|
||||
* specified in the record <code>r</code>. Indels are completely <i>ignored</i> by this method:
|
||||
* only base mismatches in the alignment segments where both sequences are present are counted.
|
||||
* @param r
|
||||
* @return
|
||||
*/
|
||||
public static int numMismatches(SAMRecord r, char[] ref) {
|
||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||
int i_ref = 0; // position on the ref
|
||||
int i_read = 0; // position on the read
|
||||
int mm_count = 0; // number of mismatches
|
||||
Cigar c = r.getCigar();
|
||||
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
|
||||
CigarElement ce = c.getCigarElement(k);
|
||||
switch( ce.getOperator() ) {
|
||||
case M:
|
||||
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
|
||||
char refChr = ref[i_ref];
|
||||
char readChr = (char)r.getReadBases()[i_read];
|
||||
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
||||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
||||
continue; // do not count Ns/Xs/etc ?
|
||||
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
|
||||
mm_count++;
|
||||
}
|
||||
break;
|
||||
case I:
|
||||
case S:
|
||||
i_read += ce.getLength();
|
||||
break;
|
||||
case D:
|
||||
case N:
|
||||
i_ref += ce.getLength();
|
||||
break;
|
||||
default: throw new RuntimeException("Unrecognized cigar element: " + ce.getOperator());
|
||||
}
|
||||
|
||||
}
|
||||
return mm_count;
|
||||
}
|
||||
|
||||
// IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED
|
||||
// TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO
|
||||
// PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY...
|
||||
/** See {@link #numMismatches(SAMRecord, ReferenceSequence)}. This method implements same functionality
|
||||
* for reference sequence specified as conventional java string (of bases). By default, it is assumed that
|
||||
* the alignment starts at (1-based) position r.getAlignmentStart() on the reference <code>refSeq</code>.
|
||||
* See {@link #numMismatches(SAMRecord, byte[], int)} if this is not the case.
|
||||
*/
|
||||
public static int numMismatches(SAMRecord r, String refSeq ) {
|
||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||
return numMismatches(r, StringUtil.stringToBytes(refSeq), r.getAlignmentStart()-1);
|
||||
}
|
||||
|
||||
/** Returns number of mismatches in the alignment <code>r</code> to the reference sequence
|
||||
* <code>refSeq</code> assuming the alignment starts at (ZERO-based) position <code>refIndex</code> on the
|
||||
* specified reference sequence; in other words, <code>refIndex</code> is used in place of alignment's own
|
||||
|
|
@ -136,8 +31,27 @@ public class AlignmentUtils {
|
|||
* @return the number of mismatches
|
||||
*/
|
||||
public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||
return getMismatchCount(r, refSeq, refIndex).numMismatches;
|
||||
}
|
||||
|
||||
public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) {
|
||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
||||
}
|
||||
|
||||
public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||
return getMismatchCount(r, refSeq, refIndex).mismatchQualities;
|
||||
}
|
||||
|
||||
public static long mismatchingQualities(SAMRecord r, String refSeq, int refIndex ) {
|
||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
||||
}
|
||||
|
||||
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||
MismatchCount mc = new MismatchCount();
|
||||
|
||||
int readIdx = 0;
|
||||
int mismatches = 0;
|
||||
byte[] readSeq = r.getReadBases();
|
||||
Cigar c = r.getCigar();
|
||||
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||
|
|
@ -153,8 +67,10 @@ public class AlignmentUtils {
|
|||
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
||||
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
||||
// continue; // do not count Ns/Xs/etc ?
|
||||
if ( readChr != refChr )
|
||||
mismatches++;
|
||||
if ( readChr != refChr ) {
|
||||
mc.numMismatches++;
|
||||
mc.mismatchQualities += r.getBaseQualities()[readIdx];
|
||||
}
|
||||
}
|
||||
break;
|
||||
case I:
|
||||
|
|
@ -169,7 +85,7 @@ public class AlignmentUtils {
|
|||
}
|
||||
|
||||
}
|
||||
return mismatches;
|
||||
return mc;
|
||||
}
|
||||
|
||||
/** Returns the number of mismatches in the pileup within the given reference context.
|
||||
|
|
@ -406,8 +322,8 @@ public class AlignmentUtils {
|
|||
* Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
|
||||
* specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
|
||||
* alignment reference index/start.
|
||||
* @param r
|
||||
* @return
|
||||
* @param r record
|
||||
* @return true if read is unmapped
|
||||
*/
|
||||
public static boolean isReadUnmapped(final SAMRecord r) {
|
||||
if ( r.getReadUnmappedFlag() ) return true;
|
||||
|
|
|
|||
Loading…
Reference in New Issue