For the cleaner to clean, it must beat the entropy produced by the aligner (and not just the raw reads).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3068 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
60dfba997b
commit
49117819f5
|
|
@ -390,9 +390,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
final ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>(); // reads that perfectly match ref
|
final ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>(); // reads that perfectly match ref
|
||||||
final ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>(); // reads that don't perfectly match
|
final ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>(); // reads that don't perfectly match
|
||||||
final LinkedList<AlignedRead> altAlignmentsToTest = new LinkedList<AlignedRead>(); // should we try to make an alt consensus from the read?
|
final LinkedList<AlignedRead> altAlignmentsToTest = new LinkedList<AlignedRead>(); // should we try to make an alt consensus from the read?
|
||||||
final ArrayList<AlignedRead> leftMovedIndels = new ArrayList<AlignedRead>();
|
|
||||||
final Set<Consensus> altConsenses = new LinkedHashSet<Consensus>(); // list of alt consenses
|
final Set<Consensus> altConsenses = new LinkedHashSet<Consensus>(); // list of alt consenses
|
||||||
int totalMismatchSum = 0;
|
long totalAlignerMismatchSum = 0, totalRawMismatchSum = 0;
|
||||||
|
|
||||||
// if there are any known indels for this region, get them
|
// if there are any known indels for this region, get them
|
||||||
for ( VariationRod knownIndel : knownIndelsToTry ) {
|
for ( VariationRod knownIndel : knownIndelsToTry ) {
|
||||||
|
|
@ -419,29 +418,27 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
|
|
||||||
final AlignedRead aRead = new AlignedRead(read);
|
final AlignedRead aRead = new AlignedRead(read);
|
||||||
|
|
||||||
|
|
||||||
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
|
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
|
||||||
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
|
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
|
||||||
if ( numBlocks == 2 ) {
|
if ( numBlocks == 2 ) {
|
||||||
|
|
||||||
Cigar newCigar = indelRealignment(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
|
Cigar newCigar = indelRealignment(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
|
||||||
if ( aRead.setCigar(newCigar) ) {
|
aRead.setCigar(newCigar);
|
||||||
leftMovedIndels.add(aRead);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
final int mismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex, Integer.MAX_VALUE);
|
final int startOnRef = read.getAlignmentStart()-(int)leftmostIndex;
|
||||||
|
totalAlignerMismatchSum += AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef);
|
||||||
|
final int rawMismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE);
|
||||||
// if ( debugOn ) System.out.println("mismatchScore="+mismatchScore);
|
// if ( debugOn ) System.out.println("mismatchScore="+mismatchScore);
|
||||||
|
|
||||||
// if this doesn't match perfectly to the reference, let's try to clean it
|
// if this doesn't match perfectly to the reference, let's try to clean it
|
||||||
if ( mismatchScore > 0 ) {
|
if ( rawMismatchScore > 0 ) {
|
||||||
altReads.add(aRead);
|
altReads.add(aRead);
|
||||||
if ( !read.getDuplicateReadFlag() )
|
if ( !read.getDuplicateReadFlag() )
|
||||||
totalMismatchSum += mismatchScore;
|
totalRawMismatchSum += rawMismatchScore;
|
||||||
aRead.setMismatchScoreToReference(mismatchScore);
|
aRead.setMismatchScoreToReference(rawMismatchScore);
|
||||||
// if it has an indel, let's see if that's the best consensus
|
// if it has an indel, let's see if that's the best consensus
|
||||||
if ( numBlocks == 2 ) {
|
if ( numBlocks == 2 ) {
|
||||||
Consensus c = createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getRead().getReadBases());
|
Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getRead().getReadBases());
|
||||||
if ( c == null ) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName());
|
if ( c == null ) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName());
|
||||||
else
|
else
|
||||||
altConsenses.add(c);
|
altConsenses.add(c);
|
||||||
|
|
@ -543,10 +540,13 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if the best alternate consensus has a smaller sum of quality score mismatches (more than
|
// if:
|
||||||
// the LOD threshold), and it didn't just move around the mismatching columns, then clean!
|
// 1) the best alternate consensus has a smaller sum of quality score mismatches than the aligned version of the reads,
|
||||||
final double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0);
|
// 2) beats the LOD threshold for the sum of quality score mismatches of the raw version of the reads,
|
||||||
if ( improvement >= LOD_THRESHOLD ) {
|
// 3) didn't just move around the mismatching columns (i.e. it actually reduces entropy),
|
||||||
|
// then clean!
|
||||||
|
final double improvement = (bestConsensus == null ? -1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0);
|
||||||
|
if ( improvement >= LOD_THRESHOLD && bestConsensus.mismatchSum <= totalAlignerMismatchSum ) {
|
||||||
|
|
||||||
bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);
|
bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);
|
||||||
|
|
||||||
|
|
@ -584,7 +584,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
for ( int i = 0; i < length; i++)
|
for ( int i = 0; i < length; i++)
|
||||||
str.append((char)bestConsensus.str[position+i]);
|
str.append((char)bestConsensus.str[position+i]);
|
||||||
}
|
}
|
||||||
str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
|
str.append("\t" + (((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n");
|
||||||
try {
|
try {
|
||||||
indelOutput.write(str.toString());
|
indelOutput.write(str.toString());
|
||||||
indelOutput.flush();
|
indelOutput.flush();
|
||||||
|
|
@ -623,10 +623,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
|
|
||||||
} else if ( statsOutput != null ) {
|
} else if ( statsOutput != null ) {
|
||||||
try {
|
try {
|
||||||
statsOutput.write(readsToClean.getLocation().toString());
|
statsOutput.write(String.format("%s\tFAIL\t%.1f\t%d%n",
|
||||||
statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD
|
readsToClean.getLocation().toString(), improvement, bestConsensus.mismatchSum - totalAlignerMismatchSum));
|
||||||
statsOutput.write(Double.toString(improvement));
|
|
||||||
statsOutput.write("\n");
|
|
||||||
statsOutput.flush();
|
statsOutput.flush();
|
||||||
} catch (Exception e) {}
|
} catch (Exception e) {}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,120 +5,15 @@ import net.sf.samtools.SAMRecord;
|
||||||
import net.sf.samtools.Cigar;
|
import net.sf.samtools.Cigar;
|
||||||
import net.sf.samtools.CigarElement;
|
import net.sf.samtools.CigarElement;
|
||||||
import net.sf.samtools.util.StringUtil;
|
import net.sf.samtools.util.StringUtil;
|
||||||
import net.sf.picard.reference.ReferenceSequence;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.utils.pileup.*;
|
import org.broadinstitute.sting.utils.pileup.*;
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Created by IntelliJ IDEA.
|
|
||||||
* User: asivache
|
|
||||||
* Date: Mar 25, 2009
|
|
||||||
* Time: 12:15:38 AM
|
|
||||||
* To change this template use File | Settings | File Templates.
|
|
||||||
*/
|
|
||||||
public class AlignmentUtils {
|
public class AlignmentUtils {
|
||||||
|
|
||||||
|
private static class MismatchCount {
|
||||||
/** Returns number of mismatches in the alignment <code>r</code> to the reference sequence
|
public int numMismatches = 0;
|
||||||
* <code>refSeq</code>. It is assumed that
|
public long mismatchQualities = 0;
|
||||||
* the alignment starts at (1-based) position r.getAlignmentStart() on the specified, and all single-base mismatches
|
|
||||||
* are counted in the alignment segments where both sequences are present. Insertions/deletions are skipped and do
|
|
||||||
* not contribute to the error count returned by this method.
|
|
||||||
* @param r aligned read
|
|
||||||
* @param refSeq reference sequence
|
|
||||||
* @return number of single-base mismatches in the aligned segments (gaps on either of the sequences are skipped)
|
|
||||||
*/
|
|
||||||
public static int numMismatches(SAMRecord r, ReferenceSequence refSeq) {
|
|
||||||
byte[] ref = refSeq.getBases();
|
|
||||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
|
||||||
int i_ref = r.getAlignmentStart()-1; // position on the ref
|
|
||||||
int i_read = 0; // position on the read
|
|
||||||
int mm_count = 0; // number of mismatches
|
|
||||||
Cigar c = r.getCigar();
|
|
||||||
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
|
|
||||||
CigarElement ce = c.getCigarElement(k);
|
|
||||||
switch( ce.getOperator() ) {
|
|
||||||
case M:
|
|
||||||
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
|
|
||||||
char refChr = (char)ref[i_ref];
|
|
||||||
char readChr = (char)r.getReadBases()[i_read];
|
|
||||||
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
|
||||||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
|
||||||
continue; // do not count Ns/Xs/etc ?
|
|
||||||
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
|
|
||||||
mm_count++;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case I:
|
|
||||||
case S:
|
|
||||||
i_read += ce.getLength();
|
|
||||||
break;
|
|
||||||
case D:
|
|
||||||
case N:
|
|
||||||
i_ref += ce.getLength();
|
|
||||||
break;
|
|
||||||
default: throw new RuntimeException("Unrecognized cigar element");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
return mm_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* mhanna - 11 May 2009 - stubbed out competing method that works with partial references.
|
|
||||||
* Computes number of mismatches in the read alignment to the refence <code>ref</code>
|
|
||||||
* specified in the record <code>r</code>. Indels are completely <i>ignored</i> by this method:
|
|
||||||
* only base mismatches in the alignment segments where both sequences are present are counted.
|
|
||||||
* @param r
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
public static int numMismatches(SAMRecord r, char[] ref) {
|
|
||||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
|
||||||
int i_ref = 0; // position on the ref
|
|
||||||
int i_read = 0; // position on the read
|
|
||||||
int mm_count = 0; // number of mismatches
|
|
||||||
Cigar c = r.getCigar();
|
|
||||||
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
|
|
||||||
CigarElement ce = c.getCigarElement(k);
|
|
||||||
switch( ce.getOperator() ) {
|
|
||||||
case M:
|
|
||||||
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
|
|
||||||
char refChr = ref[i_ref];
|
|
||||||
char readChr = (char)r.getReadBases()[i_read];
|
|
||||||
if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
|
||||||
BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
|
||||||
continue; // do not count Ns/Xs/etc ?
|
|
||||||
if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) )
|
|
||||||
mm_count++;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case I:
|
|
||||||
case S:
|
|
||||||
i_read += ce.getLength();
|
|
||||||
break;
|
|
||||||
case D:
|
|
||||||
case N:
|
|
||||||
i_ref += ce.getLength();
|
|
||||||
break;
|
|
||||||
default: throw new RuntimeException("Unrecognized cigar element: " + ce.getOperator());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
return mm_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
// IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED
|
|
||||||
// TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO
|
|
||||||
// PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY...
|
|
||||||
/** See {@link #numMismatches(SAMRecord, ReferenceSequence)}. This method implements same functionality
|
|
||||||
* for reference sequence specified as conventional java string (of bases). By default, it is assumed that
|
|
||||||
* the alignment starts at (1-based) position r.getAlignmentStart() on the reference <code>refSeq</code>.
|
|
||||||
* See {@link #numMismatches(SAMRecord, byte[], int)} if this is not the case.
|
|
||||||
*/
|
|
||||||
public static int numMismatches(SAMRecord r, String refSeq ) {
|
|
||||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
|
||||||
return numMismatches(r, StringUtil.stringToBytes(refSeq), r.getAlignmentStart()-1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns number of mismatches in the alignment <code>r</code> to the reference sequence
|
/** Returns number of mismatches in the alignment <code>r</code> to the reference sequence
|
||||||
|
|
@ -136,8 +31,27 @@ public class AlignmentUtils {
|
||||||
* @return the number of mismatches
|
* @return the number of mismatches
|
||||||
*/
|
*/
|
||||||
public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) {
|
public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||||
|
return getMismatchCount(r, refSeq, refIndex).numMismatches;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) {
|
||||||
|
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||||
|
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||||
|
return getMismatchCount(r, refSeq, refIndex).mismatchQualities;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static long mismatchingQualities(SAMRecord r, String refSeq, int refIndex ) {
|
||||||
|
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||||
|
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||||
|
MismatchCount mc = new MismatchCount();
|
||||||
|
|
||||||
int readIdx = 0;
|
int readIdx = 0;
|
||||||
int mismatches = 0;
|
|
||||||
byte[] readSeq = r.getReadBases();
|
byte[] readSeq = r.getReadBases();
|
||||||
Cigar c = r.getCigar();
|
Cigar c = r.getCigar();
|
||||||
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||||
|
|
@ -153,8 +67,10 @@ public class AlignmentUtils {
|
||||||
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
//if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 ||
|
||||||
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
// BaseUtils.simpleBaseToBaseIndex(refChr) == -1 )
|
||||||
// continue; // do not count Ns/Xs/etc ?
|
// continue; // do not count Ns/Xs/etc ?
|
||||||
if ( readChr != refChr )
|
if ( readChr != refChr ) {
|
||||||
mismatches++;
|
mc.numMismatches++;
|
||||||
|
mc.mismatchQualities += r.getBaseQualities()[readIdx];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case I:
|
case I:
|
||||||
|
|
@ -169,7 +85,7 @@ public class AlignmentUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
return mismatches;
|
return mc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the number of mismatches in the pileup within the given reference context.
|
/** Returns the number of mismatches in the pileup within the given reference context.
|
||||||
|
|
@ -406,8 +322,8 @@ public class AlignmentUtils {
|
||||||
* Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
|
* Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
|
||||||
* specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
|
* specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
|
||||||
* alignment reference index/start.
|
* alignment reference index/start.
|
||||||
* @param r
|
* @param r record
|
||||||
* @return
|
* @return true if read is unmapped
|
||||||
*/
|
*/
|
||||||
public static boolean isReadUnmapped(final SAMRecord r) {
|
public static boolean isReadUnmapped(final SAMRecord r) {
|
||||||
if ( r.getReadUnmappedFlag() ) return true;
|
if ( r.getReadUnmappedFlag() ) return true;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue