added filtering out indels with large levels of noise (mismatches) remaining in the close proximity; also a bug in recording deletion coordinates is fixed
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1014 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a6477df6d1
commit
2259dc3a8f
|
|
@ -18,6 +18,7 @@ import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
|||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.playground.indels.AlignmentUtils;
|
||||
import org.broadinstitute.sting.playground.utils.CircularArray;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
|
@ -59,6 +60,10 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
private Set<String> normal_samples = new HashSet<String>();
|
||||
private Set<String> tumor_samples = new HashSet<String>();
|
||||
|
||||
private int MISMATCH_WIDTH = 5; // 5 bases on each side of the indel
|
||||
private int MISMATCH_CUTOFF = 1000000;
|
||||
private double AV_MISMATCHES_PER_READ = 1.5;
|
||||
|
||||
|
||||
private static String annGenomic = "GENOMIC";
|
||||
private static String annIntron = "INTRON";
|
||||
|
|
@ -162,8 +167,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal
|
||||
throw new StingException("Read "+read.getReadName()+": contig is out of order");
|
||||
|
||||
if ( call_somatic) emit_somatic(1000000000); // print remaining indels from the previous contig (if any);
|
||||
else emit(1000000000);
|
||||
if ( call_somatic) emit_somatic(1000000000, true); // print remaining indels from the previous contig (if any);
|
||||
else emit(1000000000,true);
|
||||
currentContigIndex = read.getReferenceIndex();
|
||||
refName = new String(read.getReferenceName());
|
||||
location.setContig(refName);
|
||||
|
|
@ -199,8 +204,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
// Clearly, we assume here that window is large enough to accomodate any single read, so simply shifting
|
||||
// the window to the read's start will ensure that the read fits...
|
||||
|
||||
if ( call_somatic ) emit_somatic( read.getAlignmentStart() );
|
||||
else emit( read.getAlignmentStart() );
|
||||
if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false );
|
||||
else emit( read.getAlignmentStart(), false );
|
||||
|
||||
if ( read.getAlignmentEnd() > coverage.getStop()) {
|
||||
// ooops, looks like the read does not fit into the current window!!
|
||||
|
|
@ -240,17 +245,46 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
*
|
||||
* @param position
|
||||
*/
|
||||
private void emit(long position) {
|
||||
private void emit(long position, boolean force) {
|
||||
|
||||
long stop_at = position; // we will shift to this position instead of passed 'position'
|
||||
// argument if we did not cover MISMATCH_WIDTH bases around the last indel yet
|
||||
|
||||
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
||||
|
||||
List<IndelVariant> variants = coverage.indelsAt(pos);
|
||||
if ( variants.size() == 0 ) continue; // no indels
|
||||
|
||||
// if we are here, we got a variant
|
||||
|
||||
int cov = coverage.coverageAt(pos);
|
||||
|
||||
if ( cov < minCoverage ) continue; // low coverage
|
||||
|
||||
long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() );
|
||||
long right = pos+MISMATCH_WIDTH;
|
||||
|
||||
if ( right > coverage.getStop() ) { // we do not have enough bases in the current window
|
||||
// in order to assess mismatch rate
|
||||
if( force ) { // if we were asked to force-shift, then, well, shift anyway
|
||||
right = coverage.getStop() ;
|
||||
} else {
|
||||
// shift to the position prior to the last indel so that we could get all the mismatch counts around it later
|
||||
stop_at = left;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side):
|
||||
int total_mismatches = 0;
|
||||
for ( long k = left; k <= right ; k++ ) total_mismatches+=coverage.mismatchesAt(k);
|
||||
|
||||
if ( total_mismatches > MISMATCH_CUTOFF || total_mismatches > ((double)cov)*AV_MISMATCHES_PER_READ) {
|
||||
System.out.println(refName+"\t"+(pos-1)+"\t"+
|
||||
"\tTOO DIRTY\t"+total_mismatches);
|
||||
continue; // too dirty
|
||||
}
|
||||
|
||||
location.setStart(pos); location.setStop(pos); // retrieve annotation data
|
||||
rodRefSeq annotation = refseqIterator.seekForward(location);
|
||||
|
||||
|
|
@ -289,7 +323,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
// }
|
||||
}
|
||||
|
||||
coverage.shift((int)(position - coverage.getStart() ) );
|
||||
coverage.shift((int)(stop_at - coverage.getStart() ) );
|
||||
}
|
||||
|
||||
/** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed
|
||||
|
|
@ -297,7 +331,9 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
*
|
||||
* @param position
|
||||
*/
|
||||
private void emit_somatic(long position) {
|
||||
private void emit_somatic(long position, boolean force) {
|
||||
|
||||
long stop_at = position;
|
||||
|
||||
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
||||
|
||||
|
|
@ -314,6 +350,38 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
if ( tumor_cov < minCoverage ) continue; // low coverage
|
||||
if ( normal_cov < minNormalCoverage ) continue; // low coverage
|
||||
|
||||
long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() );
|
||||
long right = pos+MISMATCH_WIDTH;
|
||||
|
||||
if ( right > coverage.getStop() ) { // we do not have enough bases in the current window
|
||||
// in order to assess mismatch rate
|
||||
if( force ) { // if we were asked to force-shift, then, well, shift anyway
|
||||
right = coverage.getStop() ;
|
||||
} else {
|
||||
// shift to the position prior to the last indel so that we could get all the mismatch counts around it later
|
||||
stop_at = left;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side):
|
||||
int total_mismatches_normal = 0;
|
||||
int total_mismatches_tumor = 0;
|
||||
for ( long k = left; k <= right ; k++ ) {
|
||||
total_mismatches_tumor+=coverage.mismatchesAt(k);
|
||||
total_mismatches_normal+=normal_coverage.mismatchesAt(k);
|
||||
}
|
||||
|
||||
if ( total_mismatches_normal > MISMATCH_CUTOFF || total_mismatches_normal > ((double)normal_cov)*AV_MISMATCHES_PER_READ) {
|
||||
System.out.println(refName+"\t"+(pos-1)+"\t"+
|
||||
"\tNORMAL TOO DIRTY\t"+total_mismatches_normal);
|
||||
continue; // too dirty
|
||||
}
|
||||
if ( total_mismatches_tumor > MISMATCH_CUTOFF || total_mismatches_tumor > ((double)tumor_cov)*AV_MISMATCHES_PER_READ) {
|
||||
System.out.println(refName+"\t"+(pos-1)+"\t"+
|
||||
"\tTUMOR TOO DIRTY\t"+total_mismatches_tumor);
|
||||
continue; // too dirty
|
||||
}
|
||||
location.setStart(pos); location.setStop(pos); // retrieve annotation data
|
||||
rodRefSeq annotation = refseqIterator.seekForward(location);
|
||||
|
||||
|
|
@ -336,7 +404,13 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
if ( (double)total_variant_count_tumor > minFraction * tumor_cov && (double) max_variant_count_tumor > minConsensusFraction*total_variant_count_tumor ) {
|
||||
|
||||
String annotationString = getAnnotationString(annotation);
|
||||
|
||||
/*
|
||||
int leftpos = pos-1;
|
||||
int rightpos = pos-1;
|
||||
if ( event_length_tumor > 0 ) {
|
||||
leftpos -= event_length;
|
||||
}
|
||||
*/
|
||||
String message = refName+"\t"+(pos-1)+"\t"+(event_length_tumor > 0 ? pos-1+event_length_tumor : pos-1)+
|
||||
"\t"+(event_length_tumor >0? "-":"+")+indelStringTumor +":"+total_variant_count_tumor+"/"+tumor_cov;
|
||||
if ( normal_variants.size() == 0 ) {
|
||||
|
|
@ -381,7 +455,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
emit(1000000000); // emit everything we might have left
|
||||
if ( call_somatic ) emit_somatic(1000000000, true);
|
||||
else emit(1000000000,true); // emit everything we might have left
|
||||
try {
|
||||
output.close();
|
||||
} catch (IOException e) {
|
||||
|
|
@ -458,6 +533,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
|
||||
private CircularArray.Int coverageWindow;
|
||||
private CircularArray< List< IndelVariant > > indels;
|
||||
private CircularArray.Int mismatches;
|
||||
|
||||
private static List<IndelVariant> emptyIndelList;
|
||||
|
||||
|
|
@ -469,6 +545,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
this.start = start;
|
||||
coverageWindow = new CircularArray.Int(length);
|
||||
indels = new CircularArray< List<IndelVariant> >(length);
|
||||
mismatches = new CircularArray.Int(length);
|
||||
}
|
||||
|
||||
/** Returns 1-based reference start position of the interval this object keeps coverage for.
|
||||
|
|
@ -492,6 +569,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
return coverageWindow.get( (int)( refPos - start ) );
|
||||
}
|
||||
|
||||
public int mismatchesAt(final long refPos) { return mismatches.get((int)(refPos-start)); }
|
||||
|
||||
public List<IndelVariant> indelsAt( final long refPos ) {
|
||||
List<IndelVariant> l = indels.get((int)( refPos - start ));
|
||||
if ( l == null ) return emptyIndelList;
|
||||
|
|
@ -507,11 +586,13 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
public void add(SAMRecord r, char [] ref) {
|
||||
final long rStart = r.getAlignmentStart();
|
||||
final long rStop = r.getAlignmentEnd();
|
||||
final String readBases = r.getReadString().toUpperCase();
|
||||
|
||||
|
||||
int localStart = (int)( rStart - start ); // start of the alignment wrt start of the current window
|
||||
|
||||
try {
|
||||
for ( int k = localStart; k <= (int)(rStop-start) ; k++ ) coverageWindow.increment(k, 1);
|
||||
for ( int k = localStart; k <= (int)(rStop-start) ; k++ ) coverageWindow.increment(k, 1);
|
||||
} catch ( IndexOutOfBoundsException e) { // replace the message and re-throw:
|
||||
throw new IndexOutOfBoundsException("Current coverage window: "+getStart()+"-"+getStop()+
|
||||
"; illegal attempt to add read spanning "+rStart+"-"+rStop);
|
||||
|
|
@ -528,22 +609,20 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
int posOnRead = 0;
|
||||
int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read:
|
||||
// its start on the actual full reference contig is r.getAlignmentStart()
|
||||
// int mm=0;
|
||||
|
||||
for ( int i = 0 ; i < nCigarElems ; i++ ) {
|
||||
|
||||
final CigarElement ce = c.getCigarElement(i);
|
||||
IndelVariant.Type type = null;
|
||||
String bases = null;
|
||||
int eventPosition = posOnRef;
|
||||
|
||||
int indelPosition = 0; // indel position in our coverage window (i.e. relative to getStart()).
|
||||
// note that here we assign indels to the first deleted base or to the first
|
||||
// base after insertion
|
||||
|
||||
switch(ce.getOperator()) {
|
||||
case I:
|
||||
type = IndelVariant.Type.I;
|
||||
bases = r.getReadString().substring(posOnRead,posOnRead+ce.getLength());
|
||||
indelPosition = localStart + posOnRef ;
|
||||
bases = readBases.substring(posOnRead,posOnRead+ce.getLength());
|
||||
// will increment position on the read below, there's no 'break' statement yet...
|
||||
case H:
|
||||
case S:
|
||||
|
|
@ -555,12 +634,13 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
case D:
|
||||
type = IndelVariant.Type.D;
|
||||
bases = new String( ref, posOnRef, ce.getLength() );
|
||||
indelPosition = localStart + posOnRef ;
|
||||
posOnRef += ce.getLength();
|
||||
break;
|
||||
case M:
|
||||
posOnRef += ce.getLength();
|
||||
posOnRead += ce.getLength();
|
||||
case M: for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) {
|
||||
if ( readBases.charAt(posOnRead) != Character.toUpperCase(ref[posOnRef]) ) { // mismatch!
|
||||
mismatches.increment(localStart+posOnRef, 1); //mm++;
|
||||
}
|
||||
}
|
||||
break; // advance along the gapless block in the alignment
|
||||
default :
|
||||
throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator());
|
||||
|
|
@ -573,7 +653,9 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
if ( i == nCigarElems - 1) logger.warn("Indel at the end of the read "+r.getReadName());
|
||||
|
||||
try {
|
||||
updateCount(indelPosition, type, bases);
|
||||
// note that here we will be assigning indels to the first deleted base or to the first
|
||||
// base after insertion, not to the last base before the event!
|
||||
updateCount(localStart+eventPosition, type, bases);
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
System.out.println("Read "+r.getReadName()+": out of coverage window bounds.Probably window is too small.\n"+
|
||||
"Read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+
|
||||
|
|
@ -583,6 +665,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
// System.out.println(r.getReadName()+"\t"+(r.getReadNegativeStrandFlag()?"RC":"FW")+"\t"+r.getCigarString()+"\t"+mm);
|
||||
// System.out.println(AlignmentUtils.alignmentToString(r.getCigar(), readBases, new String(ref), 0));
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -633,6 +717,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
|||
start += offset;
|
||||
coverageWindow.shiftData(offset);
|
||||
indels.shiftData(offset);
|
||||
mismatches.shiftData(offset);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue