Added filtering of indels that still have a high level of noise (mismatches) in their close proximity; also fixed a bug in recording deletion coordinates.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1014 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a6477df6d1
commit
2259dc3a8f
|
|
@ -18,6 +18,7 @@ import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
|
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.playground.indels.AlignmentUtils;
|
||||||
import org.broadinstitute.sting.playground.utils.CircularArray;
|
import org.broadinstitute.sting.playground.utils.CircularArray;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
@ -59,6 +60,10 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
private Set<String> normal_samples = new HashSet<String>();
|
private Set<String> normal_samples = new HashSet<String>();
|
||||||
private Set<String> tumor_samples = new HashSet<String>();
|
private Set<String> tumor_samples = new HashSet<String>();
|
||||||
|
|
||||||
|
private int MISMATCH_WIDTH = 5; // 5 bases on each side of the indel
|
||||||
|
private int MISMATCH_CUTOFF = 1000000;
|
||||||
|
private double AV_MISMATCHES_PER_READ = 1.5;
|
||||||
|
|
||||||
|
|
||||||
private static String annGenomic = "GENOMIC";
|
private static String annGenomic = "GENOMIC";
|
||||||
private static String annIntron = "INTRON";
|
private static String annIntron = "INTRON";
|
||||||
|
|
@ -162,8 +167,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal
|
if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal
|
||||||
throw new StingException("Read "+read.getReadName()+": contig is out of order");
|
throw new StingException("Read "+read.getReadName()+": contig is out of order");
|
||||||
|
|
||||||
if ( call_somatic) emit_somatic(1000000000); // print remaining indels from the previous contig (if any);
|
if ( call_somatic) emit_somatic(1000000000, true); // print remaining indels from the previous contig (if any);
|
||||||
else emit(1000000000);
|
else emit(1000000000,true);
|
||||||
currentContigIndex = read.getReferenceIndex();
|
currentContigIndex = read.getReferenceIndex();
|
||||||
refName = new String(read.getReferenceName());
|
refName = new String(read.getReferenceName());
|
||||||
location.setContig(refName);
|
location.setContig(refName);
|
||||||
|
|
@ -199,8 +204,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
// Clearly, we assume here that window is large enough to accommodate any single read, so simply shifting
|
// Clearly, we assume here that window is large enough to accommodate any single read, so simply shifting
|
||||||
// the window to the read's start will ensure that the read fits...
|
// the window to the read's start will ensure that the read fits...
|
||||||
|
|
||||||
if ( call_somatic ) emit_somatic( read.getAlignmentStart() );
|
if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false );
|
||||||
else emit( read.getAlignmentStart() );
|
else emit( read.getAlignmentStart(), false );
|
||||||
|
|
||||||
if ( read.getAlignmentEnd() > coverage.getStop()) {
|
if ( read.getAlignmentEnd() > coverage.getStop()) {
|
||||||
// ooops, looks like the read does not fit into the current window!!
|
// ooops, looks like the read does not fit into the current window!!
|
||||||
|
|
@ -240,17 +245,46 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
*
|
*
|
||||||
* @param position
|
* @param position
|
||||||
*/
|
*/
|
||||||
private void emit(long position) {
|
private void emit(long position, boolean force) {
|
||||||
|
|
||||||
|
long stop_at = position; // we will shift to this position instead of passed 'position'
|
||||||
|
// argument if we did not cover MISMATCH_WIDTH bases around the last indel yet
|
||||||
|
|
||||||
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
||||||
|
|
||||||
List<IndelVariant> variants = coverage.indelsAt(pos);
|
List<IndelVariant> variants = coverage.indelsAt(pos);
|
||||||
if ( variants.size() == 0 ) continue; // no indels
|
if ( variants.size() == 0 ) continue; // no indels
|
||||||
|
|
||||||
|
// if we are here, we got a variant
|
||||||
|
|
||||||
int cov = coverage.coverageAt(pos);
|
int cov = coverage.coverageAt(pos);
|
||||||
|
|
||||||
if ( cov < minCoverage ) continue; // low coverage
|
if ( cov < minCoverage ) continue; // low coverage
|
||||||
|
|
||||||
|
long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() );
|
||||||
|
long right = pos+MISMATCH_WIDTH;
|
||||||
|
|
||||||
|
if ( right > coverage.getStop() ) { // we do not have enough bases in the current window
|
||||||
|
// in order to assess mismatch rate
|
||||||
|
if( force ) { // if we were asked to force-shift, then, well, shift anyway
|
||||||
|
right = coverage.getStop() ;
|
||||||
|
} else {
|
||||||
|
// shift to the position prior to the last indel so that we could get all the mismatch counts around it later
|
||||||
|
stop_at = left;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side):
|
||||||
|
int total_mismatches = 0;
|
||||||
|
for ( long k = left; k <= right ; k++ ) total_mismatches+=coverage.mismatchesAt(k);
|
||||||
|
|
||||||
|
if ( total_mismatches > MISMATCH_CUTOFF || total_mismatches > ((double)cov)*AV_MISMATCHES_PER_READ) {
|
||||||
|
System.out.println(refName+"\t"+(pos-1)+"\t"+
|
||||||
|
"\tTOO DIRTY\t"+total_mismatches);
|
||||||
|
continue; // too dirty
|
||||||
|
}
|
||||||
|
|
||||||
location.setStart(pos); location.setStop(pos); // retrieve annotation data
|
location.setStart(pos); location.setStop(pos); // retrieve annotation data
|
||||||
rodRefSeq annotation = refseqIterator.seekForward(location);
|
rodRefSeq annotation = refseqIterator.seekForward(location);
|
||||||
|
|
||||||
|
|
@ -289,7 +323,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
coverage.shift((int)(position - coverage.getStart() ) );
|
coverage.shift((int)(stop_at - coverage.getStart() ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed
|
/** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed
|
||||||
|
|
@ -297,7 +331,9 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
*
|
*
|
||||||
* @param position
|
* @param position
|
||||||
*/
|
*/
|
||||||
private void emit_somatic(long position) {
|
private void emit_somatic(long position, boolean force) {
|
||||||
|
|
||||||
|
long stop_at = position;
|
||||||
|
|
||||||
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
||||||
|
|
||||||
|
|
@ -314,6 +350,38 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
if ( tumor_cov < minCoverage ) continue; // low coverage
|
if ( tumor_cov < minCoverage ) continue; // low coverage
|
||||||
if ( normal_cov < minNormalCoverage ) continue; // low coverage
|
if ( normal_cov < minNormalCoverage ) continue; // low coverage
|
||||||
|
|
||||||
|
long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() );
|
||||||
|
long right = pos+MISMATCH_WIDTH;
|
||||||
|
|
||||||
|
if ( right > coverage.getStop() ) { // we do not have enough bases in the current window
|
||||||
|
// in order to assess mismatch rate
|
||||||
|
if( force ) { // if we were asked to force-shift, then, well, shift anyway
|
||||||
|
right = coverage.getStop() ;
|
||||||
|
} else {
|
||||||
|
// shift to the position prior to the last indel so that we could get all the mismatch counts around it later
|
||||||
|
stop_at = left;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side):
|
||||||
|
int total_mismatches_normal = 0;
|
||||||
|
int total_mismatches_tumor = 0;
|
||||||
|
for ( long k = left; k <= right ; k++ ) {
|
||||||
|
total_mismatches_tumor+=coverage.mismatchesAt(k);
|
||||||
|
total_mismatches_normal+=normal_coverage.mismatchesAt(k);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( total_mismatches_normal > MISMATCH_CUTOFF || total_mismatches_normal > ((double)normal_cov)*AV_MISMATCHES_PER_READ) {
|
||||||
|
System.out.println(refName+"\t"+(pos-1)+"\t"+
|
||||||
|
"\tNORMAL TOO DIRTY\t"+total_mismatches_normal);
|
||||||
|
continue; // too dirty
|
||||||
|
}
|
||||||
|
if ( total_mismatches_tumor > MISMATCH_CUTOFF || total_mismatches_tumor > ((double)tumor_cov)*AV_MISMATCHES_PER_READ) {
|
||||||
|
System.out.println(refName+"\t"+(pos-1)+"\t"+
|
||||||
|
"\tTUMOR TOO DIRTY\t"+total_mismatches_tumor);
|
||||||
|
continue; // too dirty
|
||||||
|
}
|
||||||
location.setStart(pos); location.setStop(pos); // retrieve annotation data
|
location.setStart(pos); location.setStop(pos); // retrieve annotation data
|
||||||
rodRefSeq annotation = refseqIterator.seekForward(location);
|
rodRefSeq annotation = refseqIterator.seekForward(location);
|
||||||
|
|
||||||
|
|
@ -336,7 +404,13 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
if ( (double)total_variant_count_tumor > minFraction * tumor_cov && (double) max_variant_count_tumor > minConsensusFraction*total_variant_count_tumor ) {
|
if ( (double)total_variant_count_tumor > minFraction * tumor_cov && (double) max_variant_count_tumor > minConsensusFraction*total_variant_count_tumor ) {
|
||||||
|
|
||||||
String annotationString = getAnnotationString(annotation);
|
String annotationString = getAnnotationString(annotation);
|
||||||
|
/*
|
||||||
|
int leftpos = pos-1;
|
||||||
|
int rightpos = pos-1;
|
||||||
|
if ( event_length_tumor > 0 ) {
|
||||||
|
leftpos -= event_length;
|
||||||
|
}
|
||||||
|
*/
|
||||||
String message = refName+"\t"+(pos-1)+"\t"+(event_length_tumor > 0 ? pos-1+event_length_tumor : pos-1)+
|
String message = refName+"\t"+(pos-1)+"\t"+(event_length_tumor > 0 ? pos-1+event_length_tumor : pos-1)+
|
||||||
"\t"+(event_length_tumor >0? "-":"+")+indelStringTumor +":"+total_variant_count_tumor+"/"+tumor_cov;
|
"\t"+(event_length_tumor >0? "-":"+")+indelStringTumor +":"+total_variant_count_tumor+"/"+tumor_cov;
|
||||||
if ( normal_variants.size() == 0 ) {
|
if ( normal_variants.size() == 0 ) {
|
||||||
|
|
@ -381,7 +455,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onTraversalDone(Integer result) {
|
public void onTraversalDone(Integer result) {
|
||||||
emit(1000000000); // emit everything we might have left
|
if ( call_somatic ) emit_somatic(1000000000, true);
|
||||||
|
else emit(1000000000,true); // emit everything we might have left
|
||||||
try {
|
try {
|
||||||
output.close();
|
output.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
|
@ -458,6 +533,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
private CircularArray.Int coverageWindow;
|
private CircularArray.Int coverageWindow;
|
||||||
private CircularArray< List< IndelVariant > > indels;
|
private CircularArray< List< IndelVariant > > indels;
|
||||||
|
private CircularArray.Int mismatches;
|
||||||
|
|
||||||
private static List<IndelVariant> emptyIndelList;
|
private static List<IndelVariant> emptyIndelList;
|
||||||
|
|
||||||
|
|
@ -469,6 +545,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
this.start = start;
|
this.start = start;
|
||||||
coverageWindow = new CircularArray.Int(length);
|
coverageWindow = new CircularArray.Int(length);
|
||||||
indels = new CircularArray< List<IndelVariant> >(length);
|
indels = new CircularArray< List<IndelVariant> >(length);
|
||||||
|
mismatches = new CircularArray.Int(length);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns 1-based reference start position of the interval this object keeps coverage for.
|
/** Returns 1-based reference start position of the interval this object keeps coverage for.
|
||||||
|
|
@ -492,6 +569,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
return coverageWindow.get( (int)( refPos - start ) );
|
return coverageWindow.get( (int)( refPos - start ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the mismatch count stored for the given reference position.
 *
 * @param refPos reference position; converted to an offset into the window by subtracting {@code start}
 * @return the value held by the {@code mismatches} array at that offset
 */
public int mismatchesAt(final long refPos) {
    final int windowOffset = (int) (refPos - start);
    return mismatches.get(windowOffset);
}
|
||||||
|
|
||||||
public List<IndelVariant> indelsAt( final long refPos ) {
|
public List<IndelVariant> indelsAt( final long refPos ) {
|
||||||
List<IndelVariant> l = indels.get((int)( refPos - start ));
|
List<IndelVariant> l = indels.get((int)( refPos - start ));
|
||||||
if ( l == null ) return emptyIndelList;
|
if ( l == null ) return emptyIndelList;
|
||||||
|
|
@ -507,6 +586,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
public void add(SAMRecord r, char [] ref) {
|
public void add(SAMRecord r, char [] ref) {
|
||||||
final long rStart = r.getAlignmentStart();
|
final long rStart = r.getAlignmentStart();
|
||||||
final long rStop = r.getAlignmentEnd();
|
final long rStop = r.getAlignmentEnd();
|
||||||
|
final String readBases = r.getReadString().toUpperCase();
|
||||||
|
|
||||||
|
|
||||||
int localStart = (int)( rStart - start ); // start of the alignment wrt start of the current window
|
int localStart = (int)( rStart - start ); // start of the alignment wrt start of the current window
|
||||||
|
|
||||||
|
|
@ -528,22 +609,20 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
int posOnRead = 0;
|
int posOnRead = 0;
|
||||||
int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read:
|
int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read:
|
||||||
// its start on the actual full reference contig is r.getAlignmentStart()
|
// its start on the actual full reference contig is r.getAlignmentStart()
|
||||||
|
// int mm=0;
|
||||||
|
|
||||||
for ( int i = 0 ; i < nCigarElems ; i++ ) {
|
for ( int i = 0 ; i < nCigarElems ; i++ ) {
|
||||||
|
|
||||||
final CigarElement ce = c.getCigarElement(i);
|
final CigarElement ce = c.getCigarElement(i);
|
||||||
IndelVariant.Type type = null;
|
IndelVariant.Type type = null;
|
||||||
String bases = null;
|
String bases = null;
|
||||||
|
int eventPosition = posOnRef;
|
||||||
|
|
||||||
int indelPosition = 0; // indel position in our coverage window (i.e. relative to getStart()).
|
|
||||||
// note that here we assign indels to the first deleted base or to the first
|
|
||||||
// base after insertion
|
|
||||||
|
|
||||||
switch(ce.getOperator()) {
|
switch(ce.getOperator()) {
|
||||||
case I:
|
case I:
|
||||||
type = IndelVariant.Type.I;
|
type = IndelVariant.Type.I;
|
||||||
bases = r.getReadString().substring(posOnRead,posOnRead+ce.getLength());
|
bases = readBases.substring(posOnRead,posOnRead+ce.getLength());
|
||||||
indelPosition = localStart + posOnRef ;
|
|
||||||
// will increment position on the read below, there's no 'break' statement yet...
|
// will increment position on the read below, there's no 'break' statement yet...
|
||||||
case H:
|
case H:
|
||||||
case S:
|
case S:
|
||||||
|
|
@ -555,12 +634,13 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
case D:
|
case D:
|
||||||
type = IndelVariant.Type.D;
|
type = IndelVariant.Type.D;
|
||||||
bases = new String( ref, posOnRef, ce.getLength() );
|
bases = new String( ref, posOnRef, ce.getLength() );
|
||||||
indelPosition = localStart + posOnRef ;
|
|
||||||
posOnRef += ce.getLength();
|
posOnRef += ce.getLength();
|
||||||
break;
|
break;
|
||||||
case M:
|
case M: for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) {
|
||||||
posOnRef += ce.getLength();
|
if ( readBases.charAt(posOnRead) != Character.toUpperCase(ref[posOnRef]) ) { // mismatch!
|
||||||
posOnRead += ce.getLength();
|
mismatches.increment(localStart+posOnRef, 1); //mm++;
|
||||||
|
}
|
||||||
|
}
|
||||||
break; // advance along the gapless block in the alignment
|
break; // advance along the gapless block in the alignment
|
||||||
default :
|
default :
|
||||||
throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator());
|
throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator());
|
||||||
|
|
@ -573,7 +653,9 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
if ( i == nCigarElems - 1) logger.warn("Indel at the end of the read "+r.getReadName());
|
if ( i == nCigarElems - 1) logger.warn("Indel at the end of the read "+r.getReadName());
|
||||||
|
|
||||||
try {
|
try {
|
||||||
updateCount(indelPosition, type, bases);
|
// note that here we will be assigning indels to the first deleted base or to the first
|
||||||
|
// base after insertion, not to the last base before the event!
|
||||||
|
updateCount(localStart+eventPosition, type, bases);
|
||||||
} catch (IndexOutOfBoundsException e) {
|
} catch (IndexOutOfBoundsException e) {
|
||||||
System.out.println("Read "+r.getReadName()+": out of coverage window bounds.Probably window is too small.\n"+
|
System.out.println("Read "+r.getReadName()+": out of coverage window bounds.Probably window is too small.\n"+
|
||||||
"Read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+
|
"Read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+
|
||||||
|
|
@ -583,6 +665,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// System.out.println(r.getReadName()+"\t"+(r.getReadNegativeStrandFlag()?"RC":"FW")+"\t"+r.getCigarString()+"\t"+mm);
|
||||||
|
// System.out.println(AlignmentUtils.alignmentToString(r.getCigar(), readBases, new String(ref), 0));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -633,6 +717,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
start += offset;
|
start += offset;
|
||||||
coverageWindow.shiftData(offset);
|
coverageWindow.shiftData(offset);
|
||||||
indels.shiftData(offset);
|
indels.shiftData(offset);
|
||||||
|
mismatches.shiftData(offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue