fixed a bug found by Eric: genotyper would crash in the case of an indel too close to the window end, with the next read mapping sufficiently far away on the ref
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1313 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2db86b7829
commit
64221907a2
|
|
@ -1,13 +1,16 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.indels;
|
package org.broadinstitute.sting.playground.gatk.walkers.indels;
|
||||||
|
|
||||||
|
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.Transcript;
|
import org.broadinstitute.sting.gatk.refdata.Transcript;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
|
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||||
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
|
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
|
||||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||||
import org.broadinstitute.sting.playground.utils.CircularArray;
|
import org.broadinstitute.sting.playground.utils.CircularArray;
|
||||||
|
|
@ -164,6 +167,53 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void assignReadGroups1(final SAMFileHeader mergedHeader) {
|
||||||
|
|
||||||
|
SAMDataSource d = getToolkit().getDataSource();
|
||||||
|
|
||||||
|
Reads reads = d.getReadsInfo();
|
||||||
|
|
||||||
|
System.out.println(reads.getReadsFiles().size() + " input files");
|
||||||
|
|
||||||
|
System.out.println(d.getHeaderMerger().getReaders().size() +" readers instantiated");
|
||||||
|
for ( SAMFileReader r : d.getHeaderMerger().getReaders() ) {
|
||||||
|
System.out.print("----------\nread groups:");
|
||||||
|
for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
|
||||||
|
System.out.print(" "+g.getReadGroupId()+ " ("+g.getSample()+":"+g.getLibrary()+")");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
System.exit(1);
|
||||||
|
|
||||||
|
Set<String> normal = new HashSet<String>(); // list normal samples here
|
||||||
|
Set<String> tumor = new HashSet<String>(); // list tumor samples here
|
||||||
|
|
||||||
|
SAMFileReader rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(0));
|
||||||
|
for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
|
||||||
|
normal.add(new String(rec.getSample()));
|
||||||
|
}
|
||||||
|
rn.close();
|
||||||
|
rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(1));
|
||||||
|
for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
|
||||||
|
tumor.add(new String(rec.getSample()));
|
||||||
|
}
|
||||||
|
rn.close();
|
||||||
|
|
||||||
|
// now we know what samples are normal, and what are tumor; let's assign dynamic read groups we get in merged header:
|
||||||
|
for ( SAMReadGroupRecord mr : mergedHeader.getReadGroups() ) {
|
||||||
|
if ( normal.contains(mr.getSample() ) ) {
|
||||||
|
normal_samples.add( new String(mr.getReadGroupId()) );
|
||||||
|
System.out.println("Read group "+ mr.getReadGroupId() + "--> Sample "+ mr.getSample() + " (normal)");
|
||||||
|
} else if ( tumor.contains(mr.getSample() ) ) {
|
||||||
|
tumor_samples.add( new String(mr.getReadGroupId()) );
|
||||||
|
System.out.println("Read group "+ mr.getReadGroupId() + "--> Sample "+ mr.getSample() + " (tumor)");
|
||||||
|
} else throw new StingException("Unrecognized sample "+mr.getSample() +" in merged SAM stream");
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Integer map(char[] ref, SAMRecord read) {
|
public Integer map(char[] ref, SAMRecord read) {
|
||||||
|
|
||||||
|
|
@ -357,13 +407,15 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
*/
|
*/
|
||||||
private void emit(long position, boolean force) {
|
private void emit(long position, boolean force) {
|
||||||
|
|
||||||
long stop_at = position; // we will shift to this position instead of passed 'position'
|
long move_to = position; // we will shift to position move_to; it's initialized with 'position',
|
||||||
// argument if we did not cover MISMATCH_WIDTH bases around the last indel yet
|
// but it may end up being smaller (delayed shift), if we have not
|
||||||
|
// covered MISMATCH_WIDTH bases to the right of the last indel yet.
|
||||||
|
|
||||||
|
// walk along the coverage window and emit indels up to the position we are trying ot shift the window to
|
||||||
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
||||||
|
|
||||||
List<IndelVariant> variants = coverage.indelsAt(pos);
|
List<IndelVariant> variants = coverage.indelsAt(pos);
|
||||||
if ( variants.size() == 0 ) continue; // no indels
|
if ( variants.size() == 0 ) continue; // no indels at current position
|
||||||
|
|
||||||
// if we are here, we got a variant
|
// if we are here, we got a variant
|
||||||
|
|
||||||
|
|
@ -371,19 +423,26 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
if ( cov < minCoverage ) continue; // low coverage
|
if ( cov < minCoverage ) continue; // low coverage
|
||||||
|
|
||||||
|
// System.out.println("indel at "+pos);
|
||||||
|
|
||||||
|
// region around the current indel we need to have covered in order to compute mismatch rate:
|
||||||
long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() );
|
long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() );
|
||||||
long right = pos+MISMATCH_WIDTH;
|
long right = pos+MISMATCH_WIDTH;
|
||||||
|
|
||||||
if ( right > coverage.getStop() ) { // we do not have enough bases in the current window
|
// if right < position we are shifting to, we already have all the coverage within MISMATCH_WINDOW bases around the indel,
|
||||||
// in order to assess mismatch rate
|
// so we can proceed with counting mismatches and emitting the indel
|
||||||
if( force ) { // if we were asked to force-shift, then, well, shift anyway
|
|
||||||
right = coverage.getStop() ;
|
if ( right >= position && ! force) {
|
||||||
} else {
|
// we are not asked to force-shift, and there is more coverage around the current indel that we need to collect
|
||||||
// shift to the position prior to the last indel so that we could get all the mismatch counts around it later
|
|
||||||
stop_at = left;
|
// we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it;
|
||||||
|
// instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage
|
||||||
|
move_to = left;
|
||||||
|
// System.out.println("right="+right+" requested="+position+" stopped at="+left);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
if ( right > coverage.getStop() ) right = coverage.getStop(); // if indel is too close to the end of the window but we need to emit anyway
|
||||||
|
|
||||||
// count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side):
|
// count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side):
|
||||||
int total_mismatches = 0;
|
int total_mismatches = 0;
|
||||||
|
|
@ -405,12 +464,14 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
if ( verbose ) out.println(message + "\t"+ annotationString);
|
if ( verbose ) out.println(message + "\t"+ annotationString);
|
||||||
}
|
}
|
||||||
|
// else { System.out.println("not a call: count="+p.first.count+" total_count="+p.second+" cov="+cov+
|
||||||
|
// " minConsensusF="+((double)p.first.count)/p.second+" minF="+((double)p.first.count)/cov); }
|
||||||
// for ( IndelVariant var : variants ) {
|
// for ( IndelVariant var : variants ) {
|
||||||
// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
|
// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
|
||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
// System.out.println("Shifting to " + move_to+" ("+position+")");
|
||||||
coverage.shift((int)(stop_at - coverage.getStart() ) );
|
coverage.shift((int)(move_to - coverage.getStart() ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed
|
/** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed
|
||||||
|
|
@ -420,7 +481,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
*/
|
*/
|
||||||
private void emit_somatic(long position, boolean force) {
|
private void emit_somatic(long position, boolean force) {
|
||||||
|
|
||||||
long stop_at = position;
|
long move_to = position;
|
||||||
|
|
||||||
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
||||||
|
|
||||||
|
|
@ -440,16 +501,16 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() );
|
long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() );
|
||||||
long right = pos+MISMATCH_WIDTH;
|
long right = pos+MISMATCH_WIDTH;
|
||||||
|
|
||||||
if ( right > coverage.getStop() ) { // we do not have enough bases in the current window
|
if ( right >= position && ! force) {
|
||||||
// in order to assess mismatch rate
|
// we are not asked to force-shift, and there is more coverage around the current indel that we need to collect
|
||||||
if( force ) { // if we were asked to force-shift, then, well, shift anyway
|
|
||||||
right = coverage.getStop() ;
|
// we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it;
|
||||||
} else {
|
// instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage
|
||||||
// shift to the position prior to the last indel so that we could get all the mismatch counts around it later
|
move_to = left;
|
||||||
stop_at = left;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
if ( right > coverage.getStop() ) right = coverage.getStop(); // if indel is too close to the end of the window but we need to emit anyway
|
||||||
|
|
||||||
// count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side):
|
// count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side):
|
||||||
int total_mismatches_normal = 0;
|
int total_mismatches_normal = 0;
|
||||||
|
|
@ -500,8 +561,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
coverage.shift((int)(stop_at - coverage.getStart() ) );
|
coverage.shift((int)(move_to - coverage.getStart() ) );
|
||||||
normal_coverage.shift((int)(stop_at - normal_coverage.getStart() ) );
|
normal_coverage.shift((int)(move_to - normal_coverage.getStart() ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue