From 64221907a2e44c86a560188efd7fcda45b759082 Mon Sep 17 00:00:00 2001 From: asivache Date: Fri, 24 Jul 2009 21:00:31 +0000 Subject: [PATCH] fixed a bug found by Eric: genotyper would crash in the case of an indel too close to the window end, with the next read mapping sufficiently far away on the ref git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1313 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/IndelGenotyperWalker.java | 115 ++++++++++++++---- 1 file changed, 88 insertions(+), 27 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelGenotyperWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelGenotyperWalker.java index 11d017c2f..2f02fc574 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelGenotyperWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelGenotyperWalker.java @@ -1,13 +1,16 @@ package org.broadinstitute.sting.playground.gatk.walkers.indels; +import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.refdata.RODIterator; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.Transcript; import org.broadinstitute.sting.gatk.refdata.rodRefSeq; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.playground.utils.CircularArray; @@ -164,6 +167,53 @@ public class IndelGenotyperWalker extends ReadWalker { } + + void assignReadGroups1(final SAMFileHeader mergedHeader) { + + SAMDataSource d = getToolkit().getDataSource(); + + Reads reads = d.getReadsInfo(); + + System.out.println(reads.getReadsFiles().size() + " input files"); + + System.out.println(d.getHeaderMerger().getReaders().size() +" readers instantiated"); + for ( SAMFileReader r : d.getHeaderMerger().getReaders() ) { + System.out.print("----------\nread groups:"); + for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) { + System.out.print(" "+g.getReadGroupId()+ " ("+g.getSample()+":"+g.getLibrary()+")"); + } + } + + System.exit(1); + + Set normal = new HashSet(); // list normal samples here + Set tumor = new HashSet(); // list tumor samples here + + SAMFileReader rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(0)); + for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) { + normal.add(new String(rec.getSample())); + } + rn.close(); + rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(1)); + for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) { + tumor.add(new String(rec.getSample())); + } + rn.close(); + + // now we know what samples are normal, and what are tumor; let's assign dynamic read groups we get in merged header: + for ( SAMReadGroupRecord mr : mergedHeader.getReadGroups() ) { + if ( normal.contains(mr.getSample() ) ) { + normal_samples.add( new String(mr.getReadGroupId()) ); + System.out.println("Read group "+ mr.getReadGroupId() + "--> Sample "+ mr.getSample() + " (normal)"); + } else if ( tumor.contains(mr.getSample() ) ) { + tumor_samples.add( new String(mr.getReadGroupId()) ); + System.out.println("Read group "+ mr.getReadGroupId() + "--> Sample "+ mr.getSample() + " (tumor)"); + } else throw new StingException("Unrecognized sample "+mr.getSample() +" in merged SAM stream"); + } + System.out.println(); + + } + @Override public Integer map(char[] ref, SAMRecord read) { @@ -357,13 +407,15 @@ public class IndelGenotyperWalker extends ReadWalker { */ private void emit(long position, boolean force) { - long stop_at = position; // we will shift to this position instead of passed 'position' - // argument if we did not cover MISMATCH_WIDTH bases around the last indel yet + long move_to = position; // we will shift to position move_to; it's initialized with 'position', + // but it may end up being smaller (delayed shift), if we have not + // covered MISMATCH_WIDTH bases to the right of the last indel yet. + // walk along the coverage window and emit indels up to the position we are trying ot shift the window to for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) { List variants = coverage.indelsAt(pos); - if ( variants.size() == 0 ) continue; // no indels + if ( variants.size() == 0 ) continue; // no indels at current position // if we are here, we got a variant @@ -371,20 +423,27 @@ public class IndelGenotyperWalker extends ReadWalker { if ( cov < minCoverage ) continue; // low coverage +// System.out.println("indel at "+pos); + + // region around the current indel we need to have covered in order to compute mismatch rate: long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() ); long right = pos+MISMATCH_WIDTH; - if ( right > coverage.getStop() ) { // we do not have enough bases in the current window - // in order to assess mismatch rate - if( force ) { // if we were asked to force-shift, then, well, shift anyway - right = coverage.getStop() ; - } else { - // shift to the position prior to the last indel so that we could get all the mismatch counts around it later - stop_at = left; - break; - } + // if right < position we are shifting to, we already have all the coverage within MISMATCH_WINDOW bases around the indel, + // so we can proceed with counting mismatches and emitting the indel + + if ( right >= position && ! force) { + // we are not asked to force-shift, and there is more coverage around the current indel that we need to collect + + // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; + // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage + move_to = left; +// System.out.println("right="+right+" requested="+position+" stopped at="+left); + break; } + if ( right > coverage.getStop() ) right = coverage.getStop(); // if indel is too close to the end of the window but we need to emit anyway + // count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side): int total_mismatches = 0; for ( long k = left; k <= right ; k++ ) total_mismatches+=coverage.mismatchesAt(k); @@ -404,13 +463,15 @@ public class IndelGenotyperWalker extends ReadWalker { String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotation)); if ( verbose ) out.println(message + "\t"+ annotationString); - } + } +// else { System.out.println("not a call: count="+p.first.count+" total_count="+p.second+" cov="+cov+ +// " minConsensusF="+((double)p.first.count)/p.second+" minF="+((double)p.first.count)/cov); } // for ( IndelVariant var : variants ) { // System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); // } } - - coverage.shift((int)(stop_at - coverage.getStart() ) ); +// System.out.println("Shifting to " + move_to+" ("+position+")"); + coverage.shift((int)(move_to - coverage.getStart() ) ); } /** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed @@ -420,7 +481,7 @@ public class IndelGenotyperWalker extends ReadWalker { */ private void emit_somatic(long position, boolean force) { - long stop_at = position; + long move_to = position; for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) { @@ -440,17 +501,17 @@ public class IndelGenotyperWalker extends ReadWalker { long left = Math.max( pos-MISMATCH_WIDTH, coverage.getStart() ); long right = pos+MISMATCH_WIDTH; - if ( right > coverage.getStop() ) { // we do not have enough bases in the current window - // in order to assess mismatch rate - if( force ) { // if we were asked to force-shift, then, well, shift anyway - right = coverage.getStop() ; - } else { - // shift to the position prior to the last indel so that we could get all the mismatch counts around it later - stop_at = left; - break; - } + if ( right >= position && ! force) { + // we are not asked to force-shift, and there is more coverage around the current indel that we need to collect + + // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; + // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage + move_to = left; + break; } + if ( right > coverage.getStop() ) right = coverage.getStop(); // if indel is too close to the end of the window but we need to emit anyway + // count mismatches around the current indel, inside specified window (MISMATCH_WIDTH on each side): int total_mismatches_normal = 0; int total_mismatches_tumor = 0; @@ -500,8 +561,8 @@ public class IndelGenotyperWalker extends ReadWalker { // } } - coverage.shift((int)(stop_at - coverage.getStart() ) ); - normal_coverage.shift((int)(stop_at - normal_coverage.getStart() ) ); + coverage.shift((int)(move_to - coverage.getStart() ) ); + normal_coverage.shift((int)(move_to - normal_coverage.getStart() ) ); }