From 599ceeddd8318ac375fb9723a8f86de9e4c8e6c4 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 11 Jun 2009 16:57:40 +0000 Subject: [PATCH] Better method for downsampling deep regions git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@983 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/IntervalCleanerWalker.java | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java index caae7e804..46f718e3e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java @@ -271,9 +271,20 @@ public class IntervalCleanerWalker extends LocusWindowWalker // if we have too many reads with mismatches, be greedy if ( altReads.size() > GREEDY_THRESHOLD) { logger.debug("Downsampling from " + altReads.size() + " to " + GREEDY_THRESHOLD + " mismatching reads"); - //sortByGreedy(); - for ( int i = GREEDY_THRESHOLD; i < altReads.size(); i++) - altAlignmentsToTest.set(i, false); + // the best thing to do here is to randomly sample from the reads + // however, we definitely do want to keep the clean indel-containing reads + // (which were purposely placed at the beginning of the list) + int downsampleTo = GREEDY_THRESHOLD - priorIndelsToTest.size(); + int sampleRate = (altReads.size() - priorIndelsToTest.size()) / downsampleTo; + for ( int i = 0; i < downsampleTo; i++) { + int index = priorIndelsToTest.size() + (i * sampleRate); + for ( int j = 1; j < sampleRate; j++) + altAlignmentsToTest.set(index+j, false); + } + // also get the trailing reads + int tail = priorIndelsToTest.size() + (downsampleTo * sampleRate); + for ( int i = tail; i < altAlignmentsToTest.size(); i++) + altAlignmentsToTest.set(i, false); } Consensus bestConsensus = null; @@ -710,7 +721,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker // got maximum possible shift after checking period=1 above. } - // if ( ce2.getLength() >= 2 ) + // if ( ce2.getLength() >= 2 ) // System.out.println("-----------------------------------\n FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex, (readIsConsensusSequence?refIndex:0))); @@ -720,7 +731,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker newCigar.add(ce2); newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M)); // System.out.println(" FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex)); - // if ( ce2.getLength() >=2 ) + // if ( ce2.getLength() >=2 ) // System.out.println(" REALIGNED TO:\n"+AlignmentUtils.alignmentToString(newCigar,readSeq,refSeq,refIndex,readIndex)+"\n"); logger.debug("Realigning indel: " + cigarToString(cigar) + " to " + cigarToString(newCigar));