Better method for downsampling deep regions

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@983 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-06-11 16:57:40 +00:00
parent 4d9a88153a
commit 599ceeddd8
1 changed file with 16 additions and 5 deletions

View File

@@ -271,8 +271,19 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// if we have too many reads with mismatches, be greedy // if we have too many reads with mismatches, be greedy
if ( altReads.size() > GREEDY_THRESHOLD) { if ( altReads.size() > GREEDY_THRESHOLD) {
logger.debug("Downsampling from " + altReads.size() + " to " + GREEDY_THRESHOLD + " mismatching reads"); logger.debug("Downsampling from " + altReads.size() + " to " + GREEDY_THRESHOLD + " mismatching reads");
//sortByGreedy(); // the best thing to do here is to randomly sample from the reads
for ( int i = GREEDY_THRESHOLD; i < altReads.size(); i++) // however, we definitely do want to keep the clean indel-containing reads
// (which were purposely placed at the beginning of the list)
int downsampleTo = GREEDY_THRESHOLD - priorIndelsToTest.size();
int sampleRate = (altReads.size() - priorIndelsToTest.size()) / downsampleTo;
for ( int i = 0; i < downsampleTo; i++) {
int index = priorIndelsToTest.size() + (i * sampleRate);
for ( int j = 1; j < sampleRate; j++)
altAlignmentsToTest.set(index+j, false);
}
// also get the trailing reads
int tail = priorIndelsToTest.size() + (downsampleTo * sampleRate);
for ( int i = tail; i < altAlignmentsToTest.size(); i++)
altAlignmentsToTest.set(i, false); altAlignmentsToTest.set(i, false);
} }