From 919e995b7fefd2d0b0b33484fdfa6bbf3cafc633 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 27 May 2009 16:34:24 +0000 Subject: [PATCH] -Moved my walkers to indels directory -Removed entropy walker and replaced it with mismatch (column) walker -Some improvements to the cleaner (more to come) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@830 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/EntropyIntervalWalker.java | 98 ----------------- .../indels/CoverageGapIntervalWalker.java | 71 ++++++++++++ .../{ => indels}/IndelIntervalWalker.java | 0 .../{ => indels}/IntervalCleanerWalker.java | 0 .../indels/MismatchIntervalWalker.java | 102 ++++++++++++++++++ 5 files changed, 173 insertions(+), 98 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/EntropyIntervalWalker.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CoverageGapIntervalWalker.java rename java/src/org/broadinstitute/sting/playground/gatk/walkers/{ => indels}/IndelIntervalWalker.java (100%) rename java/src/org/broadinstitute/sting/playground/gatk/walkers/{ => indels}/IntervalCleanerWalker.java (100%) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/MismatchIntervalWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/EntropyIntervalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/EntropyIntervalWalker.java deleted file mode 100755 index cef9a8234..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/EntropyIntervalWalker.java +++ /dev/null @@ -1,98 +0,0 @@ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.util.*; - -@WalkerName("EntropyIntervals") -public class EntropyIntervalWalker extends LocusWalker, Pair, GenomeLoc>> { - @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false) - public int windowSize = 10; - - public void initialize() { - if ( windowSize < 1) - throw new RuntimeException("Window Size must be a positive integer"); - } - - public Pair map(RefMetaDataTracker tracker, char ref, LocusContext context) { - // return the entropy of this locus - int[] baseCounts = new int[4]; - for (int i=0; i < 4; i++) - baseCounts[i] = 0; - - List reads = context.getReads(); - List offsets = context.getOffsets(); - int goodBases = 0; - double errorRate = 0.0; - for (int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - int base = BaseUtils.simpleBaseToBaseIndex((char)read.getReadBases()[offset]); - if ( base != -1 ) { - errorRate += Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0); - baseCounts[base]++; - goodBases++; - } - } - - double expectedEntropy = (errorRate * Math.log(errorRate)) + ((1-errorRate) * Math.log(1-errorRate)); - double observedEntropy = 0.0; - if ( goodBases > 0 ) { - for (int i=0; i < 4; i++) { - double Pjk = (double)baseCounts[i] / (double)goodBases; - if ( Pjk > 0 ) - observedEntropy += Pjk * Math.log(Pjk); - } - if ( observedEntropy != 0 ) - observedEntropy *= -1; - } - - double locusEntropy = (observedEntropy > expectedEntropy ? (observedEntropy-expectedEntropy) : 0.0); - return new Pair(context.getLocation(), locusEntropy); - } - - public void onTraversalDone() {} - - public Pair, GenomeLoc> reduceInit() { - return new Pair, GenomeLoc>(new LinkedList(), null); - } - - public Pair, GenomeLoc> reduce(Pair value, Pair, GenomeLoc> sum) { - sum.first.addLast(value.second); - if ( sum.first.size() <= windowSize ) - return sum; - - sum.first.remove(); - double avgEntropy = 0.0; - for (int i = 0; i < windowSize; i++) - avgEntropy += sum.first.get(i); - avgEntropy /= windowSize; - - if ( avgEntropy > 0.001 ) { - //out.println(avgEntropy); - - // if there is no interval to the left, then this is the first one - if ( sum.second == null ) { - sum.second = value.first; - } - // if the intervals don't overlap, print out the leftmost one and start a new one - else if ( !sum.second.contiguousP(value.first) ) { - out.println(sum.second); - sum.second = value.first; - } - // otherwise, merge them - else { - sum.second = sum.second.merge(value.first); - } - } - - return sum; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CoverageGapIntervalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CoverageGapIntervalWalker.java new file mode 100755 index 000000000..babbdca81 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CoverageGapIntervalWalker.java @@ -0,0 +1,71 @@ +package org.broadinstitute.sting.playground.gatk.walkers; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.*; + +@WalkerName("CoverageGapIntervals") +public class CoverageGapIntervalWalker extends LocusWalker, GenomeLoc> { + + private final int minReadsAtInterval = 10; + + public void initialize() {} + + public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { + int goodReads = 0; + List reads = context.getReads(); + for (int i = 0; i < reads.size(); i++ ) { + if ( reads.get(i).getMappingQuality() > 0 ) + goodReads++; + } + return goodReads >= minReadsAtInterval; + } + + public Pair map(RefMetaDataTracker tracker, char ref, LocusContext context) { + // find the probability that this locus has a statistically significant gap in coverage + List reads = context.getReads(); + List offsets = context.getOffsets(); + int totalXi = 0; + for (int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + if ( read.getMappingQuality() == 0 ) + continue; + int halfLength = read.getReadString().length() >> 1; + int distanceFromMiddle = Math.abs(offsets.get(i) - halfLength); + int quarterLength = halfLength >> 1; + + // Xi is < 0 if you are closer to the middle than the quartile + // and is > 0 if further to the middle than quartile + // We expect the total sum of Xi over an interval to be ~0 + int Xi = distanceFromMiddle - quarterLength; + totalXi += Xi; + } + + return new Pair(context.getLocation(), totalXi); + } + + public void onTraversalDone() {} + + public GenomeLoc reduceInit() { + return null; + } + + public GenomeLoc reduce(Pair value, GenomeLoc sum) { + if ( value.second > 1000 ) { + if ( sum != null ) + sum.setStop(value.first.getStop()); + else + sum = value.first; + } else if ( sum != null ) { + out.println(sum); + sum = null; + } + return sum; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IndelIntervalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelIntervalWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/IndelIntervalWalker.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelIntervalWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java similarity index 100% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/MismatchIntervalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/MismatchIntervalWalker.java new file mode 100755 index 000000000..bef298672 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/MismatchIntervalWalker.java @@ -0,0 +1,102 @@ + +package org.broadinstitute.sting.playground.gatk.walkers; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.*; + +@WalkerName("MismatchIntervals") +public class MismatchIntervalWalker extends LocusWalker, Pair, GenomeLoc>> { + @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false) + public int windowSize = 10; + @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of mismatching base qualities threshold", required=false) + public double mismatchThreshold = 0.20; + + private final int minReadsAtInterval = 4; + + public void initialize() { + if ( windowSize < 1) + throw new RuntimeException("Window Size must be a positive integer"); + } + + public Pair map(RefMetaDataTracker tracker, char ref, LocusContext context) { + char upperRef = Character.toUpperCase(ref); + List reads = context.getReads(); + List offsets = context.getOffsets(); + int goodReads = 0, mismatchQualities = 0, totalQualities = 0; + for (int i = 0; i < reads.size(); i++) { + SAMRecord read = reads.get(i); + if ( read.getMappingQuality() == 0 ) + continue; + + goodReads++; + int offset = offsets.get(i); + int quality = read.getBaseQualityString().charAt(offset) - 33; + totalQualities += quality; + + char base = Character.toUpperCase((char)read.getReadBases()[offset]); + if ( base != upperRef ) + mismatchQualities += quality; + } + + boolean flag = false; + if ( goodReads >= minReadsAtInterval && (double)mismatchQualities / (double)totalQualities > mismatchThreshold ) + flag = true; + return new Pair(context.getLocation(), flag); + } + + public void onTraversalDone(Pair, GenomeLoc> sum) { + if (sum.second != null) + out.println(sum.second); + } + + public Pair, GenomeLoc> reduceInit() { + return new Pair, GenomeLoc>(new LinkedList(), null); + } + + public Pair, GenomeLoc> reduce(Pair value, Pair, GenomeLoc> sum) { + sum.first.addLast(value.second); + if ( sum.first.size() <= windowSize ) + return sum; + + sum.first.remove(); + if ( !value.second ) + return sum; + + int mismatches = 0; + int firstMismatch = -1; + for (int i = 0; i < windowSize; i++) { + if ( sum.first.get(i) ) { + mismatches++; + if ( firstMismatch == -1 ) + firstMismatch = i; + } + } + + if ( mismatches > 1 ) { + // if there is no interval to the left, then this is the first one + if ( sum.second == null ) { + sum.second = value.first; + sum.second.setStart(sum.second.getStart() - windowSize + firstMismatch + 1); + } + // if the intervals don't overlap, print out the leftmost one and start a new one + else if ( value.first.getStop() - sum.second.getStop() > windowSize ) { + out.println(sum.second); + sum.second = value.first; + sum.second.setStart(sum.second.getStart() - windowSize + firstMismatch + 1); + } + // otherwise, merge them + else { + sum.second.setStop(value.first.getStop()); + } + } + + return sum; + } +} \ No newline at end of file