From 5dda448ae0f3fc58e3843174d1af2433da834cb4 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 14 May 2009 13:59:48 +0000 Subject: [PATCH] 1. Add printouts for the cleaner 2. First pass at the entropy interval walker (still needs work) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@696 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/EntropyIntervalWalker.java | 98 +++++++++++++++++++ .../gatk/walkers/IntervalCleanerWalker.java | 17 ++++ 2 files changed, 115 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/EntropyIntervalWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/EntropyIntervalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/EntropyIntervalWalker.java new file mode 100755 index 000000000..cef9a8234 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/EntropyIntervalWalker.java @@ -0,0 +1,98 @@ + +package org.broadinstitute.sting.playground.gatk.walkers; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.*; + +@WalkerName("EntropyIntervals") +public class EntropyIntervalWalker extends LocusWalker, Pair, GenomeLoc>> { + @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false) + public int windowSize = 10; + + public void initialize() { + if ( windowSize < 1) + throw new RuntimeException("Window Size must be a positive integer"); + } + + public Pair map(RefMetaDataTracker tracker, char ref, LocusContext context) { + // return the entropy of this locus + int[] baseCounts = new int[4]; + for (int i=0; i < 4; i++) + baseCounts[i] = 0; + + List reads = context.getReads(); + List offsets = context.getOffsets(); + int goodBases = 0; + double errorRate = 0.0; + for (int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + int base = BaseUtils.simpleBaseToBaseIndex((char)read.getReadBases()[offset]); + if ( base != -1 ) { + errorRate += Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0); + baseCounts[base]++; + goodBases++; + } + } + + double expectedEntropy = (errorRate * Math.log(errorRate)) + ((1-errorRate) * Math.log(1-errorRate)); + double observedEntropy = 0.0; + if ( goodBases > 0 ) { + for (int i=0; i < 4; i++) { + double Pjk = (double)baseCounts[i] / (double)goodBases; + if ( Pjk > 0 ) + observedEntropy += Pjk * Math.log(Pjk); + } + if ( observedEntropy != 0 ) + observedEntropy *= -1; + } + + double locusEntropy = (observedEntropy > expectedEntropy ? (observedEntropy-expectedEntropy) : 0.0); + return new Pair(context.getLocation(), locusEntropy); + } + + public void onTraversalDone() {} + + public Pair, GenomeLoc> reduceInit() { + return new Pair, GenomeLoc>(new LinkedList(), null); + } + + public Pair, GenomeLoc> reduce(Pair value, Pair, GenomeLoc> sum) { + sum.first.addLast(value.second); + if ( sum.first.size() <= windowSize ) + return sum; + + sum.first.remove(); + double avgEntropy = 0.0; + for (int i = 0; i < windowSize; i++) + avgEntropy += sum.first.get(i); + avgEntropy /= windowSize; + + if ( avgEntropy > 0.001 ) { + //out.println(avgEntropy); + + // if there is no interval to the left, then this is the first one + if ( sum.second == null ) { + sum.second = value.first; + } + // if the intervals don't overlap, print out the leftmost one and start a new one + else if ( !sum.second.contiguousP(value.first) ) { + out.println(sum.second); + sum.second = value.first; + } + // otherwise, merge them + else { + sum.second = sum.second.merge(value.first); + } + } + + return sum; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java index a256d746d..9a9cda030 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java @@ -140,6 +140,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker StringBuffer sb = new StringBuffer(); sb.append(reference.substring(0, refIdx)); Cigar c = swConsensus.getCigar(); + //out.println("CIGAR = " + cigarToString(c)); int indelCount = 0; int altIdx = 0; @@ -564,4 +565,20 @@ public class IntervalCleanerWalker extends LocusWindowWalker for ( AlignedRead aRec : altReads ) readsToWrite.add(new ComparableSAMRecord(aRec.getRead())); } + + public static String cigarToString(Cigar cig) { + StringBuilder b = new StringBuilder(); + + for ( int i = 0 ; i < cig.numCigarElements() ; i++ ) { + char c='?'; + switch ( cig.getCigarElement(i).getOperator() ) { + case M : c = 'M'; break; + case D : c = 'D'; break; + case I : c = 'I'; break; + } + b.append(cig.getCigarElement(i).getLength()); + b.append(c); + } + return b.toString(); + } } \ No newline at end of file