-Moved my walkers to indels directory

-Removed entropy walker and replaced it with mismatch (column) walker
-Some improvements to the cleaner (more to come)



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@830 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-05-27 16:34:24 +00:00
parent df8490a0cf
commit 919e995b7f
5 changed files with 173 additions and 98 deletions

View File

@ -1,98 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
 * Locus walker that reports intervals whose windowed average "excess entropy"
 * is elevated.
 *
 * <p>{@link #map} scores each locus by how much the entropy of the observed
 * base distribution exceeds the entropy expected from the reported base
 * qualities. {@link #reduce} keeps the last {@code windowSize} scores, averages
 * them, and grows or emits an interval whenever the average passes
 * {@link #ENTROPY_THRESHOLD}.
 */
@WalkerName("EntropyIntervals")
public class EntropyIntervalWalker extends LocusWalker<Pair<GenomeLoc, Double>, Pair<LinkedList<Double>, GenomeLoc>> {
    @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false)
    public int windowSize = 10;

    /** Windowed average excess entropy above which a locus is added to an interval. */
    private static final double ENTROPY_THRESHOLD = 0.001;

    /**
     * Validates the command-line arguments.
     *
     * @throws RuntimeException if {@code windowSize} is smaller than 1
     */
    public void initialize() {
        if ( windowSize < 1 ) {
            throw new RuntimeException("Window Size must be a positive integer");
        }
    }

    /**
     * Computes the excess entropy of a single locus.
     *
     * @param tracker reference meta-data for the locus (unused here)
     * @param ref     reference base at the locus (unused here)
     * @param context reads and per-read offsets covering the locus
     * @return the locus paired with {@code max(0, observed - expected)} entropy;
     *         0.0 when no read contributes a recognised base
     */
    public Pair<GenomeLoc, Double> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
        // Java zero-initialises the array, so no explicit clearing loop is needed.
        int[] baseCounts = new int[4];

        List<SAMRecord> reads = context.getReads();
        List<Integer> offsets = context.getOffsets();

        int goodBases = 0;
        double errorSum = 0.0;
        for (int i = 0; i < reads.size(); i++) {
            SAMRecord read = reads.get(i);
            int offset = offsets.get(i);
            int base = BaseUtils.simpleBaseToBaseIndex((char)read.getReadBases()[offset]);
            if ( base != -1 ) {
                // Phred quality -> error probability: 10^(-Q/10).
                errorSum += Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
                baseCounts[base]++;
                goodBases++;
            }
        }

        double locusEntropy = 0.0;
        if ( goodBases > 0 ) {
            // Use the MEAN error probability. The raw sum exceeds 1 once enough reads
            // pile up, which made log(1 - errorRate) NaN and silently zeroed the score.
            double expectedEntropy = binaryEntropy(errorSum / goodBases);

            double observedEntropy = 0.0;
            for (int count : baseCounts) {
                if ( count > 0 ) {
                    double fraction = (double)count / (double)goodBases;
                    observedEntropy -= fraction * Math.log(fraction);
                }
            }

            if ( observedEntropy > expectedEntropy ) {
                locusEntropy = observedEntropy - expectedEntropy;
            }
        }
        return new Pair<GenomeLoc, Double>(context.getLocation(), locusEntropy);
    }

    /**
     * Entropy (natural log) of a two-outcome distribution with probability {@code p}.
     * Returns 0 for degenerate probabilities so that {@code log(0)} never produces NaN.
     */
    private static double binaryEntropy(double p) {
        if ( p <= 0.0 || p >= 1.0 ) {
            return 0.0;
        }
        // Negated so the result is non-negative, matching the sign of the observed entropy.
        return -(p * Math.log(p) + (1.0 - p) * Math.log(1.0 - p));
    }

    public void onTraversalDone() {}

    /**
     * Prints the interval that is still open when traversal finishes.
     * NOTE(review): presumably invoked by the traversal engine with the final
     * reduce value - confirm against the Walker base class in this revision.
     *
     * @param sum final reduce value; may carry an unprinted interval in {@code second}
     */
    public void onTraversalDone(Pair<LinkedList<Double>, GenomeLoc> sum) {
        if ( sum != null && sum.second != null ) {
            out.println(sum.second);
        }
    }

    /** @return an empty score window with no open interval */
    public Pair<LinkedList<Double>, GenomeLoc> reduceInit() {
        return new Pair<LinkedList<Double>, GenomeLoc>(new LinkedList<Double>(), null);
    }

    /**
     * Slides the score window forward by one locus and updates the open interval.
     *
     * @param value locus and its excess entropy, as produced by {@link #map}
     * @param sum   {@code first}: the most recent scores; {@code second}: the
     *              interval currently being extended, or null
     * @return the same {@code sum} instance, updated in place
     */
    public Pair<LinkedList<Double>, GenomeLoc> reduce(Pair<GenomeLoc, Double> value, Pair<LinkedList<Double>, GenomeLoc> sum) {
        LinkedList<Double> window = sum.first;
        window.addLast(value.second);
        if ( window.size() > windowSize ) {
            window.removeFirst();
        }
        // Wait until the window is full; the very first full window is evaluated too.
        if ( window.size() < windowSize ) {
            return sum;
        }

        // Iterate instead of calling get(i), which is linear per call on a LinkedList.
        double total = 0.0;
        for (double entropy : window) {
            total += entropy;
        }
        double avgEntropy = total / windowSize;

        if ( avgEntropy > ENTROPY_THRESHOLD ) {
            if ( sum.second == null ) {
                // no interval to the left: this locus starts the first one
                sum.second = value.first;
            } else if ( !sum.second.contiguousP(value.first) ) {
                // not contiguous with the open interval: emit it and start a new one
                out.println(sum.second);
                sum.second = value.first;
            } else {
                // contiguous: extend the open interval
                sum.second = sum.second.merge(value.first);
            }
        }
        return sum;
    }
}

View File

@ -0,0 +1,71 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
 * Locus walker that prints intervals where reads cover a locus predominantly
 * with their ends rather than their middles.
 *
 * <p>Loci with fewer than {@link #MIN_READS_AT_INTERVAL} reads of non-zero
 * mapping quality are rejected by {@link #filter}. For the rest, {@link #map}
 * sums a per-read score that is positive when the locus lies in the outer
 * part of the read, and {@link #reduce} joins consecutive loci whose total
 * exceeds {@link #GAP_SCORE_THRESHOLD} into one interval.
 */
@WalkerName("CoverageGapIntervals")
public class CoverageGapIntervalWalker extends LocusWalker<Pair<GenomeLoc, Integer>, GenomeLoc> {
    /** Minimum number of reads with mapping quality > 0 required to examine a locus. */
    private static final int MIN_READS_AT_INTERVAL = 10;

    /** Summed per-read score above which a locus is considered part of a gap. */
    private static final int GAP_SCORE_THRESHOLD = 1000;

    public void initialize() {}

    /**
     * @return true when at least {@link #MIN_READS_AT_INTERVAL} reads at this
     *         locus have a mapping quality greater than zero
     */
    public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
        int goodReads = 0;
        for (SAMRecord read : context.getReads()) {
            if ( read.getMappingQuality() > 0 && ++goodReads >= MIN_READS_AT_INTERVAL ) {
                // Enough evidence already; no need to scan the remaining reads.
                return true;
            }
        }
        return false;
    }

    /**
     * Scores how far from the read centres this locus sits, summed over all
     * reads with non-zero mapping quality.
     *
     * @param tracker reference meta-data for the locus (unused here)
     * @param ref     reference base at the locus (unused here)
     * @param context reads and per-read offsets covering the locus
     * @return the locus paired with the summed score
     */
    public Pair<GenomeLoc, Integer> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
        List<SAMRecord> reads = context.getReads();
        List<Integer> offsets = context.getOffsets();

        int totalXi = 0;
        for (int i = 0; i < reads.size(); i++) {
            SAMRecord read = reads.get(i);
            if ( read.getMappingQuality() == 0 ) {
                continue;
            }

            // Take the length from the base array rather than building the read string.
            int halfLength = read.getReadBases().length >> 1;
            int quarterLength = halfLength >> 1;
            int distanceFromMiddle = Math.abs(offsets.get(i) - halfLength);

            // The per-read term is < 0 when the locus is closer to the read's middle
            // than a quarter of its length, and > 0 when it is further out.
            // We expect the total over an interval to be ~0.
            totalXi += distanceFromMiddle - quarterLength;
        }
        return new Pair<GenomeLoc, Integer>(context.getLocation(), totalXi);
    }

    public void onTraversalDone() {}

    /**
     * Prints the interval that is still open when traversal finishes; without
     * this, a gap running up to the last visited locus is never reported.
     * NOTE(review): presumably invoked by the traversal engine with the final
     * reduce value - confirm against the Walker base class in this revision.
     *
     * @param sum the final reduce value, or null when no interval is open
     */
    public void onTraversalDone(GenomeLoc sum) {
        if ( sum != null ) {
            out.println(sum);
        }
    }

    /** @return null, meaning no interval is open yet */
    public GenomeLoc reduceInit() {
        return null;
    }

    /**
     * Extends, starts, or emits the open interval depending on the locus score.
     *
     * @param value locus and its summed score, as produced by {@link #map}
     * @param sum   the interval currently being extended, or null
     * @return the interval still open after this locus, or null
     */
    public GenomeLoc reduce(Pair<GenomeLoc, Integer> value, GenomeLoc sum) {
        if ( value.second > GAP_SCORE_THRESHOLD ) {
            if ( sum == null ) {
                return value.first;
            }
            // NOTE(review): loci rejected by filter() never reach reduce, so this may
            // bridge positions that are not adjacent - confirm that is intended.
            sum.setStop(value.first.getStop());
            return sum;
        }

        if ( sum != null ) {
            // The run of high-scoring loci has ended: report it.
            out.println(sum);
        }
        return null;
    }
}

View File

@ -0,0 +1,102 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
 * Locus walker that prints intervals containing clusters of mismatching loci.
 *
 * <p>{@link #map} flags a locus when the quality-weighted fraction of bases
 * disagreeing with the reference exceeds {@code mismatchThreshold}.
 * {@link #reduce} keeps the flags of the last {@code windowSize} loci and,
 * when the newest locus is flagged and the window holds more than one flag,
 * starts or extends an interval reaching back to the first flag in the window.
 */
@WalkerName("MismatchIntervals")
public class MismatchIntervalWalker extends LocusWalker<Pair<GenomeLoc, Boolean>, Pair<LinkedList<Boolean>, GenomeLoc>> {
    // Doc text previously said "entropy", a leftover from the walker this one replaced.
    @Argument(fullName="windowSize", shortName="window", doc="window size for detecting clustered mismatches", required=false)
    public int windowSize = 10;
    @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of mismatching base qualities threshold", required=false)
    public double mismatchThreshold = 0.20;

    /** Minimum number of reads with mapping quality > 0 required to flag a locus. */
    private static final int MIN_READS_AT_INTERVAL = 4;

    /**
     * Validates the command-line arguments.
     *
     * @throws RuntimeException if {@code windowSize} is smaller than 1
     */
    public void initialize() {
        if ( windowSize < 1 ) {
            throw new RuntimeException("Window Size must be a positive integer");
        }
    }

    /**
     * Decides whether this locus counts as a mismatch locus.
     *
     * @param tracker reference meta-data for the locus (unused here)
     * @param ref     reference base at the locus; compared case-insensitively
     * @param context reads and per-read offsets covering the locus
     * @return the locus paired with true when enough reads cover it and the
     *         share of base quality on non-reference bases exceeds the threshold
     */
    public Pair<GenomeLoc, Boolean> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
        char upperRef = Character.toUpperCase(ref);
        List<SAMRecord> reads = context.getReads();
        List<Integer> offsets = context.getOffsets();

        int goodReads = 0;
        int mismatchQualities = 0;
        int totalQualities = 0;
        for (int i = 0; i < reads.size(); i++) {
            SAMRecord read = reads.get(i);
            if ( read.getMappingQuality() == 0 ) {
                continue;
            }
            goodReads++;

            int offset = offsets.get(i);
            // Read the Phred score directly instead of decoding it from the
            // ASCII-33 quality string, which allocates a string per read per locus.
            int quality = read.getBaseQualities()[offset];
            totalQualities += quality;

            char base = Character.toUpperCase((char)read.getReadBases()[offset]);
            if ( base != upperRef ) {
                mismatchQualities += quality;
            }
        }

        // The explicit totalQualities check avoids relying on a NaN comparison for 0/0.
        boolean flag = goodReads >= MIN_READS_AT_INTERVAL
                && totalQualities > 0
                && (double)mismatchQualities / (double)totalQualities > mismatchThreshold;
        return new Pair<GenomeLoc, Boolean>(context.getLocation(), flag);
    }

    /**
     * Prints the interval that is still open when traversal finishes.
     *
     * @param sum final reduce value; {@code second} is the open interval or null
     */
    public void onTraversalDone(Pair<LinkedList<Boolean>, GenomeLoc> sum) {
        if ( sum.second != null ) {
            out.println(sum.second);
        }
    }

    /** @return an empty flag window with no open interval */
    public Pair<LinkedList<Boolean>, GenomeLoc> reduceInit() {
        return new Pair<LinkedList<Boolean>, GenomeLoc>(new LinkedList<Boolean>(), null);
    }

    /**
     * Slides the flag window forward by one locus and updates the open interval.
     *
     * @param value locus and its mismatch flag, as produced by {@link #map}
     * @param sum   {@code first}: flags of the most recent loci; {@code second}:
     *              the interval currently being extended, or null
     * @return the same {@code sum} instance, updated in place
     */
    public Pair<LinkedList<Boolean>, GenomeLoc> reduce(Pair<GenomeLoc, Boolean> value, Pair<LinkedList<Boolean>, GenomeLoc> sum) {
        LinkedList<Boolean> window = sum.first;
        window.addLast(value.second);
        if ( window.size() > windowSize ) {
            window.removeFirst();
        }
        // Wait until the window is full; the very first full window is evaluated too
        // (the previous "<=" test skipped it).
        if ( window.size() < windowSize ) {
            return sum;
        }

        // Only a flagged newest locus can open or extend an interval.
        if ( !value.second ) {
            return sum;
        }

        // Single pass with an iterator; LinkedList.get(i) is linear per call.
        int mismatches = 0;
        int firstMismatch = -1;
        int index = 0;
        for (boolean flagged : window) {
            if ( flagged ) {
                mismatches++;
                if ( firstMismatch == -1 ) {
                    firstMismatch = index;
                }
            }
            index++;
        }

        if ( mismatches > 1 ) {
            if ( sum.second == null ) {
                // no interval to the left: this is the first one
                sum.second = startInterval(value.first, firstMismatch);
            } else if ( value.first.getStop() - sum.second.getStop() > windowSize ) {
                // too far from the open interval: print it and start a new one
                // NOTE(review): only stop coordinates are compared here - confirm
                // behaviour when the two loci are on different contigs.
                out.println(sum.second);
                sum.second = startInterval(value.first, firstMismatch);
            } else {
                // close enough: extend the open interval to this locus
                sum.second.setStop(value.first.getStop());
            }
        }
        return sum;
    }

    /**
     * Turns the current locus into an interval reaching back to the first
     * flagged entry of the window. The locus object itself is modified and
     * returned, as in the original code.
     */
    private GenomeLoc startInterval(GenomeLoc current, int firstMismatch) {
        // Window index 0 is (windowSize - 1) entries before the current locus.
        current.setStart(current.getStart() - windowSize + firstMismatch + 1);
        return current;
    }
}