-Moved my walkers to indels directory
-Removed entropy walker and replaced it with mismatch (column) walker -Some improvements to the cleaner (more to come) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@830 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
df8490a0cf
commit
919e995b7f
|
|
@ -1,98 +0,0 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@WalkerName("EntropyIntervals")
|
||||
public class EntropyIntervalWalker extends LocusWalker<Pair<GenomeLoc, Double>, Pair<LinkedList<Double>, GenomeLoc>> {
|
||||
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false)
|
||||
public int windowSize = 10;
|
||||
|
||||
public void initialize() {
|
||||
if ( windowSize < 1)
|
||||
throw new RuntimeException("Window Size must be a positive integer");
|
||||
}
|
||||
|
||||
public Pair<GenomeLoc, Double> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
// return the entropy of this locus
|
||||
int[] baseCounts = new int[4];
|
||||
for (int i=0; i < 4; i++)
|
||||
baseCounts[i] = 0;
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
int goodBases = 0;
|
||||
double errorRate = 0.0;
|
||||
for (int i = 0; i < reads.size(); i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
int base = BaseUtils.simpleBaseToBaseIndex((char)read.getReadBases()[offset]);
|
||||
if ( base != -1 ) {
|
||||
errorRate += Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
|
||||
baseCounts[base]++;
|
||||
goodBases++;
|
||||
}
|
||||
}
|
||||
|
||||
double expectedEntropy = (errorRate * Math.log(errorRate)) + ((1-errorRate) * Math.log(1-errorRate));
|
||||
double observedEntropy = 0.0;
|
||||
if ( goodBases > 0 ) {
|
||||
for (int i=0; i < 4; i++) {
|
||||
double Pjk = (double)baseCounts[i] / (double)goodBases;
|
||||
if ( Pjk > 0 )
|
||||
observedEntropy += Pjk * Math.log(Pjk);
|
||||
}
|
||||
if ( observedEntropy != 0 )
|
||||
observedEntropy *= -1;
|
||||
}
|
||||
|
||||
double locusEntropy = (observedEntropy > expectedEntropy ? (observedEntropy-expectedEntropy) : 0.0);
|
||||
return new Pair<GenomeLoc, Double>(context.getLocation(), locusEntropy);
|
||||
}
|
||||
|
||||
public void onTraversalDone() {}
|
||||
|
||||
public Pair<LinkedList<Double>, GenomeLoc> reduceInit() {
|
||||
return new Pair<LinkedList<Double>, GenomeLoc>(new LinkedList<Double>(), null);
|
||||
}
|
||||
|
||||
public Pair<LinkedList<Double>, GenomeLoc> reduce(Pair<GenomeLoc, Double> value, Pair<LinkedList<Double>, GenomeLoc> sum) {
|
||||
sum.first.addLast(value.second);
|
||||
if ( sum.first.size() <= windowSize )
|
||||
return sum;
|
||||
|
||||
sum.first.remove();
|
||||
double avgEntropy = 0.0;
|
||||
for (int i = 0; i < windowSize; i++)
|
||||
avgEntropy += sum.first.get(i);
|
||||
avgEntropy /= windowSize;
|
||||
|
||||
if ( avgEntropy > 0.001 ) {
|
||||
//out.println(avgEntropy);
|
||||
|
||||
// if there is no interval to the left, then this is the first one
|
||||
if ( sum.second == null ) {
|
||||
sum.second = value.first;
|
||||
}
|
||||
// if the intervals don't overlap, print out the leftmost one and start a new one
|
||||
else if ( !sum.second.contiguousP(value.first) ) {
|
||||
out.println(sum.second);
|
||||
sum.second = value.first;
|
||||
}
|
||||
// otherwise, merge them
|
||||
else {
|
||||
sum.second = sum.second.merge(value.first);
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@WalkerName("CoverageGapIntervals")
|
||||
public class CoverageGapIntervalWalker extends LocusWalker<Pair<GenomeLoc, Integer>, GenomeLoc> {
|
||||
|
||||
private final int minReadsAtInterval = 10;
|
||||
|
||||
public void initialize() {}
|
||||
|
||||
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
int goodReads = 0;
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
for (int i = 0; i < reads.size(); i++ ) {
|
||||
if ( reads.get(i).getMappingQuality() > 0 )
|
||||
goodReads++;
|
||||
}
|
||||
return goodReads >= minReadsAtInterval;
|
||||
}
|
||||
|
||||
public Pair<GenomeLoc, Integer> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
// find the probability that this locus has a statistically significant gap in coverage
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
int totalXi = 0;
|
||||
for (int i = 0; i < reads.size(); i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
if ( read.getMappingQuality() == 0 )
|
||||
continue;
|
||||
int halfLength = read.getReadString().length() >> 1;
|
||||
int distanceFromMiddle = Math.abs(offsets.get(i) - halfLength);
|
||||
int quarterLength = halfLength >> 1;
|
||||
|
||||
// Xi is < 0 if you are closer to the middle than the quartile
|
||||
// and is > 0 if further to the middle than quartile
|
||||
// We expect the total sum of Xi over an interval to be ~0
|
||||
int Xi = distanceFromMiddle - quarterLength;
|
||||
totalXi += Xi;
|
||||
}
|
||||
|
||||
return new Pair<GenomeLoc, Integer>(context.getLocation(), totalXi);
|
||||
}
|
||||
|
||||
public void onTraversalDone() {}
|
||||
|
||||
public GenomeLoc reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public GenomeLoc reduce(Pair<GenomeLoc, Integer> value, GenomeLoc sum) {
|
||||
if ( value.second > 1000 ) {
|
||||
if ( sum != null )
|
||||
sum.setStop(value.first.getStop());
|
||||
else
|
||||
sum = value.first;
|
||||
} else if ( sum != null ) {
|
||||
out.println(sum);
|
||||
sum = null;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,102 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@WalkerName("MismatchIntervals")
|
||||
public class MismatchIntervalWalker extends LocusWalker<Pair<GenomeLoc, Boolean>, Pair<LinkedList<Boolean>, GenomeLoc>> {
|
||||
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false)
|
||||
public int windowSize = 10;
|
||||
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of mismatching base qualities threshold", required=false)
|
||||
public double mismatchThreshold = 0.20;
|
||||
|
||||
private final int minReadsAtInterval = 4;
|
||||
|
||||
public void initialize() {
|
||||
if ( windowSize < 1)
|
||||
throw new RuntimeException("Window Size must be a positive integer");
|
||||
}
|
||||
|
||||
public Pair<GenomeLoc, Boolean> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
char upperRef = Character.toUpperCase(ref);
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
int goodReads = 0, mismatchQualities = 0, totalQualities = 0;
|
||||
for (int i = 0; i < reads.size(); i++) {
|
||||
SAMRecord read = reads.get(i);
|
||||
if ( read.getMappingQuality() == 0 )
|
||||
continue;
|
||||
|
||||
goodReads++;
|
||||
int offset = offsets.get(i);
|
||||
int quality = read.getBaseQualityString().charAt(offset) - 33;
|
||||
totalQualities += quality;
|
||||
|
||||
char base = Character.toUpperCase((char)read.getReadBases()[offset]);
|
||||
if ( base != upperRef )
|
||||
mismatchQualities += quality;
|
||||
}
|
||||
|
||||
boolean flag = false;
|
||||
if ( goodReads >= minReadsAtInterval && (double)mismatchQualities / (double)totalQualities > mismatchThreshold )
|
||||
flag = true;
|
||||
return new Pair<GenomeLoc, Boolean>(context.getLocation(), flag);
|
||||
}
|
||||
|
||||
public void onTraversalDone(Pair<LinkedList<Boolean>, GenomeLoc> sum) {
|
||||
if (sum.second != null)
|
||||
out.println(sum.second);
|
||||
}
|
||||
|
||||
public Pair<LinkedList<Boolean>, GenomeLoc> reduceInit() {
|
||||
return new Pair<LinkedList<Boolean>, GenomeLoc>(new LinkedList<Boolean>(), null);
|
||||
}
|
||||
|
||||
public Pair<LinkedList<Boolean>, GenomeLoc> reduce(Pair<GenomeLoc, Boolean> value, Pair<LinkedList<Boolean>, GenomeLoc> sum) {
|
||||
sum.first.addLast(value.second);
|
||||
if ( sum.first.size() <= windowSize )
|
||||
return sum;
|
||||
|
||||
sum.first.remove();
|
||||
if ( !value.second )
|
||||
return sum;
|
||||
|
||||
int mismatches = 0;
|
||||
int firstMismatch = -1;
|
||||
for (int i = 0; i < windowSize; i++) {
|
||||
if ( sum.first.get(i) ) {
|
||||
mismatches++;
|
||||
if ( firstMismatch == -1 )
|
||||
firstMismatch = i;
|
||||
}
|
||||
}
|
||||
|
||||
if ( mismatches > 1 ) {
|
||||
// if there is no interval to the left, then this is the first one
|
||||
if ( sum.second == null ) {
|
||||
sum.second = value.first;
|
||||
sum.second.setStart(sum.second.getStart() - windowSize + firstMismatch + 1);
|
||||
}
|
||||
// if the intervals don't overlap, print out the leftmost one and start a new one
|
||||
else if ( value.first.getStop() - sum.second.getStop() > windowSize ) {
|
||||
out.println(sum.second);
|
||||
sum.second = value.first;
|
||||
sum.second.setStart(sum.second.getStart() - windowSize + firstMismatch + 1);
|
||||
}
|
||||
// otherwise, merge them
|
||||
else {
|
||||
sum.second.setStop(value.first.getStop());
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue