-Moved my walkers to indels directory
-Removed entropy walker and replaced it with mismatch (column) walker -Some improvements to the cleaner (more to come) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@830 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
df8490a0cf
commit
919e995b7f
|
|
@ -1,98 +0,0 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
|
||||||
|
|
||||||
import net.sf.samtools.*;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.*;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
|
||||||
import org.broadinstitute.sting.utils.*;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
@WalkerName("EntropyIntervals")
|
|
||||||
public class EntropyIntervalWalker extends LocusWalker<Pair<GenomeLoc, Double>, Pair<LinkedList<Double>, GenomeLoc>> {
|
|
||||||
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false)
|
|
||||||
public int windowSize = 10;
|
|
||||||
|
|
||||||
public void initialize() {
|
|
||||||
if ( windowSize < 1)
|
|
||||||
throw new RuntimeException("Window Size must be a positive integer");
|
|
||||||
}
|
|
||||||
|
|
||||||
public Pair<GenomeLoc, Double> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
|
||||||
// return the entropy of this locus
|
|
||||||
int[] baseCounts = new int[4];
|
|
||||||
for (int i=0; i < 4; i++)
|
|
||||||
baseCounts[i] = 0;
|
|
||||||
|
|
||||||
List<SAMRecord> reads = context.getReads();
|
|
||||||
List<Integer> offsets = context.getOffsets();
|
|
||||||
int goodBases = 0;
|
|
||||||
double errorRate = 0.0;
|
|
||||||
for (int i = 0; i < reads.size(); i++ ) {
|
|
||||||
SAMRecord read = reads.get(i);
|
|
||||||
int offset = offsets.get(i);
|
|
||||||
int base = BaseUtils.simpleBaseToBaseIndex((char)read.getReadBases()[offset]);
|
|
||||||
if ( base != -1 ) {
|
|
||||||
errorRate += Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
|
|
||||||
baseCounts[base]++;
|
|
||||||
goodBases++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
double expectedEntropy = (errorRate * Math.log(errorRate)) + ((1-errorRate) * Math.log(1-errorRate));
|
|
||||||
double observedEntropy = 0.0;
|
|
||||||
if ( goodBases > 0 ) {
|
|
||||||
for (int i=0; i < 4; i++) {
|
|
||||||
double Pjk = (double)baseCounts[i] / (double)goodBases;
|
|
||||||
if ( Pjk > 0 )
|
|
||||||
observedEntropy += Pjk * Math.log(Pjk);
|
|
||||||
}
|
|
||||||
if ( observedEntropy != 0 )
|
|
||||||
observedEntropy *= -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
double locusEntropy = (observedEntropy > expectedEntropy ? (observedEntropy-expectedEntropy) : 0.0);
|
|
||||||
return new Pair<GenomeLoc, Double>(context.getLocation(), locusEntropy);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void onTraversalDone() {}
|
|
||||||
|
|
||||||
public Pair<LinkedList<Double>, GenomeLoc> reduceInit() {
|
|
||||||
return new Pair<LinkedList<Double>, GenomeLoc>(new LinkedList<Double>(), null);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Pair<LinkedList<Double>, GenomeLoc> reduce(Pair<GenomeLoc, Double> value, Pair<LinkedList<Double>, GenomeLoc> sum) {
|
|
||||||
sum.first.addLast(value.second);
|
|
||||||
if ( sum.first.size() <= windowSize )
|
|
||||||
return sum;
|
|
||||||
|
|
||||||
sum.first.remove();
|
|
||||||
double avgEntropy = 0.0;
|
|
||||||
for (int i = 0; i < windowSize; i++)
|
|
||||||
avgEntropy += sum.first.get(i);
|
|
||||||
avgEntropy /= windowSize;
|
|
||||||
|
|
||||||
if ( avgEntropy > 0.001 ) {
|
|
||||||
//out.println(avgEntropy);
|
|
||||||
|
|
||||||
// if there is no interval to the left, then this is the first one
|
|
||||||
if ( sum.second == null ) {
|
|
||||||
sum.second = value.first;
|
|
||||||
}
|
|
||||||
// if the intervals don't overlap, print out the leftmost one and start a new one
|
|
||||||
else if ( !sum.second.contiguousP(value.first) ) {
|
|
||||||
out.println(sum.second);
|
|
||||||
sum.second = value.first;
|
|
||||||
}
|
|
||||||
// otherwise, merge them
|
|
||||||
else {
|
|
||||||
sum.second = sum.second.merge(value.first);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -0,0 +1,71 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||||
|
|
||||||
|
import net.sf.samtools.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
@WalkerName("CoverageGapIntervals")
|
||||||
|
public class CoverageGapIntervalWalker extends LocusWalker<Pair<GenomeLoc, Integer>, GenomeLoc> {
|
||||||
|
|
||||||
|
private final int minReadsAtInterval = 10;
|
||||||
|
|
||||||
|
public void initialize() {}
|
||||||
|
|
||||||
|
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
int goodReads = 0;
|
||||||
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
for (int i = 0; i < reads.size(); i++ ) {
|
||||||
|
if ( reads.get(i).getMappingQuality() > 0 )
|
||||||
|
goodReads++;
|
||||||
|
}
|
||||||
|
return goodReads >= minReadsAtInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Pair<GenomeLoc, Integer> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
// find the probability that this locus has a statistically significant gap in coverage
|
||||||
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
List<Integer> offsets = context.getOffsets();
|
||||||
|
int totalXi = 0;
|
||||||
|
for (int i = 0; i < reads.size(); i++ ) {
|
||||||
|
SAMRecord read = reads.get(i);
|
||||||
|
if ( read.getMappingQuality() == 0 )
|
||||||
|
continue;
|
||||||
|
int halfLength = read.getReadString().length() >> 1;
|
||||||
|
int distanceFromMiddle = Math.abs(offsets.get(i) - halfLength);
|
||||||
|
int quarterLength = halfLength >> 1;
|
||||||
|
|
||||||
|
// Xi is < 0 if you are closer to the middle than the quartile
|
||||||
|
// and is > 0 if further to the middle than quartile
|
||||||
|
// We expect the total sum of Xi over an interval to be ~0
|
||||||
|
int Xi = distanceFromMiddle - quarterLength;
|
||||||
|
totalXi += Xi;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Pair<GenomeLoc, Integer>(context.getLocation(), totalXi);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone() {}
|
||||||
|
|
||||||
|
public GenomeLoc reduceInit() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public GenomeLoc reduce(Pair<GenomeLoc, Integer> value, GenomeLoc sum) {
|
||||||
|
if ( value.second > 1000 ) {
|
||||||
|
if ( sum != null )
|
||||||
|
sum.setStop(value.first.getStop());
|
||||||
|
else
|
||||||
|
sum = value.first;
|
||||||
|
} else if ( sum != null ) {
|
||||||
|
out.println(sum);
|
||||||
|
sum = null;
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,102 @@
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||||
|
|
||||||
|
import net.sf.samtools.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
@WalkerName("MismatchIntervals")
|
||||||
|
public class MismatchIntervalWalker extends LocusWalker<Pair<GenomeLoc, Boolean>, Pair<LinkedList<Boolean>, GenomeLoc>> {
|
||||||
|
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false)
|
||||||
|
public int windowSize = 10;
|
||||||
|
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of mismatching base qualities threshold", required=false)
|
||||||
|
public double mismatchThreshold = 0.20;
|
||||||
|
|
||||||
|
private final int minReadsAtInterval = 4;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
if ( windowSize < 1)
|
||||||
|
throw new RuntimeException("Window Size must be a positive integer");
|
||||||
|
}
|
||||||
|
|
||||||
|
public Pair<GenomeLoc, Boolean> map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
char upperRef = Character.toUpperCase(ref);
|
||||||
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
List<Integer> offsets = context.getOffsets();
|
||||||
|
int goodReads = 0, mismatchQualities = 0, totalQualities = 0;
|
||||||
|
for (int i = 0; i < reads.size(); i++) {
|
||||||
|
SAMRecord read = reads.get(i);
|
||||||
|
if ( read.getMappingQuality() == 0 )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
goodReads++;
|
||||||
|
int offset = offsets.get(i);
|
||||||
|
int quality = read.getBaseQualityString().charAt(offset) - 33;
|
||||||
|
totalQualities += quality;
|
||||||
|
|
||||||
|
char base = Character.toUpperCase((char)read.getReadBases()[offset]);
|
||||||
|
if ( base != upperRef )
|
||||||
|
mismatchQualities += quality;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean flag = false;
|
||||||
|
if ( goodReads >= minReadsAtInterval && (double)mismatchQualities / (double)totalQualities > mismatchThreshold )
|
||||||
|
flag = true;
|
||||||
|
return new Pair<GenomeLoc, Boolean>(context.getLocation(), flag);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(Pair<LinkedList<Boolean>, GenomeLoc> sum) {
|
||||||
|
if (sum.second != null)
|
||||||
|
out.println(sum.second);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Pair<LinkedList<Boolean>, GenomeLoc> reduceInit() {
|
||||||
|
return new Pair<LinkedList<Boolean>, GenomeLoc>(new LinkedList<Boolean>(), null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Pair<LinkedList<Boolean>, GenomeLoc> reduce(Pair<GenomeLoc, Boolean> value, Pair<LinkedList<Boolean>, GenomeLoc> sum) {
|
||||||
|
sum.first.addLast(value.second);
|
||||||
|
if ( sum.first.size() <= windowSize )
|
||||||
|
return sum;
|
||||||
|
|
||||||
|
sum.first.remove();
|
||||||
|
if ( !value.second )
|
||||||
|
return sum;
|
||||||
|
|
||||||
|
int mismatches = 0;
|
||||||
|
int firstMismatch = -1;
|
||||||
|
for (int i = 0; i < windowSize; i++) {
|
||||||
|
if ( sum.first.get(i) ) {
|
||||||
|
mismatches++;
|
||||||
|
if ( firstMismatch == -1 )
|
||||||
|
firstMismatch = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( mismatches > 1 ) {
|
||||||
|
// if there is no interval to the left, then this is the first one
|
||||||
|
if ( sum.second == null ) {
|
||||||
|
sum.second = value.first;
|
||||||
|
sum.second.setStart(sum.second.getStart() - windowSize + firstMismatch + 1);
|
||||||
|
}
|
||||||
|
// if the intervals don't overlap, print out the leftmost one and start a new one
|
||||||
|
else if ( value.first.getStop() - sum.second.getStop() > windowSize ) {
|
||||||
|
out.println(sum.second);
|
||||||
|
sum.second = value.first;
|
||||||
|
sum.second.setStart(sum.second.getStart() - windowSize + firstMismatch + 1);
|
||||||
|
}
|
||||||
|
// otherwise, merge them
|
||||||
|
else {
|
||||||
|
sum.second.setStop(value.first.getStop());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue