Calculate interval-based statistics for Hybrid Selection

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@558 348d0f76-0448-11de-a6fe-93d51630548a
kcibul 2009-04-29 04:01:24 +00:00
parent 6ecc43f385
commit f557da0a78
1 changed file with 115 additions and 0 deletions

@@ -0,0 +1,115 @@
package org.broadinstitute.sting.playground.gatk.walkers;

import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.GenomeLoc;

import java.util.List;

import net.sf.samtools.SAMRecord;

/**
 * Calculates interval-based statistics for Hybrid Selection: per-target
 * average coverage, coverage normalized to the mean across all targets,
 * and whether the target was hit by at least two reads.
 */
public class HybSelPerformanceWalker extends LocusWalker<Integer, HybSelPerformanceWalker.TargetInfo> {
    public static class TargetInfo {
        // sum of filtered read depth over every locus in the target
        public int counts = 0;

        // did at least two reads hit this target
        public boolean hitTwice = false;

        // TODO: track max and min?
        // TODO: median rather than average?
        // TODO: bin into segments? (requires knowing position)
    }
    // @Argument(fullName="suppressLocusPrinting",required=false,defaultValue="false")
    // public boolean suppressPrinting;
    public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
        List<SAMRecord> reads = context.getReads();
        int depth = 0;
        for (SAMRecord read : reads) {
            // TODO: is there a better way to do this?
            // skip secondary, duplicate, and unmapped reads; note that the
            // mapping-quality test below (MAPQ <= -1) never fires, since
            // MAPQ is non-negative: it is a placeholder threshold
            if (read.getNotPrimaryAlignmentFlag() ||
                    read.getDuplicateReadFlag() ||
                    read.getReadUnmappedFlag() ||
                    read.getMappingQuality() <= -1) {
                continue;
            }
            depth++;
        }
        return depth;
    }
    /**
     * Return true if your walker wants to reduce each interval separately. Default is false.
     *
     * If you set this flag, several things will happen:
     *
     * The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce.
     * Reduce will accumulate normally at each map unit in the interval.
     * However, onTraversalDone(reduce) will be called after each interval is processed.
     * Finally, the system will call onTraversalDone( GenomeLoc -> reduce ) after all reductions are done,
     * which is overloaded here to call onTraversalDone(reduce) for each location.
     */
    public boolean isReduceByInterval() {
        return true;
    }

    public TargetInfo reduceInit() {
        return new TargetInfo();
    }

    public TargetInfo reduce(Integer value, TargetInfo sum) {
        sum.counts += value;
        // depth >= 2 at any single locus marks the whole target as hit twice
        if (value >= 2) { sum.hitTwice = true; }
        return sum;
    }

    public void onTraversalDone(TargetInfo result) {
        // intentionally empty; per-target results are reported in the List overload below
    }

    @Override
    public void onTraversalDone(List<Pair<GenomeLoc, TargetInfo>> results) {
        out.println("location\tlength\tavg_coverage\tnormalized_coverage\thit_twice");

        // first zip through and calculate the total average coverage
        long totalCoverage = 0;
        long basesConsidered = 0;
        for (Pair<GenomeLoc, TargetInfo> pair : results) {
            GenomeLoc target = pair.getFirst();
            TargetInfo ti = pair.getSecond();

            // as long as it was hit twice, count it
            if (ti.hitTwice) {
                long length = target.getStop() - target.getStart() + 1;
                totalCoverage += ti.counts;
                basesConsidered += length;
            }
        }

        // cast before dividing; plain long division here would silently
        // truncate the mean coverage to a whole number
        double meanTargetCoverage = (double) totalCoverage / (double) basesConsidered;

        for (Pair<GenomeLoc, TargetInfo> pair : results) {
            GenomeLoc target = pair.getFirst();
            TargetInfo ti = pair.getSecond();

            long length = target.getStop() - target.getStart() + 1;
            double avgCoverage = (double) ti.counts / (double) length;
            double normCoverage = avgCoverage / meanTargetCoverage;
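            // worked example (hypothetical numbers): a 100 bp target whose
            // per-locus depths sum to 5000 has avgCoverage 50.0; if
            // meanTargetCoverage is 25.0, normCoverage comes out to 2.0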
            out.printf("%s:%d-%d\t%d\t%6.4f\t%6.4f\t%d\n",
                    target.getContig(), target.getStart(), target.getStop(), length,
                    avgCoverage, normCoverage, (ti.hitTwice ? 1 : 0));
        }
    }
}
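
For context, a hypothetical invocation of this walker (a sketch only; the walker name "HybSelPerformance", jar name, and input file names are assumptions, not taken from this commit) would pass the hybrid-selection targets as intervals via -L so that each target is reduced separately:

java -jar GenomeAnalysisTK.jar -T HybSelPerformance -R reference.fasta -I sample.bam -L targets.interval_list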