Calculate interval-based statistics for Hybrid Selection
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@558 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6ecc43f385
commit
f557da0a78
|
|
@ -0,0 +1,115 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
public class HybSelPerformanceWalker extends LocusWalker<Integer, HybSelPerformanceWalker.TargetInfo> {
|
||||
public static class TargetInfo {
|
||||
public int counts = 0;
|
||||
|
||||
// did at least two reads hit this target
|
||||
public boolean hitTwice = false;
|
||||
|
||||
// TODO: track max and min?
|
||||
// TODO: median rather than average?
|
||||
// TODO: bin into segments? (requires knowing position)
|
||||
}
|
||||
|
||||
// @Argument(fullName="suppressLocusPrinting",required=false,defaultValue="false")
|
||||
// public boolean suppressPrinting;
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
|
||||
int depth = 0;
|
||||
for ( int i = 0; i < reads.size(); i++ )
|
||||
{
|
||||
SAMRecord read = reads.get(i);
|
||||
|
||||
// TODO: is there a better way to do this?
|
||||
if (read.getNotPrimaryAlignmentFlag() ||
|
||||
read.getDuplicateReadFlag() ||
|
||||
read.getReadUnmappedFlag() ||
|
||||
read.getMappingQuality() <= -1
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
depth++;
|
||||
}
|
||||
|
||||
return depth;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if your walker wants to reduce each interval separately. Default is false.
|
||||
*
|
||||
* If you set this flag, several things will happen.
|
||||
*
|
||||
* The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce
|
||||
* Reduce will accumulate normally at each map unit in the interval
|
||||
* However, onTraversalDone(reduce) will be called after each interval is processed.
|
||||
* The system will call onTraversalDone( GenomeLoc -> reduce ), after all reductions are done,
|
||||
* which is overloaded here to call onTraversalDone(reduce) for each location
|
||||
*/
|
||||
public boolean isReduceByInterval() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public TargetInfo reduceInit() { return new TargetInfo(); }
|
||||
|
||||
public TargetInfo reduce(Integer value, TargetInfo sum) {
|
||||
sum.counts += value;
|
||||
if (value >= 2) { sum.hitTwice = true; }
|
||||
return sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(TargetInfo result) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(List<Pair<GenomeLoc, TargetInfo>> results) {
|
||||
out.println("location\tlength\tavg_coverage\tnormalized_coverage\thit_twice");
|
||||
|
||||
// first zip through and calculate the total average coverage
|
||||
long totalCoverage = 0;
|
||||
long basesConsidered = 0;
|
||||
for(Pair<GenomeLoc, TargetInfo> pair : results) {
|
||||
GenomeLoc target = pair.getFirst();
|
||||
TargetInfo ti = pair.getSecond();
|
||||
|
||||
// as long as it was hit twice, count it
|
||||
if(ti.hitTwice) {
|
||||
long length = target.getStop() - target.getStart() + 1;
|
||||
totalCoverage += ti.counts;
|
||||
basesConsidered += length;
|
||||
}
|
||||
}
|
||||
double meanTargetCoverage = totalCoverage / basesConsidered;
|
||||
|
||||
|
||||
for(Pair<GenomeLoc, TargetInfo> pair : results) {
|
||||
GenomeLoc target = pair.getFirst();
|
||||
TargetInfo ti = pair.getSecond();
|
||||
long length = target.getStop() - target.getStart() + 1;
|
||||
|
||||
double avgCoverage = ((double)ti.counts / (double)length);
|
||||
double normCoverage = avgCoverage / meanTargetCoverage;
|
||||
|
||||
out.printf("%s:%d-%d\t%d\t%6.4f\t%6.4f\t%d\n",
|
||||
target.getContig(), target.getStart(), target.getStop(), length,
|
||||
avgCoverage, normCoverage, ((ti.hitTwice)?1:0)
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue