Oneoff project, totally unrelated to anything
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2776 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
334da80e8b
commit
f7e7bcd2ef
|
|
@ -0,0 +1,167 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Dec 3, 2009
|
||||||
|
* Time: 11:54:35 AM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
|
|
||||||
|
public class DSBWalker extends LocusWalker<Integer,Integer> {
|
||||||
|
@Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true)
|
||||||
|
int COV_CUTOFF = 0;
|
||||||
|
@Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false)
|
||||||
|
int MINLENGTH_CUTOFF = 0;
|
||||||
|
|
||||||
|
private int MERGE_DIST = 300; // merge intervals that are closer than this distance from one another
|
||||||
|
|
||||||
|
private long maxcov = 0;
|
||||||
|
private long maxz = 0;
|
||||||
|
private long mergedmaxcov = 0;
|
||||||
|
private long mergedmaxz = 0;
|
||||||
|
GenomeLoc mergedInterval = null;
|
||||||
|
GenomeLoc currentInterval = null;
|
||||||
|
|
||||||
|
private long nIntervals = 0;
|
||||||
|
|
||||||
|
private void emit(GenomeLoc l) {
|
||||||
|
if ( mergedInterval == null ) {
|
||||||
|
mergedInterval = l.clone();
|
||||||
|
mergedmaxcov = maxcov;
|
||||||
|
mergedmaxz = maxz;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( mergedInterval.getContigIndex() != l.getContigIndex() ) {
|
||||||
|
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
|
||||||
|
if ( length >= MINLENGTH_CUTOFF ) {
|
||||||
|
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
|
||||||
|
nIntervals++;
|
||||||
|
}
|
||||||
|
mergedInterval = l.clone();
|
||||||
|
mergedmaxcov = maxcov;
|
||||||
|
mergedmaxz = maxz;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// merged interval exists and new interval is on the same contig. Check if the new interval
|
||||||
|
// is close enough so we got to merge and keep waiting:
|
||||||
|
|
||||||
|
if ( l.getStart() - mergedInterval.getStop() < MERGE_DIST ) {
|
||||||
|
mergedInterval = GenomeLocParser.setStop(mergedInterval,l.getStop());
|
||||||
|
if ( maxcov > mergedmaxcov) mergedmaxcov = maxcov;
|
||||||
|
if ( maxz > mergedmaxz ) mergedmaxz = maxz;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// nope, new interval is far enough. Print old one and keep current one.
|
||||||
|
|
||||||
|
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
|
||||||
|
if ( length >= MINLENGTH_CUTOFF ) {
|
||||||
|
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
|
||||||
|
nIntervals++;
|
||||||
|
}
|
||||||
|
mergedInterval = l.clone();
|
||||||
|
mergedmaxcov = maxcov;
|
||||||
|
mergedmaxz = maxz;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone() {
|
||||||
|
if ( mergedInterval != null ) {
|
||||||
|
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
|
||||||
|
if ( length >= MINLENGTH_CUTOFF ) {
|
||||||
|
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
|
||||||
|
nIntervals++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
System.out.println(nIntervals+" intervals detected.");
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
|
||||||
|
ReadBackedPileup pileup = context.getPileup();
|
||||||
|
List<SAMRecord> reads = pileup.getReads();
|
||||||
|
|
||||||
|
int nZero = pileup.getNumberOfMappingQualityZeroReads();
|
||||||
|
|
||||||
|
int nonZCoverage = reads.size() - nZero;
|
||||||
|
|
||||||
|
if ( nonZCoverage >= COV_CUTOFF ) {
|
||||||
|
|
||||||
|
// if we were not inside an interval, start one:
|
||||||
|
if ( currentInterval == null ) {
|
||||||
|
maxcov = nonZCoverage;
|
||||||
|
maxz = nZero;
|
||||||
|
currentInterval = context.getLocation().clone();
|
||||||
|
// System.out.println("Setting current to "+currentInterval);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we were inside an interval and we just jumped onto a new contig, get rid of the old interval
|
||||||
|
if ( currentInterval.compareContigs(context.getLocation()) != 0 ) {
|
||||||
|
// we just moved to a new contig
|
||||||
|
System.out.println("On contig "+context.getLocation().getContig());
|
||||||
|
emit(currentInterval);
|
||||||
|
maxcov = nonZCoverage;
|
||||||
|
maxz = nZero;
|
||||||
|
currentInterval = context.getLocation().clone();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we are on the same contig, we are within the interval, so we need to extend the current interval:
|
||||||
|
currentInterval = GenomeLocParser.setStop(currentInterval,context.getLocation().getStop()); // still within the interval, adjust stop
|
||||||
|
//System.out.println("Extending current to "+currentInterval +" ("+context.getLocation()+", "+context.getLocation().getStop()+")");
|
||||||
|
if ( nonZCoverage > maxcov ) maxcov = nonZCoverage; // adjust maxcov
|
||||||
|
if ( nZero > maxz ) maxz = nZero; // adjust maxz
|
||||||
|
} else {
|
||||||
|
// low coverage, if we were inside an interval, it stops now:
|
||||||
|
if ( currentInterval != null ) {
|
||||||
|
// System.out.println("Emitting current as "+currentInterval);
|
||||||
|
emit(currentInterval);
|
||||||
|
currentInterval = null;
|
||||||
|
maxcov = 0;
|
||||||
|
maxz = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provide an initial value for reduce computations.
|
||||||
|
*
|
||||||
|
* @return Initial value of reduce.
|
||||||
|
*/
|
||||||
|
public Integer reduceInit() {
|
||||||
|
return 0; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||||
|
*
|
||||||
|
* @param value result of the map.
|
||||||
|
* @param sum accumulator for the reduce.
|
||||||
|
* @return accumulator with result of the map taken into account.
|
||||||
|
*/
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return sum+value; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,360 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
|
import org.broadinstitute.sting.utils.CircularArray;
|
||||||
|
import org.broadinstitute.sting.utils.PrimitivePair;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Dec 12, 2009
|
||||||
|
* Time: 2:25:44 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
|
|
||||||
|
public class DSBWalkerV2 extends LocusWalker<Integer,Integer> {
|
||||||
|
// @Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true)
|
||||||
|
// int COV_CUTOFF = 0;
|
||||||
|
// @Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false)
|
||||||
|
// int MINLENGTH_CUTOFF = 0;
|
||||||
|
@Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true)
|
||||||
|
int WINDOW_SIZE = 100;
|
||||||
|
@Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true)
|
||||||
|
double ENRICHMENT_CUTOFF = 5.0;
|
||||||
|
@Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+
|
||||||
|
"(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+
|
||||||
|
"so that control*enrichmentCutoff is too low to be convincing)",required=true)
|
||||||
|
int MIN_SIGNAL = 10;
|
||||||
|
|
||||||
|
private CircularArray<PrimitivePair.Int> signalWindow = null;
|
||||||
|
private CircularArray<PrimitivePair.Int> controlWindow = null;
|
||||||
|
private CircularArray<PrimitivePair.Int> signalStrandsWindow = null;
|
||||||
|
private CircularArray<PrimitivePair.Int> controlStrandsWindow = null;
|
||||||
|
|
||||||
|
private PrimitivePair.Long totalSignalCoverage = new PrimitivePair.Long();
|
||||||
|
private PrimitivePair.Long totalControlCoverage = new PrimitivePair.Long();
|
||||||
|
private PrimitivePair.Long totalSignalFwdStrands = new PrimitivePair.Long();
|
||||||
|
private PrimitivePair.Long totalControlFwdStrands = new PrimitivePair.Long();
|
||||||
|
|
||||||
|
private Set<String> signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able
|
||||||
|
private Set<String> controlReadGroups ; // to properly assign the reads coming from a merged stream
|
||||||
|
|
||||||
|
private long windowStart = -1;
|
||||||
|
private long windowStop = -1;
|
||||||
|
private int curContig = -1;
|
||||||
|
private String curContigName = "";
|
||||||
|
|
||||||
|
// the following variables are for buffering and merging windows :
|
||||||
|
private long regionStart = -1;
|
||||||
|
private long lastWindowStart = -1;
|
||||||
|
private PrimitivePair.Int maxSignalReads = new PrimitivePair.Int();
|
||||||
|
private PrimitivePair.Int minSignalReads = new PrimitivePair.Int();
|
||||||
|
private PrimitivePair.Int maxControlReads = new PrimitivePair.Int();
|
||||||
|
private PrimitivePair.Int minControlReads = new PrimitivePair.Int();
|
||||||
|
private double minEnrichmentUnique;
|
||||||
|
private double maxEnrichmentUnique;
|
||||||
|
private double minEnrichmentNonUnique;
|
||||||
|
private double maxEnrichmentNonUnique;
|
||||||
|
private double minEnrichmentTotal;
|
||||||
|
private double maxEnrichmentTotal;
|
||||||
|
private double minUniqueSignalStrandBalance = 0.0;
|
||||||
|
private double maxUniqueSignalStrandBalance = 0.0;
|
||||||
|
private double minNonUniqueSignalStrandBalance = 0.0;
|
||||||
|
private double maxNonUniqueSignalStrandBalance = 0.0;
|
||||||
|
private double minUniqueControlStrandBalance = 0.0;
|
||||||
|
private double maxUniqueControlStrandBalance = 0.0;
|
||||||
|
private double minNonUniqueControlStrandBalance = 0.0;
|
||||||
|
private double maxNonUniqueControlStrandBalance = 0.0;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void initialize() {
|
||||||
|
int nSams = getToolkit().getArguments().samFiles.size();
|
||||||
|
|
||||||
|
if ( nSams != 2 ) {
|
||||||
|
out.println("ERROR: two input bam files (signal and backround control) must be specified");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
|
||||||
|
signalReadGroups = readGroupSets.get(0);
|
||||||
|
// System.out.println(signalReadGroups.size()+" read groups in signal");
|
||||||
|
controlReadGroups = readGroupSets.get(1);
|
||||||
|
// System.out.println(controlReadGroups.size()+" read groups in control");
|
||||||
|
signalWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
|
||||||
|
controlWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
|
||||||
|
signalStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
|
||||||
|
controlStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
|
||||||
|
ReadBackedPileup pileup = context.getPileup();
|
||||||
|
List<SAMRecord> reads = pileup.getReads();
|
||||||
|
|
||||||
|
// compute coverages at the current site:
|
||||||
|
PrimitivePair.Int signalCov = new PrimitivePair.Int();
|
||||||
|
PrimitivePair.Int controlCov = new PrimitivePair.Int();
|
||||||
|
PrimitivePair.Int signalFwdStrands = new PrimitivePair.Int();
|
||||||
|
PrimitivePair.Int controlFwdStrands = new PrimitivePair.Int();
|
||||||
|
|
||||||
|
for ( SAMRecord r : reads ) {
|
||||||
|
if ( signalReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
|
||||||
|
if ( r.getMappingQuality() == 0 ) {
|
||||||
|
signalCov.second++;
|
||||||
|
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.second++;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
signalCov.first++;
|
||||||
|
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.first++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if ( controlReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
|
||||||
|
if ( r.getMappingQuality() == 0 ) {
|
||||||
|
controlCov.second++;
|
||||||
|
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.second++;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
controlCov.first++;
|
||||||
|
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.first++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw new StingException("Read "+r+" belongs to unknown read group ("+r.getReadGroup()+")");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GenomeLoc loc = context.getLocation();
|
||||||
|
|
||||||
|
// if ( curContig != 0 ) System.out.println(loc+" "+signalCov.first+" "+signalCov.second+" "+controlCov.first+" "+controlCov.second);
|
||||||
|
|
||||||
|
if ( loc.getContigIndex() != curContig || loc.getStart() >= windowStop+WINDOW_SIZE ) {
|
||||||
|
// we jumped to the next contig, or we are on the same contig but the current position is
|
||||||
|
// more than WINDOW_SIZE away from the current window's end (i.e. there's nothing to shift)
|
||||||
|
checkCurrentWindow(true);
|
||||||
|
|
||||||
|
if ( loc.getContigIndex() != curContig ) {
|
||||||
|
System.out.println("on contig "+loc.getContig());
|
||||||
|
}
|
||||||
|
curContig = loc.getContigIndex();
|
||||||
|
curContigName = loc.getContig();
|
||||||
|
// prevPos = loc.getStart();
|
||||||
|
windowStart = loc.getStart();
|
||||||
|
windowStop = windowStart + WINDOW_SIZE - 1;
|
||||||
|
signalWindow.clear();
|
||||||
|
controlWindow.clear();
|
||||||
|
totalSignalCoverage.assignFrom( signalCov );
|
||||||
|
totalControlCoverage.assignFrom( controlCov );
|
||||||
|
totalSignalFwdStrands.assignFrom( signalFwdStrands );
|
||||||
|
totalControlFwdStrands.assignFrom( controlFwdStrands );
|
||||||
|
signalWindow.set(0,signalCov);
|
||||||
|
controlWindow.set(0,controlCov);
|
||||||
|
signalStrandsWindow.set(0,signalFwdStrands);
|
||||||
|
controlStrandsWindow.set(0,controlFwdStrands);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// offset of the current position w.r.t. the start of the window:
|
||||||
|
int offset = (int)(loc.getStart() - windowStart);
|
||||||
|
|
||||||
|
if ( offset >= WINDOW_SIZE ) {
|
||||||
|
// if we are here, the current position is outside of the current window, but not
|
||||||
|
// far enough so that we'd need to reinitialize the window from scratch (that was already checked above).
|
||||||
|
// Now we need to shift.
|
||||||
|
|
||||||
|
// We are receiving covered positions in order, so we are guaranteed that everything prior to
|
||||||
|
// the current position was already counted; if some elements of the windows are still nulls, it means
|
||||||
|
// there was no coverage there
|
||||||
|
|
||||||
|
int shift = offset - WINDOW_SIZE + 1;
|
||||||
|
|
||||||
|
// scroll the window(s) base by base until the current position is inside the window. At each step
|
||||||
|
// we will check if the window meets the requirements and should be printed out.
|
||||||
|
for ( int i = 0 ; i < shift ; i++ ) {
|
||||||
|
|
||||||
|
// we are going to shift; check if the window as it is now is worth printing
|
||||||
|
checkCurrentWindow(false);
|
||||||
|
|
||||||
|
// discard coverage from the first element of the window (this element is about to be shifted out of scope)
|
||||||
|
if ( signalWindow.get(0) != null ) totalSignalCoverage.subtract(signalWindow.get(0));
|
||||||
|
if ( signalStrandsWindow.get(0) != null ) totalSignalFwdStrands.subtract(signalStrandsWindow.get(0));
|
||||||
|
|
||||||
|
if ( controlWindow.get(0) != null ) totalControlCoverage.subtract(controlWindow.get(0));
|
||||||
|
if ( controlStrandsWindow.get(0) != null ) totalControlFwdStrands.subtract(controlStrandsWindow.get(0));
|
||||||
|
|
||||||
|
// advnace window coordinates on the ref
|
||||||
|
windowStart++;
|
||||||
|
windowStop++;
|
||||||
|
|
||||||
|
// shift the data in the window(s):
|
||||||
|
signalWindow.shiftData(1);
|
||||||
|
controlWindow.shiftData(1);
|
||||||
|
signalStrandsWindow.shiftData(1);
|
||||||
|
controlStrandsWindow.shiftData(1);
|
||||||
|
|
||||||
|
offset--; // this is the new offset w.r.t. to the shifted window
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// at this point, either the current position was inside the current window, or it was outside,
|
||||||
|
// but the window was already shifted
|
||||||
|
totalSignalCoverage.add(signalCov);
|
||||||
|
totalControlCoverage.add(controlCov);
|
||||||
|
totalSignalFwdStrands.add(signalFwdStrands);
|
||||||
|
totalControlFwdStrands.add(controlFwdStrands);
|
||||||
|
signalWindow.set(offset,signalCov);
|
||||||
|
controlWindow.set(offset,controlCov);
|
||||||
|
signalStrandsWindow.set(offset,signalFwdStrands);
|
||||||
|
controlStrandsWindow.set(offset,controlFwdStrands);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provide an initial value for reduce computations.
|
||||||
|
*
|
||||||
|
* @return Initial value of reduce.
|
||||||
|
*/
|
||||||
|
public Integer reduceInit() {
|
||||||
|
return 0; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||||
|
*
|
||||||
|
* @param value result of the map.
|
||||||
|
* @param sum accumulator for the reduce.
|
||||||
|
* @return accumulator with result of the map taken into account.
|
||||||
|
*/
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return sum+value; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
printRegion();
|
||||||
|
super.onTraversalDone(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Checks if the currently held window satisfies the conditions set up for significance, and invokes buffered printout if so.
|
||||||
|
* If the parameter is set to true, printout of previously held region is forced, and the buffer is reinitialized with
|
||||||
|
* the new window if it passes the cutoffs, or left empty.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
private void checkCurrentWindow(boolean force) {
|
||||||
|
if ( force ) printRegion();
|
||||||
|
if ( signalWindow.get(0) == null && controlWindow.get(0) == null ) return; // do not emit windows that start from empty cell; we will get them later
|
||||||
|
if ( totalControlCoverage.first * ENRICHMENT_CUTOFF / 36.0 < MIN_SIGNAL ) { // control coverage zero or too low
|
||||||
|
if ( totalSignalCoverage.first /28.0 > MIN_SIGNAL ) emitWindow(false); // require at least MIN_SIGNAL coverage for signal
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we have decent coverage in control, just check for required enrichment in the signal
|
||||||
|
if ( ((double)totalSignalCoverage.first/28.0) / (totalControlCoverage.first/36.0) > ENRICHMENT_CUTOFF ) emitWindow(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** This is actually a delayed print command: it buffers the successive windows set for printout, merges the windows that
|
||||||
|
* are close enough and prints only when a train of close-by windows has ended and next window received is far enough
|
||||||
|
*/
|
||||||
|
private void emitWindow(boolean force) {
|
||||||
|
|
||||||
|
if ( regionStart == -1 ) {
|
||||||
|
resetBuffer();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( force || windowStart > lastWindowStart + WINDOW_SIZE ) {
|
||||||
|
// new window is far enough from the region we were buffering: emit old region
|
||||||
|
|
||||||
|
printRegion();
|
||||||
|
resetBuffer();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// current window is too close (overlapping) with a previous one: we need to merge
|
||||||
|
|
||||||
|
lastWindowStart = windowStart;
|
||||||
|
maxSignalReads.first = Math.max(maxSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
|
||||||
|
maxSignalReads.second = Math.max(maxSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
|
||||||
|
minSignalReads.first = Math.min(minSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
|
||||||
|
minSignalReads.second = Math.min(minSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
|
||||||
|
maxControlReads.first = Math.max(maxControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
|
||||||
|
maxControlReads.second = Math.max(maxControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
|
||||||
|
minControlReads.first = Math.min(minControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
|
||||||
|
minControlReads.second = Math.min(minControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
|
||||||
|
maxEnrichmentUnique = Math.max(maxEnrichmentUnique,((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
|
||||||
|
minEnrichmentUnique = Math.min(minEnrichmentUnique, ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
|
||||||
|
maxEnrichmentNonUnique = Math.max(maxEnrichmentNonUnique,((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0));
|
||||||
|
minEnrichmentNonUnique = Math.min( minEnrichmentNonUnique, ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0) );
|
||||||
|
maxEnrichmentTotal = Math.max( maxEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
|
||||||
|
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
|
||||||
|
minEnrichmentTotal = Math.min( minEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
|
||||||
|
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
|
||||||
|
|
||||||
|
|
||||||
|
maxUniqueSignalStrandBalance = Math.max(maxUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
|
||||||
|
minUniqueSignalStrandBalance = Math.min(minUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
|
||||||
|
maxNonUniqueSignalStrandBalance = Math.max(maxNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
|
||||||
|
minNonUniqueSignalStrandBalance = Math.min(minNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
|
||||||
|
maxUniqueControlStrandBalance = Math.max(maxUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
|
||||||
|
minUniqueControlStrandBalance = Math.min(minUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
|
||||||
|
maxNonUniqueControlStrandBalance = Math.max(maxNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
|
||||||
|
minNonUniqueControlStrandBalance = Math.min(minNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void resetBuffer() {
|
||||||
|
regionStart = windowStart;
|
||||||
|
lastWindowStart = windowStart;
|
||||||
|
maxSignalReads.first = (int)Math.round(totalSignalCoverage.first/28.0);
|
||||||
|
maxSignalReads.second = (int)Math.round(totalSignalCoverage.second/28.0);
|
||||||
|
minSignalReads.assignFrom(maxSignalReads);
|
||||||
|
maxControlReads.first = (int)Math.round(totalControlCoverage.first/36.0);
|
||||||
|
maxControlReads.second = (int)Math.round(totalControlCoverage.second/36.0);
|
||||||
|
minControlReads.assignFrom(maxControlReads);
|
||||||
|
minEnrichmentUnique = maxEnrichmentUnique = ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0);
|
||||||
|
minEnrichmentNonUnique = maxEnrichmentNonUnique = ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0);
|
||||||
|
minEnrichmentTotal = maxEnrichmentTotal = ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
|
||||||
|
((totalControlCoverage.first+ totalControlCoverage.second)/36.0);
|
||||||
|
|
||||||
|
minUniqueSignalStrandBalance = maxUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.first)/totalSignalCoverage.first;
|
||||||
|
minNonUniqueSignalStrandBalance = maxNonUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.second)/totalSignalCoverage.second;
|
||||||
|
minUniqueControlStrandBalance = maxUniqueControlStrandBalance = ((double)totalControlFwdStrands.first)/totalControlCoverage.first;
|
||||||
|
minNonUniqueControlStrandBalance = maxNonUniqueControlStrandBalance = ((double)totalControlFwdStrands.second)/totalControlCoverage.second;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void printRegion() {
|
||||||
|
if ( regionStart == -1 ) return;
|
||||||
|
out.print(curContigName+":"+regionStart+"-"+windowStop+"\t"+(windowStop-regionStart+1) +"\t"+
|
||||||
|
minSignalReads.first+"-"+maxSignalReads.first+"\t"+
|
||||||
|
minSignalReads.second+"-"+maxSignalReads.second+"\t"+
|
||||||
|
minControlReads.first+"-"+maxControlReads.first+"\t"+
|
||||||
|
minControlReads.second+"-"+maxControlReads.second+"\t");
|
||||||
|
out.printf("%.2f-%.2f\t",minEnrichmentUnique,maxEnrichmentUnique);
|
||||||
|
out.printf("%.2f-%.2f\t",minEnrichmentNonUnique,maxEnrichmentNonUnique);
|
||||||
|
out.printf("%.2f-%.2f\t",minEnrichmentTotal,maxEnrichmentTotal);
|
||||||
|
out.printf("%.2f-%.2f\t",minUniqueSignalStrandBalance,maxUniqueSignalStrandBalance);
|
||||||
|
out.printf("%.2f-%.2f\t",minNonUniqueSignalStrandBalance,maxNonUniqueSignalStrandBalance);
|
||||||
|
out.printf("%.2f-%.2f\t",minUniqueControlStrandBalance,maxUniqueControlStrandBalance);
|
||||||
|
out.printf("%.2f-%.2f",minNonUniqueControlStrandBalance,maxNonUniqueControlStrandBalance);
|
||||||
|
|
||||||
|
if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER");
|
||||||
|
out.println();
|
||||||
|
|
||||||
|
regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,441 @@
|
||||||
|
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: asivache
|
||||||
|
* Date: Jan 3, 2010
|
||||||
|
* Time: 1:58:38 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class DSBWalkerV3 extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
|
@Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true)
|
||||||
|
int WINDOW_SIZE = 100;
|
||||||
|
@Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true)
|
||||||
|
double ENRICHMENT_CUTOFF = 5.0;
|
||||||
|
@Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+
|
||||||
|
"(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+
|
||||||
|
"so that control*enrichmentCutoff is too low to be convincing)",required=true)
|
||||||
|
int MIN_SIGNAL = 10;
|
||||||
|
@Argument(fullName="coverageFactor",shortName="cf",doc="Total number of uniquely mapped signal reads/total number of uniquely mapped control reads",required=false)
|
||||||
|
double COVERAGE_FACTOR=1.0;
|
||||||
|
@Argument(fullName="coverageFactorNU",shortName="cfnu",doc="Total number of non-uniquely mapped signal reads/total number of non-uniquely mapped control reads",required=false)
|
||||||
|
double COVERAGE_FACTOR_NU=1.0;
|
||||||
|
|
||||||
|
private Set<String> signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able
|
||||||
|
private Set<String> controlReadGroups ; // to properly assign the reads coming from a merged stream
|
||||||
|
|
||||||
|
private GenomeLoc currentWindow = null;
|
||||||
|
private String currentContig = "chrM";
|
||||||
|
|
||||||
|
|
||||||
|
private LinkedList<SAMRecord> readsInSignalWindow = null;
|
||||||
|
private LinkedList<SAMRecord> readsInControlWindow = null;
|
||||||
|
|
||||||
|
private WindowStats signalCountsInCurrWindow = new WindowStats();
|
||||||
|
private WindowStats controlCountsInCurrWindow = new WindowStats();
|
||||||
|
|
||||||
|
|
||||||
|
// following variables are used by emitWindow to buffer adjacent windows
|
||||||
|
private int MERGE_CUTOFF = -1;
|
||||||
|
|
||||||
|
private long regionStart = -1;
|
||||||
|
private long lastWindowStart = -1;
|
||||||
|
private int addedSinceLastEmit = 0; // how many sliding window steps where buffered since the last emit (i.e. since the last window that really passed significance criteria)
|
||||||
|
// buffered read count stats for the windows inside the currently held merged print region:
|
||||||
|
private List<WindowStats> signalReadCountsBuffer = new ArrayList<WindowStats>(1000);
|
||||||
|
private List<WindowStats> controlReadCountsBuffer = new ArrayList<WindowStats>(1000);
|
||||||
|
|
||||||
|
|
||||||
|
/** Clears buffered reads and all counts. DOES NOT clear buffered print region */
|
||||||
|
private void resetWindows() {
|
||||||
|
readsInSignalWindow.clear();
|
||||||
|
readsInControlWindow.clear();
|
||||||
|
signalCountsInCurrWindow.clear();
|
||||||
|
controlCountsInCurrWindow.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addSignal(SAMRecord read) {
|
||||||
|
readsInSignalWindow.add(read);
|
||||||
|
signalCountsInCurrWindow.addRead(read);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addControl(SAMRecord read) {
|
||||||
|
readsInControlWindow.add(read);
|
||||||
|
controlCountsInCurrWindow.addRead(read);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Discard signal reads that start strictly before the specified position and
|
||||||
|
* update associated counts
|
||||||
|
* @param pos
|
||||||
|
*/
|
||||||
|
private void purgeSignal(long pos) {
|
||||||
|
Iterator<SAMRecord> it = readsInSignalWindow.iterator();
|
||||||
|
while ( it.hasNext() ) {
|
||||||
|
SAMRecord r = it.next();
|
||||||
|
if ( r.getAlignmentStart() >= pos ) return; // we are done
|
||||||
|
|
||||||
|
// read starts before pos: discard it and update the counts:
|
||||||
|
signalCountsInCurrWindow.removeRead(r);
|
||||||
|
it.remove();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Discard signal reads that start strictly before the specified position and
|
||||||
|
* update associated counts
|
||||||
|
* @param pos
|
||||||
|
*/
|
||||||
|
private void purgeControl(long pos) {
|
||||||
|
Iterator<SAMRecord> it = readsInControlWindow.iterator();
|
||||||
|
while ( it.hasNext() ) {
|
||||||
|
SAMRecord r = it.next();
|
||||||
|
if ( r.getAlignmentStart() >= pos ) return; // we are done
|
||||||
|
|
||||||
|
// read starts before pos: discard it and update the counts:
|
||||||
|
controlCountsInCurrWindow.removeRead(r);
|
||||||
|
it.remove();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void resetWindowMergingBuffer(long start) {
|
||||||
|
regionStart = start;
|
||||||
|
lastWindowStart = start;
|
||||||
|
signalReadCountsBuffer.clear();
|
||||||
|
controlReadCountsBuffer.clear();
|
||||||
|
signalReadCountsBuffer.add(signalCountsInCurrWindow.clone());
|
||||||
|
controlReadCountsBuffer.add(controlCountsInCurrWindow.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Delayed print: the window starting at 'start' will be added to the print buffer; if the window is close enough
|
||||||
|
* to the current contents if the buffer, the addition will result in merging the window with the buffer;
|
||||||
|
* otherwise, the old contents of the buffer will be printed and the buffer will be re-initialized with new window.
|
||||||
|
* It is assumed that counters are in synch with the start position passed to this method.
|
||||||
|
* @param start
|
||||||
|
*/
|
||||||
|
private void emitWindow(long start) {
|
||||||
|
// System.out.println("Emitting at "+start);
|
||||||
|
|
||||||
|
if ( regionStart == -1 ) { // we did not keep any region so far; initialize the buffer and return, will print later
|
||||||
|
resetWindowMergingBuffer(start);
|
||||||
|
addedSinceLastEmit = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( start > lastWindowStart + MERGE_CUTOFF ) {
|
||||||
|
// this loop is a dummy: we have already cleared those unneeded
|
||||||
|
// counts in shiftWindows(); stays here to avoid generating bugs later
|
||||||
|
// if we change something in shiftWindows()
|
||||||
|
for ( ; addedSinceLastEmit > 0 ; addedSinceLastEmit-- ) {
|
||||||
|
signalReadCountsBuffer.remove(signalReadCountsBuffer.size()-1);
|
||||||
|
controlReadCountsBuffer.remove(controlReadCountsBuffer.size()-1);
|
||||||
|
}
|
||||||
|
printRegion();
|
||||||
|
resetWindowMergingBuffer(start);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// the current window is too close to the previous one: we have to merge;
|
||||||
|
// NOTE: if window is too close, bufferAccepts() returned true, so the counts are already
|
||||||
|
// added.
|
||||||
|
lastWindowStart = start;
|
||||||
|
addedSinceLastEmit = 0;
|
||||||
|
// signalReadCountsBuffer.add(uniqueSignalReads);
|
||||||
|
// controlReadCountsBuffer.add(uniqueControlReads);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean bufferAccepts(long pos) {
|
||||||
|
return ( regionStart != -1 && pos <= lastWindowStart+MERGE_CUTOFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private void printRegion() {
|
||||||
|
if ( regionStart == -1 ) return;
|
||||||
|
|
||||||
|
long regionStop = lastWindowStart+WINDOW_SIZE-1;
|
||||||
|
|
||||||
|
double[] tmpEnrU = new double[signalReadCountsBuffer.size()];
|
||||||
|
int[] tmpSignalU = new int[signalReadCountsBuffer.size()];
|
||||||
|
int[] tmpControlU = new int[signalReadCountsBuffer.size()];
|
||||||
|
double[] tmpEnrNU = new double[signalReadCountsBuffer.size()];
|
||||||
|
int[] tmpSignalNU = new int[signalReadCountsBuffer.size()];
|
||||||
|
int[] tmpControlNU = new int[signalReadCountsBuffer.size()];
|
||||||
|
|
||||||
|
double[] tmpFWDSignalFracU = new double[signalReadCountsBuffer.size()];
|
||||||
|
double[] tmpFWDControlFracU = new double[signalReadCountsBuffer.size()];
|
||||||
|
double[] tmpFWDSignalFracNU = new double[signalReadCountsBuffer.size()];
|
||||||
|
double[] tmpFWDControlFracNU = new double[signalReadCountsBuffer.size()];
|
||||||
|
|
||||||
|
int lastInd = signalReadCountsBuffer.size() - 1;
|
||||||
|
|
||||||
|
// out.println("Size="+signalReadCountsBuffer.size()+":");
|
||||||
|
|
||||||
|
for ( int i = 0 ; i <= lastInd ; i++ ) {
|
||||||
|
tmpEnrU[i]= ( ((double) signalReadCountsBuffer.get(i).uniqueReads) / (controlReadCountsBuffer.get(i).uniqueReads+1.0 ) ) / COVERAGE_FACTOR ;
|
||||||
|
|
||||||
|
tmpSignalU[i] = signalReadCountsBuffer.get(i).uniqueReads;
|
||||||
|
tmpControlU[i] = controlReadCountsBuffer.get(i).uniqueReads;
|
||||||
|
|
||||||
|
tmpEnrNU[i]= ( ((double) signalReadCountsBuffer.get(i).nonUniqueReads) / (controlReadCountsBuffer.get(i).nonUniqueReads+1.0 ) ) / COVERAGE_FACTOR_NU ;
|
||||||
|
|
||||||
|
tmpSignalNU[i] = signalReadCountsBuffer.get(i).nonUniqueReads;
|
||||||
|
tmpControlNU[i] = controlReadCountsBuffer.get(i).nonUniqueReads;
|
||||||
|
|
||||||
|
tmpFWDSignalFracU[i] = signalReadCountsBuffer.get(i).uniqueReads > 0 ? ( ((double)signalReadCountsBuffer.get(i).uniqueFWDReads) / signalReadCountsBuffer.get(i).uniqueReads ) : 0.5;
|
||||||
|
tmpFWDControlFracU[i] = controlReadCountsBuffer.get(i).uniqueReads > 0 ? ( ((double)controlReadCountsBuffer.get(i).uniqueFWDReads) / controlReadCountsBuffer.get(i).uniqueReads ) : 0.5;
|
||||||
|
tmpFWDSignalFracNU[i] = signalReadCountsBuffer.get(i).nonUniqueReads > 0 ? ( ((double)signalReadCountsBuffer.get(i).nonUniqueFWDReads) / signalReadCountsBuffer.get(i).nonUniqueReads ) : 0.5;
|
||||||
|
tmpFWDControlFracNU[i] = controlReadCountsBuffer.get(i).nonUniqueReads > 0 ? ( ((double)controlReadCountsBuffer.get(i).nonUniqueFWDReads) / controlReadCountsBuffer.get(i).nonUniqueReads ) : 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
Arrays.sort(tmpEnrU);
|
||||||
|
Arrays.sort(tmpSignalU);
|
||||||
|
Arrays.sort(tmpControlU);
|
||||||
|
|
||||||
|
Arrays.sort(tmpEnrNU);
|
||||||
|
Arrays.sort(tmpSignalNU);
|
||||||
|
Arrays.sort(tmpControlNU);
|
||||||
|
|
||||||
|
Arrays.sort(tmpFWDSignalFracU);
|
||||||
|
Arrays.sort(tmpFWDControlFracU);
|
||||||
|
Arrays.sort(tmpFWDSignalFracNU);
|
||||||
|
Arrays.sort(tmpFWDControlFracNU);
|
||||||
|
|
||||||
|
|
||||||
|
out.print(currentContig+":"+regionStart+"-"+regionStop+"\t"+
|
||||||
|
(regionStop-regionStart+1) +"\t"+
|
||||||
|
"signal_unique:"+ tmpSignalU[0]+"-"+ tmpSignalU[lastInd/2]+"-"+ tmpSignalU[lastInd]+"\t"+
|
||||||
|
"control_unique:"+ tmpControlU[0]+"-"+ tmpControlU[lastInd/2]+"-"+ tmpControlU[lastInd]);
|
||||||
|
|
||||||
|
out.printf("\tsignal_fwd_frac_unique:%.1f-%.1f-%.1f",tmpFWDSignalFracU[0],tmpFWDSignalFracU[lastInd/2],tmpFWDSignalFracU[lastInd]);
|
||||||
|
out.printf("\tcontrol_fwd_frac_unique:%.1f-%.1f-%.1f",tmpFWDControlFracU[0],tmpFWDControlFracU[lastInd/2],tmpFWDControlFracU[lastInd]);
|
||||||
|
|
||||||
|
out.print("\tsignal_nonnunique:"+ tmpSignalNU[0]+"-"+ tmpSignalNU[lastInd/2]+"-"+ tmpSignalNU[lastInd]+"\t"+
|
||||||
|
"control_nonunique:"+ tmpControlNU[0]+"-"+ tmpControlNU[lastInd/2]+"-"+ tmpControlNU[lastInd]);
|
||||||
|
|
||||||
|
out.printf("\tsignal_fwd_frac_nonunique:%.1f-%.1f-%.1f",tmpFWDSignalFracNU[0],tmpFWDSignalFracNU[lastInd/2],tmpFWDSignalFracNU[lastInd]);
|
||||||
|
out.printf("\tcontrol_fwd_frac_nonunique:%.1f-%.1f-%.1f",tmpFWDControlFracNU[0],tmpFWDControlFracNU[lastInd/2],tmpFWDControlFracNU[lastInd]);
|
||||||
|
|
||||||
|
out.printf("\tnorm_enrichment_unique:%.2f-%.2f-%.2f",tmpEnrU[0],tmpEnrU[lastInd/2],tmpEnrU[lastInd]);
|
||||||
|
out.printf("\tnorm_enrichment_nonunique:%.2f-%.2f-%.2f",tmpEnrNU[0],tmpEnrNU[lastInd/2],tmpEnrNU[lastInd]);
|
||||||
|
|
||||||
|
// if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER");
|
||||||
|
out.println();
|
||||||
|
|
||||||
|
regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty
|
||||||
|
// System.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateWindowMergingBuffer(long i) {
|
||||||
|
if ( bufferAccepts(i) ) {
|
||||||
|
// we are not too far away from last window added to the buffer that actually passed significance criteria;
|
||||||
|
// in this case we have to keep buffering since another significant window may be encountered soon
|
||||||
|
// System.out.println("Updating buffer at "+i+" with "+ uniqueSignalReads);
|
||||||
|
signalReadCountsBuffer.add(signalCountsInCurrWindow.clone());
|
||||||
|
controlReadCountsBuffer.add(controlCountsInCurrWindow.clone());
|
||||||
|
addedSinceLastEmit++;
|
||||||
|
} else {
|
||||||
|
// we are too far from the last significant window; if another significant window comes later, it will not
|
||||||
|
// be merged into this region but will start a new one. In this case we have to erase all the counts we have been
|
||||||
|
// saving since the last significant window (the latter is where the current region is going to end!)
|
||||||
|
for ( ; addedSinceLastEmit > 0 ; addedSinceLastEmit-- ) {
|
||||||
|
signalReadCountsBuffer.remove(signalReadCountsBuffer.size()-1);
|
||||||
|
controlReadCountsBuffer.remove(controlReadCountsBuffer.size()-1);
|
||||||
|
}
|
||||||
|
printRegion(); // print current region right away, why not? next significant window will start new region for sure.
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void shiftWindows(long pos) {
|
||||||
|
// we shift windows when there is a read that does not fit into the current window.
|
||||||
|
// the position, to which the shift is performed, is the first position such that the new read
|
||||||
|
// can be accomodated. Hence we can safely slide up to pos, only discarding reads that go out of scope -
|
||||||
|
// we are guaranteed that there will be no new reads to add until we reach pos.
|
||||||
|
|
||||||
|
|
||||||
|
for ( long i = currentWindow.getStart() ; i < pos ; i++ ) {
|
||||||
|
// if ( readsInSignalWindow.size() == 0 ) {
|
||||||
|
// i = pos-1;
|
||||||
|
// continue;
|
||||||
|
// };
|
||||||
|
|
||||||
|
// if ( readsInSignalWindow.getFirst().getAlignmentStart() > i ) {
|
||||||
|
// i = readsInSignalWindow.getFirst().getAlignmentStart() - 1; // jump directly to next read position
|
||||||
|
// continue;
|
||||||
|
// }
|
||||||
|
|
||||||
|
purgeSignal(i); // remove all the reads that start before current position i (and update all the counters)
|
||||||
|
purgeControl(i);
|
||||||
|
|
||||||
|
updateWindowMergingBuffer(i);
|
||||||
|
|
||||||
|
if ( ( controlCountsInCurrWindow.uniqueReads + 1 ) * ENRICHMENT_CUTOFF < MIN_SIGNAL ) {
|
||||||
|
// too few control reads
|
||||||
|
if ( signalCountsInCurrWindow.uniqueReads >= MIN_SIGNAL ) {
|
||||||
|
// emit signal only if it is higher that hard cut-off:
|
||||||
|
emitWindow(i); // print current window (print can be buffered and delayed!)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// enough control reads;
|
||||||
|
// check for actual enrichment:
|
||||||
|
if ( ((double) signalCountsInCurrWindow.uniqueReads) / (controlCountsInCurrWindow.uniqueReads+1.0) > ENRICHMENT_CUTOFF ) {
|
||||||
|
emitWindow(i); // print current window (print can be buffered and delayed!)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// we emitted intermediate windows up to pos-1 as/if needed and purged everything that starts before pos-1
|
||||||
|
// now we have to purge everything that starts before pos and return (no emitting yet, as we are about to add a read upon return):
|
||||||
|
|
||||||
|
purgeSignal(pos);
|
||||||
|
purgeControl(pos);
|
||||||
|
|
||||||
|
currentWindow = GenomeLocParser.createGenomeLoc(currentWindow.getContigIndex(),pos,pos+WINDOW_SIZE-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void initialize() {
|
||||||
|
int nSams = getToolkit().getArguments().samFiles.size();
|
||||||
|
|
||||||
|
if ( nSams != 2 ) {
|
||||||
|
out.println("ERROR: two input bam files (signal and backround control) must be specified");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
|
||||||
|
signalReadGroups = readGroupSets.get(0);
|
||||||
|
// System.out.println(signalReadGroups.size()+" read groups in signal");
|
||||||
|
controlReadGroups = readGroupSets.get(1);
|
||||||
|
// System.out.println(controlReadGroups.size()+" read groups in control");
|
||||||
|
|
||||||
|
currentWindow = GenomeLocParser.createGenomeLoc(0,1,WINDOW_SIZE);
|
||||||
|
readsInSignalWindow = new LinkedList<SAMRecord>();
|
||||||
|
readsInControlWindow = new LinkedList<SAMRecord>();
|
||||||
|
|
||||||
|
MERGE_CUTOFF = WINDOW_SIZE;
|
||||||
|
ENRICHMENT_CUTOFF *= COVERAGE_FACTOR;
|
||||||
|
currentContig = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(0).getSequenceName();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Integer map(char[] ref, SAMRecord read) {
|
||||||
|
|
||||||
|
if ( AlignmentUtils.isReadUnmapped(read) ) return 0;
|
||||||
|
|
||||||
|
if ( read.getReferenceIndex() > currentWindow.getContigIndex() ) {
|
||||||
|
printRegion(); // print all we had on the previous contig
|
||||||
|
|
||||||
|
currentWindow = GenomeLocParser.createGenomeLoc(read.getReferenceIndex(),
|
||||||
|
read.getAlignmentStart(),
|
||||||
|
read.getAlignmentStart()+WINDOW_SIZE-1);
|
||||||
|
currentContig = read.getReferenceName();
|
||||||
|
resetWindows();
|
||||||
|
} else {
|
||||||
|
// we are on the same contig
|
||||||
|
if ( read.getAlignmentEnd() > currentWindow.getStop() ) {
|
||||||
|
// can not accomodate the read inside the current window - shift!
|
||||||
|
// System.out.println("read ends at "+read.getAlignmentEnd()+" window ends at "+currentWindow.getStop()+ " shifting to "+ (currentWindow.getStart() + ( read.getAlignmentEnd() - currentWindow.getStop() )) +" ("+uniqueSignalReads+"/"+uniqueControlReads+")");
|
||||||
|
|
||||||
|
// while shifting the window, the following method will issue (delayed) print commands for
|
||||||
|
// all intermediate windows that pass significance criteria:
|
||||||
|
shiftWindows(currentWindow.getStart() + ( read.getAlignmentEnd() - currentWindow.getStop() ));
|
||||||
|
}
|
||||||
|
// now the read will fit into the window
|
||||||
|
}
|
||||||
|
|
||||||
|
// at this point we are guaranteed that the read will fit into the window
|
||||||
|
|
||||||
|
if ( signalReadGroups.contains( read.getReadGroup().getReadGroupId() ) ) {
|
||||||
|
addSignal(read);
|
||||||
|
} else if ( controlReadGroups.contains( read.getReadGroup().getReadGroupId() )) {
|
||||||
|
addControl(read);
|
||||||
|
} else {
|
||||||
|
throw new StingException("Read "+read + " belongs to unrecognized read group");
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provide an initial value for reduce computations.
|
||||||
|
*
|
||||||
|
* @return Initial value of reduce.
|
||||||
|
*/
|
||||||
|
public Integer reduceInit() {
|
||||||
|
return 0; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||||
|
*
|
||||||
|
* @param value result of the map.
|
||||||
|
* @param sum accumulator for the reduce.
|
||||||
|
* @return accumulator with result of the map taken into account.
|
||||||
|
*/
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return value+sum; //To change body of implemented methods use File | Settings | File Templates.
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Auxiliary class that encapsulates the task of monitoring counts of various read traits in some set of reads
|
||||||
|
* (for instance, reads in the current window). Counted traits include uniquely/non-uniquely mapped reads,
|
||||||
|
* forward-strand aligned reads etc.
|
||||||
|
*/
|
||||||
|
class WindowStats implements Cloneable {
|
||||||
|
public int uniqueReads = 0;
|
||||||
|
public int nonUniqueReads = 0;
|
||||||
|
public int uniqueFWDReads = 0;
|
||||||
|
public int nonUniqueFWDReads = 0;
|
||||||
|
|
||||||
|
/** Reset all counts to 0 */
|
||||||
|
public void clear() {
|
||||||
|
uniqueReads = nonUniqueReads = uniqueFWDReads = nonUniqueFWDReads = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Examines the read and increments the counts for all the monitored traits observed in this read. */
|
||||||
|
public void addRead(SAMRecord r) {
|
||||||
|
if ( r.getMappingQuality() == 0 ) {
|
||||||
|
// nonunique
|
||||||
|
nonUniqueReads++;
|
||||||
|
if ( ! r.getReadNegativeStrandFlag() ) nonUniqueFWDReads++;
|
||||||
|
} else {
|
||||||
|
// unique
|
||||||
|
uniqueReads++;
|
||||||
|
if ( ! r.getReadNegativeStrandFlag() ) uniqueFWDReads++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Examines the read and decrements the counts for all the monitored traits observed in this read. */
|
||||||
|
public void removeRead(SAMRecord r) {
|
||||||
|
if ( r.getMappingQuality() == 0 ) {
|
||||||
|
// nonunique
|
||||||
|
nonUniqueReads--;
|
||||||
|
if ( ! r.getReadNegativeStrandFlag() ) nonUniqueFWDReads--;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// unique
|
||||||
|
uniqueReads--;
|
||||||
|
if ( ! r.getReadNegativeStrandFlag() ) uniqueFWDReads--;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** allocates new object, copies this object into it, and returns the copy */
|
||||||
|
public WindowStats clone() {
|
||||||
|
WindowStats ret = new WindowStats();
|
||||||
|
ret.uniqueReads = this.uniqueReads;
|
||||||
|
ret.nonUniqueReads = this.nonUniqueReads;
|
||||||
|
ret.uniqueFWDReads = this.uniqueFWDReads;
|
||||||
|
ret.nonUniqueFWDReads = this.nonUniqueFWDReads;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue