gatk-3.8/archive/java/src/org/broadinstitute/sting/DSBWalkerV2.java

361 lines
20 KiB
Java

package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.CircularArray;
import org.broadinstitute.sting.utils.PrimitivePair;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.Set;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Dec 12, 2009
* Time: 2:25:44 PM
* To change this template use File | Settings | File Templates.
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class DSBWalkerV2 extends LocusWalker<Integer,Integer> {
// @Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true)
// int COV_CUTOFF = 0;
// @Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false)
// int MINLENGTH_CUTOFF = 0;
@Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true)
int WINDOW_SIZE = 100;
@Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true)
double ENRICHMENT_CUTOFF = 5.0;
@Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+
"(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+
"so that control*enrichmentCutoff is too low to be convincing)",required=true)
int MIN_SIGNAL = 10;
private CircularArray<PrimitivePair.Int> signalWindow = null;
private CircularArray<PrimitivePair.Int> controlWindow = null;
private CircularArray<PrimitivePair.Int> signalStrandsWindow = null;
private CircularArray<PrimitivePair.Int> controlStrandsWindow = null;
private PrimitivePair.Long totalSignalCoverage = new PrimitivePair.Long();
private PrimitivePair.Long totalControlCoverage = new PrimitivePair.Long();
private PrimitivePair.Long totalSignalFwdStrands = new PrimitivePair.Long();
private PrimitivePair.Long totalControlFwdStrands = new PrimitivePair.Long();
private Set<String> signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able
private Set<String> controlReadGroups ; // to properly assign the reads coming from a merged stream
private long windowStart = -1;
private long windowStop = -1;
private int curContig = -1;
private String curContigName = "";
// the following variables are for buffering and merging windows :
private long regionStart = -1;
private long lastWindowStart = -1;
private PrimitivePair.Int maxSignalReads = new PrimitivePair.Int();
private PrimitivePair.Int minSignalReads = new PrimitivePair.Int();
private PrimitivePair.Int maxControlReads = new PrimitivePair.Int();
private PrimitivePair.Int minControlReads = new PrimitivePair.Int();
private double minEnrichmentUnique;
private double maxEnrichmentUnique;
private double minEnrichmentNonUnique;
private double maxEnrichmentNonUnique;
private double minEnrichmentTotal;
private double maxEnrichmentTotal;
private double minUniqueSignalStrandBalance = 0.0;
private double maxUniqueSignalStrandBalance = 0.0;
private double minNonUniqueSignalStrandBalance = 0.0;
private double maxNonUniqueSignalStrandBalance = 0.0;
private double minUniqueControlStrandBalance = 0.0;
private double maxUniqueControlStrandBalance = 0.0;
private double minNonUniqueControlStrandBalance = 0.0;
private double maxNonUniqueControlStrandBalance = 0.0;
@Override
public void initialize() {
int nSams = getToolkit().getArguments().samFiles.size();
if ( nSams != 2 ) {
out.println("ERROR: two input bam files (signal and backround control) must be specified");
System.exit(1);
}
List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
signalReadGroups = readGroupSets.get(0);
// System.out.println(signalReadGroups.size()+" read groups in signal");
controlReadGroups = readGroupSets.get(1);
// System.out.println(controlReadGroups.size()+" read groups in control");
signalWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
controlWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
signalStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
controlStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
ReadBackedPileup pileup = context.getPileup();
List<SAMRecord> reads = pileup.getReads();
// compute coverages at the current site:
PrimitivePair.Int signalCov = new PrimitivePair.Int();
PrimitivePair.Int controlCov = new PrimitivePair.Int();
PrimitivePair.Int signalFwdStrands = new PrimitivePair.Int();
PrimitivePair.Int controlFwdStrands = new PrimitivePair.Int();
for ( SAMRecord r : reads ) {
if ( signalReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
if ( r.getMappingQuality() == 0 ) {
signalCov.second++;
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.second++;
}
else {
signalCov.first++;
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.first++;
}
} else {
if ( controlReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
if ( r.getMappingQuality() == 0 ) {
controlCov.second++;
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.second++;
}
else {
controlCov.first++;
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.first++;
}
} else {
throw new StingException("Read "+r+" belongs to unknown read group ("+r.getReadGroup()+")");
}
}
}
GenomeLoc loc = context.getLocation();
// if ( curContig != 0 ) System.out.println(loc+" "+signalCov.first+" "+signalCov.second+" "+controlCov.first+" "+controlCov.second);
if ( loc.getContigIndex() != curContig || loc.getStart() >= windowStop+WINDOW_SIZE ) {
// we jumped to the next contig, or we are on the same contig but the current position is
// more than WINDOW_SIZE away from the current window's end (i.e. there's nothing to shift)
checkCurrentWindow(true);
if ( loc.getContigIndex() != curContig ) {
System.out.println("on contig "+loc.getContig());
}
curContig = loc.getContigIndex();
curContigName = loc.getContig();
// prevPos = loc.getStart();
windowStart = loc.getStart();
windowStop = windowStart + WINDOW_SIZE - 1;
signalWindow.clear();
controlWindow.clear();
totalSignalCoverage.assignFrom( signalCov );
totalControlCoverage.assignFrom( controlCov );
totalSignalFwdStrands.assignFrom( signalFwdStrands );
totalControlFwdStrands.assignFrom( controlFwdStrands );
signalWindow.set(0,signalCov);
controlWindow.set(0,controlCov);
signalStrandsWindow.set(0,signalFwdStrands);
controlStrandsWindow.set(0,controlFwdStrands);
return 1;
}
// offset of the current position w.r.t. the start of the window:
int offset = (int)(loc.getStart() - windowStart);
if ( offset >= WINDOW_SIZE ) {
// if we are here, the current position is outside of the current window, but not
// far enough so that we'd need to reinitialize the window from scratch (that was already checked above).
// Now we need to shift.
// We are receiving covered positions in order, so we are guaranteed that everything prior to
// the current position was already counted; if some elements of the windows are still nulls, it means
// there was no coverage there
int shift = offset - WINDOW_SIZE + 1;
// scroll the window(s) base by base until the current position is inside the window. At each step
// we will check if the window meets the requirements and should be printed out.
for ( int i = 0 ; i < shift ; i++ ) {
// we are going to shift; check if the window as it is now is worth printing
checkCurrentWindow(false);
// discard coverage from the first element of the window (this element is about to be shifted out of scope)
if ( signalWindow.get(0) != null ) totalSignalCoverage.subtract(signalWindow.get(0));
if ( signalStrandsWindow.get(0) != null ) totalSignalFwdStrands.subtract(signalStrandsWindow.get(0));
if ( controlWindow.get(0) != null ) totalControlCoverage.subtract(controlWindow.get(0));
if ( controlStrandsWindow.get(0) != null ) totalControlFwdStrands.subtract(controlStrandsWindow.get(0));
// advnace window coordinates on the ref
windowStart++;
windowStop++;
// shift the data in the window(s):
signalWindow.shiftData(1);
controlWindow.shiftData(1);
signalStrandsWindow.shiftData(1);
controlStrandsWindow.shiftData(1);
offset--; // this is the new offset w.r.t. to the shifted window
}
}
// at this point, either the current position was inside the current window, or it was outside,
// but the window was already shifted
totalSignalCoverage.add(signalCov);
totalControlCoverage.add(controlCov);
totalSignalFwdStrands.add(signalFwdStrands);
totalControlFwdStrands.add(controlFwdStrands);
signalWindow.set(offset,signalCov);
controlWindow.set(offset,controlCov);
signalStrandsWindow.set(offset,signalFwdStrands);
controlStrandsWindow.set(offset,controlFwdStrands);
return 1;
}
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
public Integer reduceInit() {
return 0; //To change body of implemented methods use File | Settings | File Templates.
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return accumulator with result of the map taken into account.
*/
public Integer reduce(Integer value, Integer sum) {
return sum+value; //To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void onTraversalDone(Integer result) {
printRegion();
super.onTraversalDone(result);
}
/** Checks if the currently held window satisfies the conditions set up for significance, and invokes buffered printout if so.
* If the parameter is set to true, printout of previously held region is forced, and the buffer is reinitialized with
* the new window if it passes the cutoffs, or left empty.
*
*/
private void checkCurrentWindow(boolean force) {
if ( force ) printRegion();
if ( signalWindow.get(0) == null && controlWindow.get(0) == null ) return; // do not emit windows that start from empty cell; we will get them later
if ( totalControlCoverage.first * ENRICHMENT_CUTOFF / 36.0 < MIN_SIGNAL ) { // control coverage zero or too low
if ( totalSignalCoverage.first /28.0 > MIN_SIGNAL ) emitWindow(false); // require at least MIN_SIGNAL coverage for signal
return;
}
// if we have decent coverage in control, just check for required enrichment in the signal
if ( ((double)totalSignalCoverage.first/28.0) / (totalControlCoverage.first/36.0) > ENRICHMENT_CUTOFF ) emitWindow(false);
}
/** This is actually a delayed print command: it buffers the successive windows set for printout, merges the windows that
* are close enough and prints only when a train of close-by windows has ended and next window received is far enough
*/
private void emitWindow(boolean force) {
if ( regionStart == -1 ) {
resetBuffer();
return;
}
if ( force || windowStart > lastWindowStart + WINDOW_SIZE ) {
// new window is far enough from the region we were buffering: emit old region
printRegion();
resetBuffer();
return;
}
// current window is too close (overlapping) with a previous one: we need to merge
lastWindowStart = windowStart;
maxSignalReads.first = Math.max(maxSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
maxSignalReads.second = Math.max(maxSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
minSignalReads.first = Math.min(minSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
minSignalReads.second = Math.min(minSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
maxControlReads.first = Math.max(maxControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
maxControlReads.second = Math.max(maxControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
minControlReads.first = Math.min(minControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
minControlReads.second = Math.min(minControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
maxEnrichmentUnique = Math.max(maxEnrichmentUnique,((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
minEnrichmentUnique = Math.min(minEnrichmentUnique, ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
maxEnrichmentNonUnique = Math.max(maxEnrichmentNonUnique,((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0));
minEnrichmentNonUnique = Math.min( minEnrichmentNonUnique, ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0) );
maxEnrichmentTotal = Math.max( maxEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
minEnrichmentTotal = Math.min( minEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
maxUniqueSignalStrandBalance = Math.max(maxUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
minUniqueSignalStrandBalance = Math.min(minUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
maxNonUniqueSignalStrandBalance = Math.max(maxNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
minNonUniqueSignalStrandBalance = Math.min(minNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
maxUniqueControlStrandBalance = Math.max(maxUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
minUniqueControlStrandBalance = Math.min(minUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
maxNonUniqueControlStrandBalance = Math.max(maxNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
minNonUniqueControlStrandBalance = Math.min(minNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
}
private void resetBuffer() {
regionStart = windowStart;
lastWindowStart = windowStart;
maxSignalReads.first = (int)Math.round(totalSignalCoverage.first/28.0);
maxSignalReads.second = (int)Math.round(totalSignalCoverage.second/28.0);
minSignalReads.assignFrom(maxSignalReads);
maxControlReads.first = (int)Math.round(totalControlCoverage.first/36.0);
maxControlReads.second = (int)Math.round(totalControlCoverage.second/36.0);
minControlReads.assignFrom(maxControlReads);
minEnrichmentUnique = maxEnrichmentUnique = ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0);
minEnrichmentNonUnique = maxEnrichmentNonUnique = ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0);
minEnrichmentTotal = maxEnrichmentTotal = ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
((totalControlCoverage.first+ totalControlCoverage.second)/36.0);
minUniqueSignalStrandBalance = maxUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.first)/totalSignalCoverage.first;
minNonUniqueSignalStrandBalance = maxNonUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.second)/totalSignalCoverage.second;
minUniqueControlStrandBalance = maxUniqueControlStrandBalance = ((double)totalControlFwdStrands.first)/totalControlCoverage.first;
minNonUniqueControlStrandBalance = maxNonUniqueControlStrandBalance = ((double)totalControlFwdStrands.second)/totalControlCoverage.second;
}
private void printRegion() {
if ( regionStart == -1 ) return;
out.print(curContigName+":"+regionStart+"-"+windowStop+"\t"+(windowStop-regionStart+1) +"\t"+
minSignalReads.first+"-"+maxSignalReads.first+"\t"+
minSignalReads.second+"-"+maxSignalReads.second+"\t"+
minControlReads.first+"-"+maxControlReads.first+"\t"+
minControlReads.second+"-"+maxControlReads.second+"\t");
out.printf("%.2f-%.2f\t",minEnrichmentUnique,maxEnrichmentUnique);
out.printf("%.2f-%.2f\t",minEnrichmentNonUnique,maxEnrichmentNonUnique);
out.printf("%.2f-%.2f\t",minEnrichmentTotal,maxEnrichmentTotal);
out.printf("%.2f-%.2f\t",minUniqueSignalStrandBalance,maxUniqueSignalStrandBalance);
out.printf("%.2f-%.2f\t",minNonUniqueSignalStrandBalance,maxNonUniqueSignalStrandBalance);
out.printf("%.2f-%.2f\t",minUniqueControlStrandBalance,maxUniqueControlStrandBalance);
out.printf("%.2f-%.2f",minNonUniqueControlStrandBalance,maxNonUniqueControlStrandBalance);
if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER");
out.println();
regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty
}
}