Oneoff project, totally unrelated to anything

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2776 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2010-02-04 19:44:50 +00:00
parent 334da80e8b
commit f7e7bcd2ef
3 changed files with 968 additions and 0 deletions

View File

@ -0,0 +1,167 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import net.sf.samtools.SAMRecord;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Dec 3, 2009
* Time: 11:54:35 AM
* To change this template use File | Settings | File Templates.
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class DSBWalker extends LocusWalker<Integer,Integer> {
@Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true)
int COV_CUTOFF = 0;
@Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false)
int MINLENGTH_CUTOFF = 0;
private int MERGE_DIST = 300; // merge intervals that are closer than this distance from one another
private long maxcov = 0;
private long maxz = 0;
private long mergedmaxcov = 0;
private long mergedmaxz = 0;
GenomeLoc mergedInterval = null;
GenomeLoc currentInterval = null;
private long nIntervals = 0;
private void emit(GenomeLoc l) {
if ( mergedInterval == null ) {
mergedInterval = l.clone();
mergedmaxcov = maxcov;
mergedmaxz = maxz;
return;
}
if ( mergedInterval.getContigIndex() != l.getContigIndex() ) {
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
if ( length >= MINLENGTH_CUTOFF ) {
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
nIntervals++;
}
mergedInterval = l.clone();
mergedmaxcov = maxcov;
mergedmaxz = maxz;
return;
}
// merged interval exists and new interval is on the same contig. Check if the new interval
// is close enough so we got to merge and keep waiting:
if ( l.getStart() - mergedInterval.getStop() < MERGE_DIST ) {
mergedInterval = GenomeLocParser.setStop(mergedInterval,l.getStop());
if ( maxcov > mergedmaxcov) mergedmaxcov = maxcov;
if ( maxz > mergedmaxz ) mergedmaxz = maxz;
return;
}
// nope, new interval is far enough. Print old one and keep current one.
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
if ( length >= MINLENGTH_CUTOFF ) {
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
nIntervals++;
}
mergedInterval = l.clone();
mergedmaxcov = maxcov;
mergedmaxz = maxz;
}
public void onTraversalDone() {
if ( mergedInterval != null ) {
long length = mergedInterval.getStop()-mergedInterval.getStart()+1;
if ( length >= MINLENGTH_CUTOFF ) {
out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval
nIntervals++;
}
}
System.out.println(nIntervals+" intervals detected.");
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
ReadBackedPileup pileup = context.getPileup();
List<SAMRecord> reads = pileup.getReads();
int nZero = pileup.getNumberOfMappingQualityZeroReads();
int nonZCoverage = reads.size() - nZero;
if ( nonZCoverage >= COV_CUTOFF ) {
// if we were not inside an interval, start one:
if ( currentInterval == null ) {
maxcov = nonZCoverage;
maxz = nZero;
currentInterval = context.getLocation().clone();
// System.out.println("Setting current to "+currentInterval);
return 0;
}
// if we were inside an interval and we just jumped onto a new contig, get rid of the old interval
if ( currentInterval.compareContigs(context.getLocation()) != 0 ) {
// we just moved to a new contig
System.out.println("On contig "+context.getLocation().getContig());
emit(currentInterval);
maxcov = nonZCoverage;
maxz = nZero;
currentInterval = context.getLocation().clone();
return 0;
}
// we are on the same contig, we are within the interval, so we need to extend the current interval:
currentInterval = GenomeLocParser.setStop(currentInterval,context.getLocation().getStop()); // still within the interval, adjust stop
//System.out.println("Extending current to "+currentInterval +" ("+context.getLocation()+", "+context.getLocation().getStop()+")");
if ( nonZCoverage > maxcov ) maxcov = nonZCoverage; // adjust maxcov
if ( nZero > maxz ) maxz = nZero; // adjust maxz
} else {
// low coverage, if we were inside an interval, it stops now:
if ( currentInterval != null ) {
// System.out.println("Emitting current as "+currentInterval);
emit(currentInterval);
currentInterval = null;
maxcov = 0;
maxz = 0;
}
}
return 0;
}
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
public Integer reduceInit() {
return 0; //To change body of implemented methods use File | Settings | File Templates.
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return accumulator with result of the map taken into account.
*/
public Integer reduce(Integer value, Integer sum) {
return sum+value; //To change body of implemented methods use File | Settings | File Templates.
}
}

View File

@ -0,0 +1,360 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.CircularArray;
import org.broadinstitute.sting.utils.PrimitivePair;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.Set;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Dec 12, 2009
* Time: 2:25:44 PM
* To change this template use File | Settings | File Templates.
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class DSBWalkerV2 extends LocusWalker<Integer,Integer> {
// @Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true)
// int COV_CUTOFF = 0;
// @Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false)
// int MINLENGTH_CUTOFF = 0;
@Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true)
int WINDOW_SIZE = 100;
@Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true)
double ENRICHMENT_CUTOFF = 5.0;
@Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+
"(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+
"so that control*enrichmentCutoff is too low to be convincing)",required=true)
int MIN_SIGNAL = 10;
private CircularArray<PrimitivePair.Int> signalWindow = null;
private CircularArray<PrimitivePair.Int> controlWindow = null;
private CircularArray<PrimitivePair.Int> signalStrandsWindow = null;
private CircularArray<PrimitivePair.Int> controlStrandsWindow = null;
private PrimitivePair.Long totalSignalCoverage = new PrimitivePair.Long();
private PrimitivePair.Long totalControlCoverage = new PrimitivePair.Long();
private PrimitivePair.Long totalSignalFwdStrands = new PrimitivePair.Long();
private PrimitivePair.Long totalControlFwdStrands = new PrimitivePair.Long();
private Set<String> signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able
private Set<String> controlReadGroups ; // to properly assign the reads coming from a merged stream
private long windowStart = -1;
private long windowStop = -1;
private int curContig = -1;
private String curContigName = "";
// the following variables are for buffering and merging windows :
private long regionStart = -1;
private long lastWindowStart = -1;
private PrimitivePair.Int maxSignalReads = new PrimitivePair.Int();
private PrimitivePair.Int minSignalReads = new PrimitivePair.Int();
private PrimitivePair.Int maxControlReads = new PrimitivePair.Int();
private PrimitivePair.Int minControlReads = new PrimitivePair.Int();
private double minEnrichmentUnique;
private double maxEnrichmentUnique;
private double minEnrichmentNonUnique;
private double maxEnrichmentNonUnique;
private double minEnrichmentTotal;
private double maxEnrichmentTotal;
private double minUniqueSignalStrandBalance = 0.0;
private double maxUniqueSignalStrandBalance = 0.0;
private double minNonUniqueSignalStrandBalance = 0.0;
private double maxNonUniqueSignalStrandBalance = 0.0;
private double minUniqueControlStrandBalance = 0.0;
private double maxUniqueControlStrandBalance = 0.0;
private double minNonUniqueControlStrandBalance = 0.0;
private double maxNonUniqueControlStrandBalance = 0.0;
@Override
public void initialize() {
int nSams = getToolkit().getArguments().samFiles.size();
if ( nSams != 2 ) {
out.println("ERROR: two input bam files (signal and backround control) must be specified");
System.exit(1);
}
List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
signalReadGroups = readGroupSets.get(0);
// System.out.println(signalReadGroups.size()+" read groups in signal");
controlReadGroups = readGroupSets.get(1);
// System.out.println(controlReadGroups.size()+" read groups in control");
signalWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
controlWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
signalStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
controlStrandsWindow = new CircularArray<PrimitivePair.Int>(WINDOW_SIZE);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
ReadBackedPileup pileup = context.getPileup();
List<SAMRecord> reads = pileup.getReads();
// compute coverages at the current site:
PrimitivePair.Int signalCov = new PrimitivePair.Int();
PrimitivePair.Int controlCov = new PrimitivePair.Int();
PrimitivePair.Int signalFwdStrands = new PrimitivePair.Int();
PrimitivePair.Int controlFwdStrands = new PrimitivePair.Int();
for ( SAMRecord r : reads ) {
if ( signalReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
if ( r.getMappingQuality() == 0 ) {
signalCov.second++;
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.second++;
}
else {
signalCov.first++;
if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.first++;
}
} else {
if ( controlReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) {
if ( r.getMappingQuality() == 0 ) {
controlCov.second++;
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.second++;
}
else {
controlCov.first++;
if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.first++;
}
} else {
throw new StingException("Read "+r+" belongs to unknown read group ("+r.getReadGroup()+")");
}
}
}
GenomeLoc loc = context.getLocation();
// if ( curContig != 0 ) System.out.println(loc+" "+signalCov.first+" "+signalCov.second+" "+controlCov.first+" "+controlCov.second);
if ( loc.getContigIndex() != curContig || loc.getStart() >= windowStop+WINDOW_SIZE ) {
// we jumped to the next contig, or we are on the same contig but the current position is
// more than WINDOW_SIZE away from the current window's end (i.e. there's nothing to shift)
checkCurrentWindow(true);
if ( loc.getContigIndex() != curContig ) {
System.out.println("on contig "+loc.getContig());
}
curContig = loc.getContigIndex();
curContigName = loc.getContig();
// prevPos = loc.getStart();
windowStart = loc.getStart();
windowStop = windowStart + WINDOW_SIZE - 1;
signalWindow.clear();
controlWindow.clear();
totalSignalCoverage.assignFrom( signalCov );
totalControlCoverage.assignFrom( controlCov );
totalSignalFwdStrands.assignFrom( signalFwdStrands );
totalControlFwdStrands.assignFrom( controlFwdStrands );
signalWindow.set(0,signalCov);
controlWindow.set(0,controlCov);
signalStrandsWindow.set(0,signalFwdStrands);
controlStrandsWindow.set(0,controlFwdStrands);
return 1;
}
// offset of the current position w.r.t. the start of the window:
int offset = (int)(loc.getStart() - windowStart);
if ( offset >= WINDOW_SIZE ) {
// if we are here, the current position is outside of the current window, but not
// far enough so that we'd need to reinitialize the window from scratch (that was already checked above).
// Now we need to shift.
// We are receiving covered positions in order, so we are guaranteed that everything prior to
// the current position was already counted; if some elements of the windows are still nulls, it means
// there was no coverage there
int shift = offset - WINDOW_SIZE + 1;
// scroll the window(s) base by base until the current position is inside the window. At each step
// we will check if the window meets the requirements and should be printed out.
for ( int i = 0 ; i < shift ; i++ ) {
// we are going to shift; check if the window as it is now is worth printing
checkCurrentWindow(false);
// discard coverage from the first element of the window (this element is about to be shifted out of scope)
if ( signalWindow.get(0) != null ) totalSignalCoverage.subtract(signalWindow.get(0));
if ( signalStrandsWindow.get(0) != null ) totalSignalFwdStrands.subtract(signalStrandsWindow.get(0));
if ( controlWindow.get(0) != null ) totalControlCoverage.subtract(controlWindow.get(0));
if ( controlStrandsWindow.get(0) != null ) totalControlFwdStrands.subtract(controlStrandsWindow.get(0));
// advnace window coordinates on the ref
windowStart++;
windowStop++;
// shift the data in the window(s):
signalWindow.shiftData(1);
controlWindow.shiftData(1);
signalStrandsWindow.shiftData(1);
controlStrandsWindow.shiftData(1);
offset--; // this is the new offset w.r.t. to the shifted window
}
}
// at this point, either the current position was inside the current window, or it was outside,
// but the window was already shifted
totalSignalCoverage.add(signalCov);
totalControlCoverage.add(controlCov);
totalSignalFwdStrands.add(signalFwdStrands);
totalControlFwdStrands.add(controlFwdStrands);
signalWindow.set(offset,signalCov);
controlWindow.set(offset,controlCov);
signalStrandsWindow.set(offset,signalFwdStrands);
controlStrandsWindow.set(offset,controlFwdStrands);
return 1;
}
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
public Integer reduceInit() {
return 0; //To change body of implemented methods use File | Settings | File Templates.
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return accumulator with result of the map taken into account.
*/
public Integer reduce(Integer value, Integer sum) {
return sum+value; //To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void onTraversalDone(Integer result) {
printRegion();
super.onTraversalDone(result);
}
/** Checks if the currently held window satisfies the conditions set up for significance, and invokes buffered printout if so.
* If the parameter is set to true, printout of previously held region is forced, and the buffer is reinitialized with
* the new window if it passes the cutoffs, or left empty.
*
*/
private void checkCurrentWindow(boolean force) {
if ( force ) printRegion();
if ( signalWindow.get(0) == null && controlWindow.get(0) == null ) return; // do not emit windows that start from empty cell; we will get them later
if ( totalControlCoverage.first * ENRICHMENT_CUTOFF / 36.0 < MIN_SIGNAL ) { // control coverage zero or too low
if ( totalSignalCoverage.first /28.0 > MIN_SIGNAL ) emitWindow(false); // require at least MIN_SIGNAL coverage for signal
return;
}
// if we have decent coverage in control, just check for required enrichment in the signal
if ( ((double)totalSignalCoverage.first/28.0) / (totalControlCoverage.first/36.0) > ENRICHMENT_CUTOFF ) emitWindow(false);
}
/** This is actually a delayed print command: it buffers the successive windows set for printout, merges the windows that
* are close enough and prints only when a train of close-by windows has ended and next window received is far enough
*/
private void emitWindow(boolean force) {
if ( regionStart == -1 ) {
resetBuffer();
return;
}
if ( force || windowStart > lastWindowStart + WINDOW_SIZE ) {
// new window is far enough from the region we were buffering: emit old region
printRegion();
resetBuffer();
return;
}
// current window is too close (overlapping) with a previous one: we need to merge
lastWindowStart = windowStart;
maxSignalReads.first = Math.max(maxSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
maxSignalReads.second = Math.max(maxSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
minSignalReads.first = Math.min(minSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0));
minSignalReads.second = Math.min(minSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0));
maxControlReads.first = Math.max(maxControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
maxControlReads.second = Math.max(maxControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
minControlReads.first = Math.min(minControlReads.first,(int)Math.round(totalControlCoverage.first/36.0));
minControlReads.second = Math.min(minControlReads.second,(int)Math.round(totalControlCoverage.second/36.0));
maxEnrichmentUnique = Math.max(maxEnrichmentUnique,((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
minEnrichmentUnique = Math.min(minEnrichmentUnique, ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0));
maxEnrichmentNonUnique = Math.max(maxEnrichmentNonUnique,((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0));
minEnrichmentNonUnique = Math.min( minEnrichmentNonUnique, ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0) );
maxEnrichmentTotal = Math.max( maxEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
minEnrichmentTotal = Math.min( minEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
((totalControlCoverage.first+ totalControlCoverage.second)/36.0) );
maxUniqueSignalStrandBalance = Math.max(maxUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
minUniqueSignalStrandBalance = Math.min(minUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first);
maxNonUniqueSignalStrandBalance = Math.max(maxNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
minNonUniqueSignalStrandBalance = Math.min(minNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second);
maxUniqueControlStrandBalance = Math.max(maxUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
minUniqueControlStrandBalance = Math.min(minUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first);
maxNonUniqueControlStrandBalance = Math.max(maxNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
minNonUniqueControlStrandBalance = Math.min(minNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second);
}
private void resetBuffer() {
regionStart = windowStart;
lastWindowStart = windowStart;
maxSignalReads.first = (int)Math.round(totalSignalCoverage.first/28.0);
maxSignalReads.second = (int)Math.round(totalSignalCoverage.second/28.0);
minSignalReads.assignFrom(maxSignalReads);
maxControlReads.first = (int)Math.round(totalControlCoverage.first/36.0);
maxControlReads.second = (int)Math.round(totalControlCoverage.second/36.0);
minControlReads.assignFrom(maxControlReads);
minEnrichmentUnique = maxEnrichmentUnique = ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0);
minEnrichmentNonUnique = maxEnrichmentNonUnique = ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0);
minEnrichmentTotal = maxEnrichmentTotal = ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/
((totalControlCoverage.first+ totalControlCoverage.second)/36.0);
minUniqueSignalStrandBalance = maxUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.first)/totalSignalCoverage.first;
minNonUniqueSignalStrandBalance = maxNonUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.second)/totalSignalCoverage.second;
minUniqueControlStrandBalance = maxUniqueControlStrandBalance = ((double)totalControlFwdStrands.first)/totalControlCoverage.first;
minNonUniqueControlStrandBalance = maxNonUniqueControlStrandBalance = ((double)totalControlFwdStrands.second)/totalControlCoverage.second;
}
private void printRegion() {
if ( regionStart == -1 ) return;
out.print(curContigName+":"+regionStart+"-"+windowStop+"\t"+(windowStop-regionStart+1) +"\t"+
minSignalReads.first+"-"+maxSignalReads.first+"\t"+
minSignalReads.second+"-"+maxSignalReads.second+"\t"+
minControlReads.first+"-"+maxControlReads.first+"\t"+
minControlReads.second+"-"+maxControlReads.second+"\t");
out.printf("%.2f-%.2f\t",minEnrichmentUnique,maxEnrichmentUnique);
out.printf("%.2f-%.2f\t",minEnrichmentNonUnique,maxEnrichmentNonUnique);
out.printf("%.2f-%.2f\t",minEnrichmentTotal,maxEnrichmentTotal);
out.printf("%.2f-%.2f\t",minUniqueSignalStrandBalance,maxUniqueSignalStrandBalance);
out.printf("%.2f-%.2f\t",minNonUniqueSignalStrandBalance,maxNonUniqueSignalStrandBalance);
out.printf("%.2f-%.2f\t",minUniqueControlStrandBalance,maxUniqueControlStrandBalance);
out.printf("%.2f-%.2f",minNonUniqueControlStrandBalance,maxNonUniqueControlStrandBalance);
if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER");
out.println();
regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty
}
}

View File

@ -0,0 +1,441 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import net.sf.samtools.SAMRecord;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Jan 3, 2010
* Time: 1:58:38 PM
* To change this template use File | Settings | File Templates.
*/
public class DSBWalkerV3 extends ReadWalker<Integer,Integer> {
@Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true)
int WINDOW_SIZE = 100;
@Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true)
double ENRICHMENT_CUTOFF = 5.0;
@Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+
"(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+
"so that control*enrichmentCutoff is too low to be convincing)",required=true)
int MIN_SIGNAL = 10;
@Argument(fullName="coverageFactor",shortName="cf",doc="Total number of uniquely mapped signal reads/total number of uniquely mapped control reads",required=false)
double COVERAGE_FACTOR=1.0;
@Argument(fullName="coverageFactorNU",shortName="cfnu",doc="Total number of non-uniquely mapped signal reads/total number of non-uniquely mapped control reads",required=false)
double COVERAGE_FACTOR_NU=1.0;
private Set<String> signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able
private Set<String> controlReadGroups ; // to properly assign the reads coming from a merged stream
private GenomeLoc currentWindow = null;
private String currentContig = "chrM";
private LinkedList<SAMRecord> readsInSignalWindow = null;
private LinkedList<SAMRecord> readsInControlWindow = null;
private WindowStats signalCountsInCurrWindow = new WindowStats();
private WindowStats controlCountsInCurrWindow = new WindowStats();
// following variables are used by emitWindow to buffer adjacent windows
private int MERGE_CUTOFF = -1;
private long regionStart = -1;
private long lastWindowStart = -1;
private int addedSinceLastEmit = 0; // how many sliding window steps where buffered since the last emit (i.e. since the last window that really passed significance criteria)
// buffered read count stats for the windows inside the currently held merged print region:
private List<WindowStats> signalReadCountsBuffer = new ArrayList<WindowStats>(1000);
private List<WindowStats> controlReadCountsBuffer = new ArrayList<WindowStats>(1000);
/** Clears buffered reads and all counts. DOES NOT clear buffered print region */
private void resetWindows() {
readsInSignalWindow.clear();
readsInControlWindow.clear();
signalCountsInCurrWindow.clear();
controlCountsInCurrWindow.clear();
}
private void addSignal(SAMRecord read) {
readsInSignalWindow.add(read);
signalCountsInCurrWindow.addRead(read);
}
private void addControl(SAMRecord read) {
readsInControlWindow.add(read);
controlCountsInCurrWindow.addRead(read);
}
/** Discard signal reads that start strictly before the specified position and
* update associated counts
* @param pos
*/
private void purgeSignal(long pos) {
Iterator<SAMRecord> it = readsInSignalWindow.iterator();
while ( it.hasNext() ) {
SAMRecord r = it.next();
if ( r.getAlignmentStart() >= pos ) return; // we are done
// read starts before pos: discard it and update the counts:
signalCountsInCurrWindow.removeRead(r);
it.remove();
}
}
/** Discard signal reads that start strictly before the specified position and
* update associated counts
* @param pos
*/
private void purgeControl(long pos) {
Iterator<SAMRecord> it = readsInControlWindow.iterator();
while ( it.hasNext() ) {
SAMRecord r = it.next();
if ( r.getAlignmentStart() >= pos ) return; // we are done
// read starts before pos: discard it and update the counts:
controlCountsInCurrWindow.removeRead(r);
it.remove();
}
}
private void resetWindowMergingBuffer(long start) {
regionStart = start;
lastWindowStart = start;
signalReadCountsBuffer.clear();
controlReadCountsBuffer.clear();
signalReadCountsBuffer.add(signalCountsInCurrWindow.clone());
controlReadCountsBuffer.add(controlCountsInCurrWindow.clone());
}
/** Delayed print: the window starting at 'start' will be added to the print buffer; if the window is close enough
* to the current contents if the buffer, the addition will result in merging the window with the buffer;
* otherwise, the old contents of the buffer will be printed and the buffer will be re-initialized with new window.
* It is assumed that counters are in synch with the start position passed to this method.
* @param start
*/
private void emitWindow(long start) {
// System.out.println("Emitting at "+start);
if ( regionStart == -1 ) { // we did not keep any region so far; initialize the buffer and return, will print later
resetWindowMergingBuffer(start);
addedSinceLastEmit = 0;
return;
}
if ( start > lastWindowStart + MERGE_CUTOFF ) {
// this loop is a dummy: we have already cleared those unneeded
// counts in shiftWindows(); stays here to avoid generating bugs later
// if we change something in shiftWindows()
for ( ; addedSinceLastEmit > 0 ; addedSinceLastEmit-- ) {
signalReadCountsBuffer.remove(signalReadCountsBuffer.size()-1);
controlReadCountsBuffer.remove(controlReadCountsBuffer.size()-1);
}
printRegion();
resetWindowMergingBuffer(start);
return;
}
// the current window is too close to the previous one: we have to merge;
// NOTE: if window is too close, bufferAccepts() returned true, so the counts are already
// added.
lastWindowStart = start;
addedSinceLastEmit = 0;
// signalReadCountsBuffer.add(uniqueSignalReads);
// controlReadCountsBuffer.add(uniqueControlReads);
}
private boolean bufferAccepts(long pos) {
return ( regionStart != -1 && pos <= lastWindowStart+MERGE_CUTOFF);
}
private void printRegion() {
if ( regionStart == -1 ) return;
long regionStop = lastWindowStart+WINDOW_SIZE-1;
double[] tmpEnrU = new double[signalReadCountsBuffer.size()];
int[] tmpSignalU = new int[signalReadCountsBuffer.size()];
int[] tmpControlU = new int[signalReadCountsBuffer.size()];
double[] tmpEnrNU = new double[signalReadCountsBuffer.size()];
int[] tmpSignalNU = new int[signalReadCountsBuffer.size()];
int[] tmpControlNU = new int[signalReadCountsBuffer.size()];
double[] tmpFWDSignalFracU = new double[signalReadCountsBuffer.size()];
double[] tmpFWDControlFracU = new double[signalReadCountsBuffer.size()];
double[] tmpFWDSignalFracNU = new double[signalReadCountsBuffer.size()];
double[] tmpFWDControlFracNU = new double[signalReadCountsBuffer.size()];
int lastInd = signalReadCountsBuffer.size() - 1;
// out.println("Size="+signalReadCountsBuffer.size()+":");
for ( int i = 0 ; i <= lastInd ; i++ ) {
tmpEnrU[i]= ( ((double) signalReadCountsBuffer.get(i).uniqueReads) / (controlReadCountsBuffer.get(i).uniqueReads+1.0 ) ) / COVERAGE_FACTOR ;
tmpSignalU[i] = signalReadCountsBuffer.get(i).uniqueReads;
tmpControlU[i] = controlReadCountsBuffer.get(i).uniqueReads;
tmpEnrNU[i]= ( ((double) signalReadCountsBuffer.get(i).nonUniqueReads) / (controlReadCountsBuffer.get(i).nonUniqueReads+1.0 ) ) / COVERAGE_FACTOR_NU ;
tmpSignalNU[i] = signalReadCountsBuffer.get(i).nonUniqueReads;
tmpControlNU[i] = controlReadCountsBuffer.get(i).nonUniqueReads;
tmpFWDSignalFracU[i] = signalReadCountsBuffer.get(i).uniqueReads > 0 ? ( ((double)signalReadCountsBuffer.get(i).uniqueFWDReads) / signalReadCountsBuffer.get(i).uniqueReads ) : 0.5;
tmpFWDControlFracU[i] = controlReadCountsBuffer.get(i).uniqueReads > 0 ? ( ((double)controlReadCountsBuffer.get(i).uniqueFWDReads) / controlReadCountsBuffer.get(i).uniqueReads ) : 0.5;
tmpFWDSignalFracNU[i] = signalReadCountsBuffer.get(i).nonUniqueReads > 0 ? ( ((double)signalReadCountsBuffer.get(i).nonUniqueFWDReads) / signalReadCountsBuffer.get(i).nonUniqueReads ) : 0.5;
tmpFWDControlFracNU[i] = controlReadCountsBuffer.get(i).nonUniqueReads > 0 ? ( ((double)controlReadCountsBuffer.get(i).nonUniqueFWDReads) / controlReadCountsBuffer.get(i).nonUniqueReads ) : 0.5;
}
Arrays.sort(tmpEnrU);
Arrays.sort(tmpSignalU);
Arrays.sort(tmpControlU);
Arrays.sort(tmpEnrNU);
Arrays.sort(tmpSignalNU);
Arrays.sort(tmpControlNU);
Arrays.sort(tmpFWDSignalFracU);
Arrays.sort(tmpFWDControlFracU);
Arrays.sort(tmpFWDSignalFracNU);
Arrays.sort(tmpFWDControlFracNU);
out.print(currentContig+":"+regionStart+"-"+regionStop+"\t"+
(regionStop-regionStart+1) +"\t"+
"signal_unique:"+ tmpSignalU[0]+"-"+ tmpSignalU[lastInd/2]+"-"+ tmpSignalU[lastInd]+"\t"+
"control_unique:"+ tmpControlU[0]+"-"+ tmpControlU[lastInd/2]+"-"+ tmpControlU[lastInd]);
out.printf("\tsignal_fwd_frac_unique:%.1f-%.1f-%.1f",tmpFWDSignalFracU[0],tmpFWDSignalFracU[lastInd/2],tmpFWDSignalFracU[lastInd]);
out.printf("\tcontrol_fwd_frac_unique:%.1f-%.1f-%.1f",tmpFWDControlFracU[0],tmpFWDControlFracU[lastInd/2],tmpFWDControlFracU[lastInd]);
out.print("\tsignal_nonnunique:"+ tmpSignalNU[0]+"-"+ tmpSignalNU[lastInd/2]+"-"+ tmpSignalNU[lastInd]+"\t"+
"control_nonunique:"+ tmpControlNU[0]+"-"+ tmpControlNU[lastInd/2]+"-"+ tmpControlNU[lastInd]);
out.printf("\tsignal_fwd_frac_nonunique:%.1f-%.1f-%.1f",tmpFWDSignalFracNU[0],tmpFWDSignalFracNU[lastInd/2],tmpFWDSignalFracNU[lastInd]);
out.printf("\tcontrol_fwd_frac_nonunique:%.1f-%.1f-%.1f",tmpFWDControlFracNU[0],tmpFWDControlFracNU[lastInd/2],tmpFWDControlFracNU[lastInd]);
out.printf("\tnorm_enrichment_unique:%.2f-%.2f-%.2f",tmpEnrU[0],tmpEnrU[lastInd/2],tmpEnrU[lastInd]);
out.printf("\tnorm_enrichment_nonunique:%.2f-%.2f-%.2f",tmpEnrNU[0],tmpEnrNU[lastInd/2],tmpEnrNU[lastInd]);
// if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER");
out.println();
regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty
// System.exit(1);
}
private void updateWindowMergingBuffer(long i) {
if ( bufferAccepts(i) ) {
// we are not too far away from last window added to the buffer that actually passed significance criteria;
// in this case we have to keep buffering since another significant window may be encountered soon
// System.out.println("Updating buffer at "+i+" with "+ uniqueSignalReads);
signalReadCountsBuffer.add(signalCountsInCurrWindow.clone());
controlReadCountsBuffer.add(controlCountsInCurrWindow.clone());
addedSinceLastEmit++;
} else {
// we are too far from the last significant window; if another significant window comes later, it will not
// be merged into this region but will start a new one. In this case we have to erase all the counts we have been
// saving since the last significant window (the latter is where the current region is going to end!)
for ( ; addedSinceLastEmit > 0 ; addedSinceLastEmit-- ) {
signalReadCountsBuffer.remove(signalReadCountsBuffer.size()-1);
controlReadCountsBuffer.remove(controlReadCountsBuffer.size()-1);
}
printRegion(); // print current region right away, why not? next significant window will start new region for sure.
}
}
private void shiftWindows(long pos) {
// we shift windows when there is a read that does not fit into the current window.
// the position, to which the shift is performed, is the first position such that the new read
// can be accomodated. Hence we can safely slide up to pos, only discarding reads that go out of scope -
// we are guaranteed that there will be no new reads to add until we reach pos.
for ( long i = currentWindow.getStart() ; i < pos ; i++ ) {
// if ( readsInSignalWindow.size() == 0 ) {
// i = pos-1;
// continue;
// };
// if ( readsInSignalWindow.getFirst().getAlignmentStart() > i ) {
// i = readsInSignalWindow.getFirst().getAlignmentStart() - 1; // jump directly to next read position
// continue;
// }
purgeSignal(i); // remove all the reads that start before current position i (and update all the counters)
purgeControl(i);
updateWindowMergingBuffer(i);
if ( ( controlCountsInCurrWindow.uniqueReads + 1 ) * ENRICHMENT_CUTOFF < MIN_SIGNAL ) {
// too few control reads
if ( signalCountsInCurrWindow.uniqueReads >= MIN_SIGNAL ) {
// emit signal only if it is higher that hard cut-off:
emitWindow(i); // print current window (print can be buffered and delayed!)
}
} else {
// enough control reads;
// check for actual enrichment:
if ( ((double) signalCountsInCurrWindow.uniqueReads) / (controlCountsInCurrWindow.uniqueReads+1.0) > ENRICHMENT_CUTOFF ) {
emitWindow(i); // print current window (print can be buffered and delayed!)
}
}
}
// we emitted intermediate windows up to pos-1 as/if needed and purged everything that starts before pos-1
// now we have to purge everything that starts before pos and return (no emitting yet, as we are about to add a read upon return):
purgeSignal(pos);
purgeControl(pos);
currentWindow = GenomeLocParser.createGenomeLoc(currentWindow.getContigIndex(),pos,pos+WINDOW_SIZE-1);
}
@Override
public void initialize() {
int nSams = getToolkit().getArguments().samFiles.size();
if ( nSams != 2 ) {
out.println("ERROR: two input bam files (signal and backround control) must be specified");
System.exit(1);
}
List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
signalReadGroups = readGroupSets.get(0);
// System.out.println(signalReadGroups.size()+" read groups in signal");
controlReadGroups = readGroupSets.get(1);
// System.out.println(controlReadGroups.size()+" read groups in control");
currentWindow = GenomeLocParser.createGenomeLoc(0,1,WINDOW_SIZE);
readsInSignalWindow = new LinkedList<SAMRecord>();
readsInControlWindow = new LinkedList<SAMRecord>();
MERGE_CUTOFF = WINDOW_SIZE;
ENRICHMENT_CUTOFF *= COVERAGE_FACTOR;
currentContig = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(0).getSequenceName();
}
public Integer map(char[] ref, SAMRecord read) {
if ( AlignmentUtils.isReadUnmapped(read) ) return 0;
if ( read.getReferenceIndex() > currentWindow.getContigIndex() ) {
printRegion(); // print all we had on the previous contig
currentWindow = GenomeLocParser.createGenomeLoc(read.getReferenceIndex(),
read.getAlignmentStart(),
read.getAlignmentStart()+WINDOW_SIZE-1);
currentContig = read.getReferenceName();
resetWindows();
} else {
// we are on the same contig
if ( read.getAlignmentEnd() > currentWindow.getStop() ) {
// can not accomodate the read inside the current window - shift!
// System.out.println("read ends at "+read.getAlignmentEnd()+" window ends at "+currentWindow.getStop()+ " shifting to "+ (currentWindow.getStart() + ( read.getAlignmentEnd() - currentWindow.getStop() )) +" ("+uniqueSignalReads+"/"+uniqueControlReads+")");
// while shifting the window, the following method will issue (delayed) print commands for
// all intermediate windows that pass significance criteria:
shiftWindows(currentWindow.getStart() + ( read.getAlignmentEnd() - currentWindow.getStop() ));
}
// now the read will fit into the window
}
// at this point we are guaranteed that the read will fit into the window
if ( signalReadGroups.contains( read.getReadGroup().getReadGroupId() ) ) {
addSignal(read);
} else if ( controlReadGroups.contains( read.getReadGroup().getReadGroupId() )) {
addControl(read);
} else {
throw new StingException("Read "+read + " belongs to unrecognized read group");
}
return 1;
}
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
public Integer reduceInit() {
return 0; //To change body of implemented methods use File | Settings | File Templates.
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return accumulator with result of the map taken into account.
*/
public Integer reduce(Integer value, Integer sum) {
return value+sum; //To change body of implemented methods use File | Settings | File Templates.
}
/** Auxiliary class that encapsulates the task of monitoring counts of various read traits in some set of reads
* (for instance, reads in the current window). Counted traits include uniquely/non-uniquely mapped reads,
* forward-strand aligned reads etc.
*/
class WindowStats implements Cloneable {
public int uniqueReads = 0;
public int nonUniqueReads = 0;
public int uniqueFWDReads = 0;
public int nonUniqueFWDReads = 0;
/** Reset all counts to 0 */
public void clear() {
uniqueReads = nonUniqueReads = uniqueFWDReads = nonUniqueFWDReads = 0;
}
/** Examines the read and increments the counts for all the monitored traits observed in this read. */
public void addRead(SAMRecord r) {
if ( r.getMappingQuality() == 0 ) {
// nonunique
nonUniqueReads++;
if ( ! r.getReadNegativeStrandFlag() ) nonUniqueFWDReads++;
} else {
// unique
uniqueReads++;
if ( ! r.getReadNegativeStrandFlag() ) uniqueFWDReads++;
}
}
/** Examines the read and decrements the counts for all the monitored traits observed in this read. */
public void removeRead(SAMRecord r) {
if ( r.getMappingQuality() == 0 ) {
// nonunique
nonUniqueReads--;
if ( ! r.getReadNegativeStrandFlag() ) nonUniqueFWDReads--;
}
else {
// unique
uniqueReads--;
if ( ! r.getReadNegativeStrandFlag() ) uniqueFWDReads--;
}
}
/** allocates new object, copies this object into it, and returns the copy */
public WindowStats clone() {
WindowStats ret = new WindowStats();
ret.uniqueReads = this.uniqueReads;
ret.nonUniqueReads = this.nonUniqueReads;
ret.uniqueFWDReads = this.uniqueFWDReads;
ret.nonUniqueFWDReads = this.nonUniqueFWDReads;
return ret;
}
}
}