diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalker.java new file mode 100644 index 000000000..71c5c26dc --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalker.java @@ -0,0 +1,167 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import net.sf.samtools.SAMRecord; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Dec 3, 2009 + * Time: 11:54:35 AM + * To change this template use File | Settings | File Templates. + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) + +public class DSBWalker extends LocusWalker { + @Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true) + int COV_CUTOFF = 0; + @Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false) + int MINLENGTH_CUTOFF = 0; + + private int MERGE_DIST = 300; // merge intervals that are closer than this distance from one another + + private long maxcov = 0; + private long maxz = 0; + private long mergedmaxcov = 0; + private long mergedmaxz = 0; + GenomeLoc mergedInterval = null; + GenomeLoc currentInterval = null; + + private long nIntervals = 0; + + private void emit(GenomeLoc l) { + if ( mergedInterval == null ) { + mergedInterval = l.clone(); + mergedmaxcov = maxcov; + mergedmaxz = maxz; + return; + } + + if ( mergedInterval.getContigIndex() != l.getContigIndex() ) { + long length = mergedInterval.getStop()-mergedInterval.getStart()+1; + if ( length >= MINLENGTH_CUTOFF ) { + out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval + nIntervals++; + } + mergedInterval = l.clone(); + mergedmaxcov = maxcov; + mergedmaxz = maxz; + return; + } + + // merged interval exists and new interval is on the same contig. Check if the new interval + // is close enough so we got to merge and keep waiting: + + if ( l.getStart() - mergedInterval.getStop() < MERGE_DIST ) { + mergedInterval = GenomeLocParser.setStop(mergedInterval,l.getStop()); + if ( maxcov > mergedmaxcov) mergedmaxcov = maxcov; + if ( maxz > mergedmaxz ) mergedmaxz = maxz; + return; + } + + // nope, new interval is far enough. Print old one and keep current one. + + long length = mergedInterval.getStop()-mergedInterval.getStart()+1; + if ( length >= MINLENGTH_CUTOFF ) { + out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval + nIntervals++; + } + mergedInterval = l.clone(); + mergedmaxcov = maxcov; + mergedmaxz = maxz; + + } + + public void onTraversalDone() { + if ( mergedInterval != null ) { + long length = mergedInterval.getStop()-mergedInterval.getStart()+1; + if ( length >= MINLENGTH_CUTOFF ) { + out.println(mergedInterval+"\t"+length+"\t"+mergedmaxcov+"\t"+mergedmaxz); // eject old interval + nIntervals++; + } + } + System.out.println(nIntervals+" intervals detected."); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + ReadBackedPileup pileup = context.getPileup(); + List reads = pileup.getReads(); + + int nZero = pileup.getNumberOfMappingQualityZeroReads(); + + int nonZCoverage = reads.size() - nZero; + + if ( nonZCoverage >= COV_CUTOFF ) { + + // if we were not inside an interval, start one: + if ( currentInterval == null ) { + maxcov = nonZCoverage; + maxz = nZero; + currentInterval = context.getLocation().clone(); +// System.out.println("Setting current to "+currentInterval); + return 0; + } + + // if we were inside an interval and we just jumped onto a new contig, get rid of the old interval + if ( currentInterval.compareContigs(context.getLocation()) != 0 ) { + // we just moved to a new contig + System.out.println("On contig "+context.getLocation().getContig()); + emit(currentInterval); + maxcov = nonZCoverage; + maxz = nZero; + currentInterval = context.getLocation().clone(); + return 0; + } + + // we are on the same contig, we are within the interval, so we need to extend the current interval: + currentInterval = GenomeLocParser.setStop(currentInterval,context.getLocation().getStop()); // still within the interval, adjust stop + //System.out.println("Extending current to "+currentInterval +" ("+context.getLocation()+", "+context.getLocation().getStop()+")"); + if ( nonZCoverage > maxcov ) maxcov = nonZCoverage; // adjust maxcov + if ( nZero > maxz ) maxz = nZero; // adjust maxz + } else { + // low coverage, if we were inside an interval, it stops now: + if ( currentInterval != null ) { + // System.out.println("Emitting current as "+currentInterval); + emit(currentInterval); + currentInterval = null; + maxcov = 0; + maxz = 0; + } + } + + return 0; + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + public Integer reduceInit() { + return 0; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public Integer reduce(Integer value, Integer sum) { + return sum+value; //To change body of implemented methods use File | Settings | File Templates. + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalkerV2.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalkerV2.java new file mode 100644 index 000000000..cf7677ac6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalkerV2.java @@ -0,0 +1,360 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.CircularArray; +import org.broadinstitute.sting.utils.PrimitivePair; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import net.sf.samtools.SAMRecord; + +import java.util.List; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Dec 12, 2009 + * Time: 2:25:44 PM + * To change this template use File | Settings | File Templates. + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) + +public class DSBWalkerV2 extends LocusWalker { +// @Argument(fullName="coverage",shortName="C",doc="Regions with coverage above specified threshold will be reported",required=true) +// int COV_CUTOFF = 0; +// @Argument(fullName="minLength",shortName="ml",doc="Only regions longer than the specified value will be reported",required=false) +// int MINLENGTH_CUTOFF = 0; + @Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true) + int WINDOW_SIZE = 100; + @Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true) + double ENRICHMENT_CUTOFF = 5.0; + @Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+ + "(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+ + "so that control*enrichmentCutoff is too low to be convincing)",required=true) + int MIN_SIGNAL = 10; + + private CircularArray signalWindow = null; + private CircularArray controlWindow = null; + private CircularArray signalStrandsWindow = null; + private CircularArray controlStrandsWindow = null; + + private PrimitivePair.Long totalSignalCoverage = new PrimitivePair.Long(); + private PrimitivePair.Long totalControlCoverage = new PrimitivePair.Long(); + private PrimitivePair.Long totalSignalFwdStrands = new PrimitivePair.Long(); + private PrimitivePair.Long totalControlFwdStrands = new PrimitivePair.Long(); + + private Set signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able + private Set controlReadGroups ; // to properly assign the reads coming from a merged stream + + private long windowStart = -1; + private long windowStop = -1; + private int curContig = -1; + private String curContigName = ""; + + // the following variables are for buffering and merging windows : + private long regionStart = -1; + private long lastWindowStart = -1; + private PrimitivePair.Int maxSignalReads = new PrimitivePair.Int(); + private PrimitivePair.Int minSignalReads = new PrimitivePair.Int(); + private PrimitivePair.Int maxControlReads = new PrimitivePair.Int(); + private PrimitivePair.Int minControlReads = new PrimitivePair.Int(); + private double minEnrichmentUnique; + private double maxEnrichmentUnique; + private double minEnrichmentNonUnique; + private double maxEnrichmentNonUnique; + private double minEnrichmentTotal; + private double maxEnrichmentTotal; + private double minUniqueSignalStrandBalance = 0.0; + private double maxUniqueSignalStrandBalance = 0.0; + private double minNonUniqueSignalStrandBalance = 0.0; + private double maxNonUniqueSignalStrandBalance = 0.0; + private double minUniqueControlStrandBalance = 0.0; + private double maxUniqueControlStrandBalance = 0.0; + private double minNonUniqueControlStrandBalance = 0.0; + private double maxNonUniqueControlStrandBalance = 0.0; + + @Override + public void initialize() { + int nSams = getToolkit().getArguments().samFiles.size(); + + if ( nSams != 2 ) { + out.println("ERROR: two input bam files (signal and backround control) must be specified"); + System.exit(1); + } + List> readGroupSets = getToolkit().getMergedReadGroupsByReaders(); + signalReadGroups = readGroupSets.get(0); +// System.out.println(signalReadGroups.size()+" read groups in signal"); + controlReadGroups = readGroupSets.get(1); +// System.out.println(controlReadGroups.size()+" read groups in control"); + signalWindow = new CircularArray(WINDOW_SIZE); + controlWindow = new CircularArray(WINDOW_SIZE); + signalStrandsWindow = new CircularArray(WINDOW_SIZE); + controlStrandsWindow = new CircularArray(WINDOW_SIZE); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + ReadBackedPileup pileup = context.getPileup(); + List reads = pileup.getReads(); + + // compute coverages at the current site: + PrimitivePair.Int signalCov = new PrimitivePair.Int(); + PrimitivePair.Int controlCov = new PrimitivePair.Int(); + PrimitivePair.Int signalFwdStrands = new PrimitivePair.Int(); + PrimitivePair.Int controlFwdStrands = new PrimitivePair.Int(); + + for ( SAMRecord r : reads ) { + if ( signalReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) { + if ( r.getMappingQuality() == 0 ) { + signalCov.second++; + if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.second++; + } + else { + signalCov.first++; + if ( ! r.getReadNegativeStrandFlag() ) signalFwdStrands.first++; + } + } else { + if ( controlReadGroups.contains( r.getReadGroup().getReadGroupId() ) ) { + if ( r.getMappingQuality() == 0 ) { + controlCov.second++; + if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.second++; + } + else { + controlCov.first++; + if ( ! r.getReadNegativeStrandFlag() ) controlFwdStrands.first++; + } + } else { + throw new StingException("Read "+r+" belongs to unknown read group ("+r.getReadGroup()+")"); + } + } + } + + GenomeLoc loc = context.getLocation(); + + // if ( curContig != 0 ) System.out.println(loc+" "+signalCov.first+" "+signalCov.second+" "+controlCov.first+" "+controlCov.second); + + if ( loc.getContigIndex() != curContig || loc.getStart() >= windowStop+WINDOW_SIZE ) { + // we jumped to the next contig, or we are on the same contig but the current position is + // more than WINDOW_SIZE away from the current window's end (i.e. there's nothing to shift) + checkCurrentWindow(true); + + if ( loc.getContigIndex() != curContig ) { + System.out.println("on contig "+loc.getContig()); + } + curContig = loc.getContigIndex(); + curContigName = loc.getContig(); +// prevPos = loc.getStart(); + windowStart = loc.getStart(); + windowStop = windowStart + WINDOW_SIZE - 1; + signalWindow.clear(); + controlWindow.clear(); + totalSignalCoverage.assignFrom( signalCov ); + totalControlCoverage.assignFrom( controlCov ); + totalSignalFwdStrands.assignFrom( signalFwdStrands ); + totalControlFwdStrands.assignFrom( controlFwdStrands ); + signalWindow.set(0,signalCov); + controlWindow.set(0,controlCov); + signalStrandsWindow.set(0,signalFwdStrands); + controlStrandsWindow.set(0,controlFwdStrands); + return 1; + } + + // offset of the current position w.r.t. the start of the window: + int offset = (int)(loc.getStart() - windowStart); + + if ( offset >= WINDOW_SIZE ) { + // if we are here, the current position is outside of the current window, but not + // far enough so that we'd need to reinitialize the window from scratch (that was already checked above). + // Now we need to shift. + + // We are receiving covered positions in order, so we are guaranteed that everything prior to + // the current position was already counted; if some elements of the windows are still nulls, it means + // there was no coverage there + + int shift = offset - WINDOW_SIZE + 1; + + // scroll the window(s) base by base until the current position is inside the window. At each step + // we will check if the window meets the requirements and should be printed out. + for ( int i = 0 ; i < shift ; i++ ) { + + // we are going to shift; check if the window as it is now is worth printing + checkCurrentWindow(false); + + // discard coverage from the first element of the window (this element is about to be shifted out of scope) + if ( signalWindow.get(0) != null ) totalSignalCoverage.subtract(signalWindow.get(0)); + if ( signalStrandsWindow.get(0) != null ) totalSignalFwdStrands.subtract(signalStrandsWindow.get(0)); + + if ( controlWindow.get(0) != null ) totalControlCoverage.subtract(controlWindow.get(0)); + if ( controlStrandsWindow.get(0) != null ) totalControlFwdStrands.subtract(controlStrandsWindow.get(0)); + + // advnace window coordinates on the ref + windowStart++; + windowStop++; + + // shift the data in the window(s): + signalWindow.shiftData(1); + controlWindow.shiftData(1); + signalStrandsWindow.shiftData(1); + controlStrandsWindow.shiftData(1); + + offset--; // this is the new offset w.r.t. to the shifted window + } + + } + + // at this point, either the current position was inside the current window, or it was outside, + // but the window was already shifted + totalSignalCoverage.add(signalCov); + totalControlCoverage.add(controlCov); + totalSignalFwdStrands.add(signalFwdStrands); + totalControlFwdStrands.add(controlFwdStrands); + signalWindow.set(offset,signalCov); + controlWindow.set(offset,controlCov); + signalStrandsWindow.set(offset,signalFwdStrands); + controlStrandsWindow.set(offset,controlFwdStrands); + return 1; + } + + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + public Integer reduceInit() { + return 0; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public Integer reduce(Integer value, Integer sum) { + return sum+value; //To change body of implemented methods use File | Settings | File Templates. + } + + @Override + public void onTraversalDone(Integer result) { + printRegion(); + super.onTraversalDone(result); + } + + /** Checks if the currently held window satisfies the conditions set up for significance, and invokes buffered printout if so. + * If the parameter is set to true, printout of previously held region is forced, and the buffer is reinitialized with + * the new window if it passes the cutoffs, or left empty. + * + */ + private void checkCurrentWindow(boolean force) { + if ( force ) printRegion(); + if ( signalWindow.get(0) == null && controlWindow.get(0) == null ) return; // do not emit windows that start from empty cell; we will get them later + if ( totalControlCoverage.first * ENRICHMENT_CUTOFF / 36.0 < MIN_SIGNAL ) { // control coverage zero or too low + if ( totalSignalCoverage.first /28.0 > MIN_SIGNAL ) emitWindow(false); // require at least MIN_SIGNAL coverage for signal + return; + } + + // if we have decent coverage in control, just check for required enrichment in the signal + if ( ((double)totalSignalCoverage.first/28.0) / (totalControlCoverage.first/36.0) > ENRICHMENT_CUTOFF ) emitWindow(false); + } + + /** This is actually a delayed print command: it buffers the successive windows set for printout, merges the windows that + * are close enough and prints only when a train of close-by windows has ended and next window received is far enough + */ + private void emitWindow(boolean force) { + + if ( regionStart == -1 ) { + resetBuffer(); + return; + } + + if ( force || windowStart > lastWindowStart + WINDOW_SIZE ) { + // new window is far enough from the region we were buffering: emit old region + + printRegion(); + resetBuffer(); + return; + } + + // current window is too close (overlapping) with a previous one: we need to merge + + lastWindowStart = windowStart; + maxSignalReads.first = Math.max(maxSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0)); + maxSignalReads.second = Math.max(maxSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0)); + minSignalReads.first = Math.min(minSignalReads.first, (int)Math.round(totalSignalCoverage.first/28.0)); + minSignalReads.second = Math.min(minSignalReads.second,(int)Math.round(totalSignalCoverage.second/28.0)); + maxControlReads.first = Math.max(maxControlReads.first,(int)Math.round(totalControlCoverage.first/36.0)); + maxControlReads.second = Math.max(maxControlReads.second,(int)Math.round(totalControlCoverage.second/36.0)); + minControlReads.first = Math.min(minControlReads.first,(int)Math.round(totalControlCoverage.first/36.0)); + minControlReads.second = Math.min(minControlReads.second,(int)Math.round(totalControlCoverage.second/36.0)); + maxEnrichmentUnique = Math.max(maxEnrichmentUnique,((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0)); + minEnrichmentUnique = Math.min(minEnrichmentUnique, ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0)); + maxEnrichmentNonUnique = Math.max(maxEnrichmentNonUnique,((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0)); + minEnrichmentNonUnique = Math.min( minEnrichmentNonUnique, ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0) ); + maxEnrichmentTotal = Math.max( maxEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/ + ((totalControlCoverage.first+ totalControlCoverage.second)/36.0) ); + minEnrichmentTotal = Math.min( minEnrichmentTotal, ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/ + ((totalControlCoverage.first+ totalControlCoverage.second)/36.0) ); + + + maxUniqueSignalStrandBalance = Math.max(maxUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first); + minUniqueSignalStrandBalance = Math.min(minUniqueSignalStrandBalance,((double)totalSignalFwdStrands.first)/totalSignalCoverage.first); + maxNonUniqueSignalStrandBalance = Math.max(maxNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second); + minNonUniqueSignalStrandBalance = Math.min(minNonUniqueSignalStrandBalance,((double)totalSignalFwdStrands.second)/totalSignalCoverage.second); + maxUniqueControlStrandBalance = Math.max(maxUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first); + minUniqueControlStrandBalance = Math.min(minUniqueControlStrandBalance,((double)totalControlFwdStrands.first)/totalControlCoverage.first); + maxNonUniqueControlStrandBalance = Math.max(maxNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second); + minNonUniqueControlStrandBalance = Math.min(minNonUniqueControlStrandBalance,((double)totalControlFwdStrands.second)/totalControlCoverage.second); + + + } + + private void resetBuffer() { + regionStart = windowStart; + lastWindowStart = windowStart; + maxSignalReads.first = (int)Math.round(totalSignalCoverage.first/28.0); + maxSignalReads.second = (int)Math.round(totalSignalCoverage.second/28.0); + minSignalReads.assignFrom(maxSignalReads); + maxControlReads.first = (int)Math.round(totalControlCoverage.first/36.0); + maxControlReads.second = (int)Math.round(totalControlCoverage.second/36.0); + minControlReads.assignFrom(maxControlReads); + minEnrichmentUnique = maxEnrichmentUnique = ((double)totalSignalCoverage.first/28.0)/(totalControlCoverage.first/36.0); + minEnrichmentNonUnique = maxEnrichmentNonUnique = ((double)totalSignalCoverage.second/28.0)/(totalControlCoverage.second/36.0); + minEnrichmentTotal = maxEnrichmentTotal = ((double)(totalSignalCoverage.first+totalSignalCoverage.second)/28.0)/ + ((totalControlCoverage.first+ totalControlCoverage.second)/36.0); + + minUniqueSignalStrandBalance = maxUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.first)/totalSignalCoverage.first; + minNonUniqueSignalStrandBalance = maxNonUniqueSignalStrandBalance = ((double)totalSignalFwdStrands.second)/totalSignalCoverage.second; + minUniqueControlStrandBalance = maxUniqueControlStrandBalance = ((double)totalControlFwdStrands.first)/totalControlCoverage.first; + minNonUniqueControlStrandBalance = maxNonUniqueControlStrandBalance = ((double)totalControlFwdStrands.second)/totalControlCoverage.second; + } + + private void printRegion() { + if ( regionStart == -1 ) return; + out.print(curContigName+":"+regionStart+"-"+windowStop+"\t"+(windowStop-regionStart+1) +"\t"+ + minSignalReads.first+"-"+maxSignalReads.first+"\t"+ + minSignalReads.second+"-"+maxSignalReads.second+"\t"+ + minControlReads.first+"-"+maxControlReads.first+"\t"+ + minControlReads.second+"-"+maxControlReads.second+"\t"); + out.printf("%.2f-%.2f\t",minEnrichmentUnique,maxEnrichmentUnique); + out.printf("%.2f-%.2f\t",minEnrichmentNonUnique,maxEnrichmentNonUnique); + out.printf("%.2f-%.2f\t",minEnrichmentTotal,maxEnrichmentTotal); + out.printf("%.2f-%.2f\t",minUniqueSignalStrandBalance,maxUniqueSignalStrandBalance); + out.printf("%.2f-%.2f\t",minNonUniqueSignalStrandBalance,maxNonUniqueSignalStrandBalance); + out.printf("%.2f-%.2f\t",minUniqueControlStrandBalance,maxUniqueControlStrandBalance); + out.printf("%.2f-%.2f",minNonUniqueControlStrandBalance,maxNonUniqueControlStrandBalance); + + if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER"); + out.println(); + + regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalkerV3.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalkerV3.java new file mode 100644 index 000000000..1725557d1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DSBWalkerV3.java @@ -0,0 +1,441 @@ +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.*; +import net.sf.samtools.SAMRecord; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Jan 3, 2010 + * Time: 1:58:38 PM + * To change this template use File | Settings | File Templates. + */ +public class DSBWalkerV3 extends ReadWalker { + + @Argument(fullName="windowSize",shortName="W",doc="Size of the sliding window",required=true) + int WINDOW_SIZE = 100; + @Argument(fullName="enrichmentCutoff",shortName="E",doc="Report windows with enrichment (signal/control) above this cutoff",required=true) + double ENRICHMENT_CUTOFF = 5.0; + @Argument(fullName="minSignal",shortName="ms",doc="Do not report windows with signal lower than this value "+ + "(this cutoff is secondary to enrichmentCutoff and guards against windows where control signal is 0 or too low,"+ + "so that control*enrichmentCutoff is too low to be convincing)",required=true) + int MIN_SIGNAL = 10; + @Argument(fullName="coverageFactor",shortName="cf",doc="Total number of uniquely mapped signal reads/total number of uniquely mapped control reads",required=false) + double COVERAGE_FACTOR=1.0; + @Argument(fullName="coverageFactorNU",shortName="cfnu",doc="Total number of non-uniquely mapped signal reads/total number of non-uniquely mapped control reads",required=false) + double COVERAGE_FACTOR_NU=1.0; + + private Set signalReadGroups; // we are going to remember which read groups are stimulated tagged and which are unstimulated untagged in order to be able + private Set controlReadGroups ; // to properly assign the reads coming from a merged stream + + private GenomeLoc currentWindow = null; + private String currentContig = "chrM"; + + + private LinkedList readsInSignalWindow = null; + private LinkedList readsInControlWindow = null; + + private WindowStats signalCountsInCurrWindow = new WindowStats(); + private WindowStats controlCountsInCurrWindow = new WindowStats(); + + + // following variables are used by emitWindow to buffer adjacent windows + private int MERGE_CUTOFF = -1; + + private long regionStart = -1; + private long lastWindowStart = -1; + private int addedSinceLastEmit = 0; // how many sliding window steps where buffered since the last emit (i.e. since the last window that really passed significance criteria) + // buffered read count stats for the windows inside the currently held merged print region: + private List signalReadCountsBuffer = new ArrayList(1000); + private List controlReadCountsBuffer = new ArrayList(1000); + + + /** Clears buffered reads and all counts. DOES NOT clear buffered print region */ + private void resetWindows() { + readsInSignalWindow.clear(); + readsInControlWindow.clear(); + signalCountsInCurrWindow.clear(); + controlCountsInCurrWindow.clear(); + } + + private void addSignal(SAMRecord read) { + readsInSignalWindow.add(read); + signalCountsInCurrWindow.addRead(read); + } + + private void addControl(SAMRecord read) { + readsInControlWindow.add(read); + controlCountsInCurrWindow.addRead(read); + } + + /** Discard signal reads that start strictly before the specified position and + * update associated counts + * @param pos + */ + private void purgeSignal(long pos) { + Iterator it = readsInSignalWindow.iterator(); + while ( it.hasNext() ) { + SAMRecord r = it.next(); + if ( r.getAlignmentStart() >= pos ) return; // we are done + + // read starts before pos: discard it and update the counts: + signalCountsInCurrWindow.removeRead(r); + it.remove(); + } + } + + /** Discard signal reads that start strictly before the specified position and + * update associated counts + * @param pos + */ + private void purgeControl(long pos) { + Iterator it = readsInControlWindow.iterator(); + while ( it.hasNext() ) { + SAMRecord r = it.next(); + if ( r.getAlignmentStart() >= pos ) return; // we are done + + // read starts before pos: discard it and update the counts: + controlCountsInCurrWindow.removeRead(r); + it.remove(); + } + } + + private void resetWindowMergingBuffer(long start) { + regionStart = start; + lastWindowStart = start; + signalReadCountsBuffer.clear(); + controlReadCountsBuffer.clear(); + signalReadCountsBuffer.add(signalCountsInCurrWindow.clone()); + controlReadCountsBuffer.add(controlCountsInCurrWindow.clone()); + } + + /** Delayed print: the window starting at 'start' will be added to the print buffer; if the window is close enough + * to the current contents if the buffer, the addition will result in merging the window with the buffer; + * otherwise, the old contents of the buffer will be printed and the buffer will be re-initialized with new window. + * It is assumed that counters are in synch with the start position passed to this method. + * @param start + */ + private void emitWindow(long start) { + // System.out.println("Emitting at "+start); + + if ( regionStart == -1 ) { // we did not keep any region so far; initialize the buffer and return, will print later + resetWindowMergingBuffer(start); + addedSinceLastEmit = 0; + return; + } + + if ( start > lastWindowStart + MERGE_CUTOFF ) { + // this loop is a dummy: we have already cleared those unneeded + // counts in shiftWindows(); stays here to avoid generating bugs later + // if we change something in shiftWindows() + for ( ; addedSinceLastEmit > 0 ; addedSinceLastEmit-- ) { + signalReadCountsBuffer.remove(signalReadCountsBuffer.size()-1); + controlReadCountsBuffer.remove(controlReadCountsBuffer.size()-1); + } + printRegion(); + resetWindowMergingBuffer(start); + return; + } + + // the current window is too close to the previous one: we have to merge; + // NOTE: if window is too close, bufferAccepts() returned true, so the counts are already + // added. + lastWindowStart = start; + addedSinceLastEmit = 0; +// signalReadCountsBuffer.add(uniqueSignalReads); +// controlReadCountsBuffer.add(uniqueControlReads); + + } + + private boolean bufferAccepts(long pos) { + return ( regionStart != -1 && pos <= lastWindowStart+MERGE_CUTOFF); + } + + + + private void printRegion() { + if ( regionStart == -1 ) return; + + long regionStop = lastWindowStart+WINDOW_SIZE-1; + + double[] tmpEnrU = new double[signalReadCountsBuffer.size()]; + int[] tmpSignalU = new int[signalReadCountsBuffer.size()]; + int[] tmpControlU = new int[signalReadCountsBuffer.size()]; + double[] tmpEnrNU = new double[signalReadCountsBuffer.size()]; + int[] tmpSignalNU = new int[signalReadCountsBuffer.size()]; + int[] tmpControlNU = new int[signalReadCountsBuffer.size()]; + + double[] tmpFWDSignalFracU = new double[signalReadCountsBuffer.size()]; + double[] tmpFWDControlFracU = new double[signalReadCountsBuffer.size()]; + double[] tmpFWDSignalFracNU = new double[signalReadCountsBuffer.size()]; + double[] tmpFWDControlFracNU = new double[signalReadCountsBuffer.size()]; + + int lastInd = signalReadCountsBuffer.size() - 1; + + // out.println("Size="+signalReadCountsBuffer.size()+":"); + + for ( int i = 0 ; i <= lastInd ; i++ ) { + tmpEnrU[i]= ( ((double) signalReadCountsBuffer.get(i).uniqueReads) / (controlReadCountsBuffer.get(i).uniqueReads+1.0 ) ) / COVERAGE_FACTOR ; + + tmpSignalU[i] = signalReadCountsBuffer.get(i).uniqueReads; + tmpControlU[i] = controlReadCountsBuffer.get(i).uniqueReads; + + tmpEnrNU[i]= ( ((double) signalReadCountsBuffer.get(i).nonUniqueReads) / (controlReadCountsBuffer.get(i).nonUniqueReads+1.0 ) ) / COVERAGE_FACTOR_NU ; + + tmpSignalNU[i] = signalReadCountsBuffer.get(i).nonUniqueReads; + tmpControlNU[i] = controlReadCountsBuffer.get(i).nonUniqueReads; + + tmpFWDSignalFracU[i] = signalReadCountsBuffer.get(i).uniqueReads > 0 ? ( ((double)signalReadCountsBuffer.get(i).uniqueFWDReads) / signalReadCountsBuffer.get(i).uniqueReads ) : 0.5; + tmpFWDControlFracU[i] = controlReadCountsBuffer.get(i).uniqueReads > 0 ? ( ((double)controlReadCountsBuffer.get(i).uniqueFWDReads) / controlReadCountsBuffer.get(i).uniqueReads ) : 0.5; + tmpFWDSignalFracNU[i] = signalReadCountsBuffer.get(i).nonUniqueReads > 0 ? ( ((double)signalReadCountsBuffer.get(i).nonUniqueFWDReads) / signalReadCountsBuffer.get(i).nonUniqueReads ) : 0.5; + tmpFWDControlFracNU[i] = controlReadCountsBuffer.get(i).nonUniqueReads > 0 ? ( ((double)controlReadCountsBuffer.get(i).nonUniqueFWDReads) / controlReadCountsBuffer.get(i).nonUniqueReads ) : 0.5; + } + + Arrays.sort(tmpEnrU); + Arrays.sort(tmpSignalU); + Arrays.sort(tmpControlU); + + Arrays.sort(tmpEnrNU); + Arrays.sort(tmpSignalNU); + Arrays.sort(tmpControlNU); + + Arrays.sort(tmpFWDSignalFracU); + Arrays.sort(tmpFWDControlFracU); + Arrays.sort(tmpFWDSignalFracNU); + Arrays.sort(tmpFWDControlFracNU); + + + out.print(currentContig+":"+regionStart+"-"+regionStop+"\t"+ + (regionStop-regionStart+1) +"\t"+ + "signal_unique:"+ tmpSignalU[0]+"-"+ tmpSignalU[lastInd/2]+"-"+ tmpSignalU[lastInd]+"\t"+ + "control_unique:"+ tmpControlU[0]+"-"+ tmpControlU[lastInd/2]+"-"+ tmpControlU[lastInd]); + + out.printf("\tsignal_fwd_frac_unique:%.1f-%.1f-%.1f",tmpFWDSignalFracU[0],tmpFWDSignalFracU[lastInd/2],tmpFWDSignalFracU[lastInd]); + out.printf("\tcontrol_fwd_frac_unique:%.1f-%.1f-%.1f",tmpFWDControlFracU[0],tmpFWDControlFracU[lastInd/2],tmpFWDControlFracU[lastInd]); + + out.print("\tsignal_nonnunique:"+ tmpSignalNU[0]+"-"+ tmpSignalNU[lastInd/2]+"-"+ tmpSignalNU[lastInd]+"\t"+ + "control_nonunique:"+ tmpControlNU[0]+"-"+ tmpControlNU[lastInd/2]+"-"+ tmpControlNU[lastInd]); + + out.printf("\tsignal_fwd_frac_nonunique:%.1f-%.1f-%.1f",tmpFWDSignalFracNU[0],tmpFWDSignalFracNU[lastInd/2],tmpFWDSignalFracNU[lastInd]); + out.printf("\tcontrol_fwd_frac_nonunique:%.1f-%.1f-%.1f",tmpFWDControlFracNU[0],tmpFWDControlFracNU[lastInd/2],tmpFWDControlFracNU[lastInd]); + + out.printf("\tnorm_enrichment_unique:%.2f-%.2f-%.2f",tmpEnrU[0],tmpEnrU[lastInd/2],tmpEnrU[lastInd]); + out.printf("\tnorm_enrichment_nonunique:%.2f-%.2f-%.2f",tmpEnrNU[0],tmpEnrNU[lastInd/2],tmpEnrNU[lastInd]); + + // if ( minUniqueSignalStrandBalance > 0.75 || minUniqueSignalStrandBalance < 0.25 ) out.print("\tS_U_STRAND_FILTER"); + out.println(); + + regionStart = -1; // to indicate that there is nothing left to print, the buffer is empty + // System.exit(1); + } + + private void updateWindowMergingBuffer(long i) { + if ( bufferAccepts(i) ) { + // we are not too far away from last window added to the buffer that actually passed significance criteria; + // in this case we have to keep buffering since another significant window may be encountered soon + // System.out.println("Updating buffer at "+i+" with "+ uniqueSignalReads); + signalReadCountsBuffer.add(signalCountsInCurrWindow.clone()); + controlReadCountsBuffer.add(controlCountsInCurrWindow.clone()); + addedSinceLastEmit++; + } else { + // we are too far from the last significant window; if another significant window comes later, it will not + // be merged into this region but will start a new one. In this case we have to erase all the counts we have been + // saving since the last significant window (the latter is where the current region is going to end!) + for ( ; addedSinceLastEmit > 0 ; addedSinceLastEmit-- ) { + signalReadCountsBuffer.remove(signalReadCountsBuffer.size()-1); + controlReadCountsBuffer.remove(controlReadCountsBuffer.size()-1); + } + printRegion(); // print current region right away, why not? next significant window will start new region for sure. + } + + } + + private void shiftWindows(long pos) { + // we shift windows when there is a read that does not fit into the current window. + // the position, to which the shift is performed, is the first position such that the new read + // can be accomodated. Hence we can safely slide up to pos, only discarding reads that go out of scope - + // we are guaranteed that there will be no new reads to add until we reach pos. + + + for ( long i = currentWindow.getStart() ; i < pos ; i++ ) { +// if ( readsInSignalWindow.size() == 0 ) { +// i = pos-1; +// continue; +// }; + +// if ( readsInSignalWindow.getFirst().getAlignmentStart() > i ) { +// i = readsInSignalWindow.getFirst().getAlignmentStart() - 1; // jump directly to next read position +// continue; +// } + + purgeSignal(i); // remove all the reads that start before current position i (and update all the counters) + purgeControl(i); + + updateWindowMergingBuffer(i); + + if ( ( controlCountsInCurrWindow.uniqueReads + 1 ) * ENRICHMENT_CUTOFF < MIN_SIGNAL ) { + // too few control reads + if ( signalCountsInCurrWindow.uniqueReads >= MIN_SIGNAL ) { + // emit signal only if it is higher that hard cut-off: + emitWindow(i); // print current window (print can be buffered and delayed!) + } + } else { + // enough control reads; + // check for actual enrichment: + if ( ((double) signalCountsInCurrWindow.uniqueReads) / (controlCountsInCurrWindow.uniqueReads+1.0) > ENRICHMENT_CUTOFF ) { + emitWindow(i); // print current window (print can be buffered and delayed!) + } + } + + } + + // we emitted intermediate windows up to pos-1 as/if needed and purged everything that starts before pos-1 + // now we have to purge everything that starts before pos and return (no emitting yet, as we are about to add a read upon return): + + purgeSignal(pos); + purgeControl(pos); + + currentWindow = GenomeLocParser.createGenomeLoc(currentWindow.getContigIndex(),pos,pos+WINDOW_SIZE-1); + } + + @Override + public void initialize() { + int nSams = getToolkit().getArguments().samFiles.size(); + + if ( nSams != 2 ) { + out.println("ERROR: two input bam files (signal and backround control) must be specified"); + System.exit(1); + } + List> readGroupSets = getToolkit().getMergedReadGroupsByReaders(); + signalReadGroups = readGroupSets.get(0); +// System.out.println(signalReadGroups.size()+" read groups in signal"); + controlReadGroups = readGroupSets.get(1); +// System.out.println(controlReadGroups.size()+" read groups in control"); + + currentWindow = GenomeLocParser.createGenomeLoc(0,1,WINDOW_SIZE); + readsInSignalWindow = new LinkedList(); + readsInControlWindow = new LinkedList(); + + MERGE_CUTOFF = WINDOW_SIZE; + ENRICHMENT_CUTOFF *= COVERAGE_FACTOR; + currentContig = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(0).getSequenceName(); + } + + + public Integer map(char[] ref, SAMRecord read) { + + if ( AlignmentUtils.isReadUnmapped(read) ) return 0; + + if ( read.getReferenceIndex() > currentWindow.getContigIndex() ) { + printRegion(); // print all we had on the previous contig + + currentWindow = GenomeLocParser.createGenomeLoc(read.getReferenceIndex(), + read.getAlignmentStart(), + read.getAlignmentStart()+WINDOW_SIZE-1); + currentContig = read.getReferenceName(); + resetWindows(); + } else { + // we are on the same contig + if ( read.getAlignmentEnd() > currentWindow.getStop() ) { + // can not accomodate the read inside the current window - shift! + // System.out.println("read ends at "+read.getAlignmentEnd()+" window ends at "+currentWindow.getStop()+ " shifting to "+ (currentWindow.getStart() + ( read.getAlignmentEnd() - currentWindow.getStop() )) +" ("+uniqueSignalReads+"/"+uniqueControlReads+")"); + + // while shifting the window, the following method will issue (delayed) print commands for + // all intermediate windows that pass significance criteria: + shiftWindows(currentWindow.getStart() + ( read.getAlignmentEnd() - currentWindow.getStop() )); + } + // now the read will fit into the window + } + + // at this point we are guaranteed that the read will fit into the window + + if ( signalReadGroups.contains( read.getReadGroup().getReadGroupId() ) ) { + addSignal(read); + } else if ( controlReadGroups.contains( read.getReadGroup().getReadGroupId() )) { + addControl(read); + } else { + throw new StingException("Read "+read + " belongs to unrecognized read group"); + } + return 1; + } + + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + public Integer reduceInit() { + return 0; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public Integer reduce(Integer value, Integer sum) { + return value+sum; //To change body of implemented methods use File | Settings | File Templates. + } + + + /** Auxiliary class that encapsulates the task of monitoring counts of various read traits in some set of reads + * (for instance, reads in the current window). Counted traits include uniquely/non-uniquely mapped reads, + * forward-strand aligned reads etc. + */ + class WindowStats implements Cloneable { + public int uniqueReads = 0; + public int nonUniqueReads = 0; + public int uniqueFWDReads = 0; + public int nonUniqueFWDReads = 0; + + /** Reset all counts to 0 */ + public void clear() { + uniqueReads = nonUniqueReads = uniqueFWDReads = nonUniqueFWDReads = 0; + } + + /** Examines the read and increments the counts for all the monitored traits observed in this read. */ + public void addRead(SAMRecord r) { + if ( r.getMappingQuality() == 0 ) { + // nonunique + nonUniqueReads++; + if ( ! r.getReadNegativeStrandFlag() ) nonUniqueFWDReads++; + } else { + // unique + uniqueReads++; + if ( ! r.getReadNegativeStrandFlag() ) uniqueFWDReads++; + } + } + + /** Examines the read and decrements the counts for all the monitored traits observed in this read. */ + public void removeRead(SAMRecord r) { + if ( r.getMappingQuality() == 0 ) { + // nonunique + nonUniqueReads--; + if ( ! r.getReadNegativeStrandFlag() ) nonUniqueFWDReads--; + } + else { + // unique + uniqueReads--; + if ( ! r.getReadNegativeStrandFlag() ) uniqueFWDReads--; + } + + } + + /** allocates new object, copies this object into it, and returns the copy */ + public WindowStats clone() { + WindowStats ret = new WindowStats(); + ret.uniqueReads = this.uniqueReads; + ret.nonUniqueReads = this.nonUniqueReads; + ret.uniqueFWDReads = this.uniqueFWDReads; + ret.nonUniqueFWDReads = this.nonUniqueFWDReads; + return ret; + } + } +}