TraverseActiveRegions now writes out very nice active region and activity profile IGV formatted files

This commit is contained in:
Mark DePristo 2013-01-23 13:44:40 -05:00
parent 8026199e4c
commit 09edc6baeb
4 changed files with 156 additions and 46 deletions

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.traversals;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -38,10 +39,12 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.activeregion.*;
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
import java.util.*;
/**
@ -79,26 +82,35 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
private GenomeLoc spanOfLastReadSeen = null;
private ActivityProfile activityProfile = null;
int maxReadsInMemory = 0;
ActiveRegionWalker walker;
/**
* Have the debugging output streams been initialized already?
*
* We have to do lazy initialization because when the initialize() function is called
* the streams aren't yet initialized in the GATK walker.
*/
private boolean streamsInitialized = false;
@Override
public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) {
super.initialize(engine, walker, progressMeter);
final ActiveRegionWalker arWalker = (ActiveRegionWalker)walker;
if ( arWalker.wantsExtendedReads() && ! arWalker.wantsNonPrimaryReads() ) {
throw new IllegalArgumentException("Active region walker " + arWalker + " requested extended events but not " +
this.walker = (ActiveRegionWalker)walker;
if ( this.walker.wantsExtendedReads() && ! this.walker.wantsNonPrimaryReads() ) {
throw new IllegalArgumentException("Active region walker " + this.walker + " requested extended events but not " +
"non-primary reads, an inconsistent state. Please modify the walker");
}
activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
walkerHasPresetRegions = arWalker.hasPresetActiveRegions();
walkerHasPresetRegions = this.walker.hasPresetActiveRegions();
activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser());
if ( walkerHasPresetRegions ) {
// we load all of the preset locations into the
for ( final GenomeLoc loc : arWalker.getPresetActiveRegions()) {
workQueue.add(new ActiveRegion(loc, true, engine.getGenomeLocParser(), getActiveRegionExtension()));
for ( final GenomeLoc loc : this.walker.getPresetActiveRegions()) {
workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension()));
}
}
}
@ -327,6 +339,104 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
&& read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop() );
}
// -------------------------------------------------------------------------------------
//
// Functions to write out activity profiles and active regions
//
// -------------------------------------------------------------------------------------
/**
* Initialize the debugging output streams (activity profile and active regions), if not done so already
*/
@Ensures("streamsInitialized == true")
private void initializeOutputStreamsIfNecessary() {
if ( ! streamsInitialized ) {
streamsInitialized = true;
if ( walker.activityProfileOutStream != null ) {
printIGVFormatHeader(walker.activityProfileOutStream, "line", "ActivityProfile");
}
if ( walker.activeRegionOutStream != null ) {
printIGVFormatHeader(walker.activeRegionOutStream, "line", "ActiveRegions");
}
}
}
/**
* Helper function to write out a IGV formatted line to out, at loc, with values
*
* http://www.broadinstitute.org/software/igv/IGV
*
* @param out a non-null PrintStream where we'll write our line
* @param graphType the type of graph to show in IGV for this track
* @param columns the column names for this IGV track
*/
@Requires({
"out != null",
"graphType != null",
"columns.length > 0"
})
private void printIGVFormatHeader(final PrintStream out, final String graphType, final String ... columns ) {
out.printf("#track graphType=%s%n", graphType);
out.printf("Chromosome\tStart\tEnd\tFeature\t%s%n", Utils.join("\t", columns));
}
/**
* Helper function to write out a IGV formatted line to out, at loc, with values
*
* http://www.broadinstitute.org/software/igv/IGV
*
* @param out a non-null PrintStream where we'll write our line
* @param loc the location of values
* @param featureName string name of this feature (see IGV format)
* @param values the floating point values to associate with loc and feature name in out
*/
@Requires({
"out != null",
"loc != null",
"values.length > 0"
})
private void printIGVFormatRow(final PrintStream out, final GenomeLoc loc, final String featureName, final double ... values) {
// note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1
out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName);
for ( final double value : values )
out.print(String.format("\t%.3f", value));
out.println();
}
/**
* Write out activity profile information, if requested by the walker
*
* @param states the states in the current activity profile
*/
@Requires("states != null")
private void writeActivityProfile(final List<ActivityProfileState> states) {
if ( walker.activityProfileOutStream != null ) {
initializeOutputStreamsIfNecessary();
for ( final ActivityProfileState state : states ) {
printIGVFormatRow(walker.activityProfileOutStream, state.getLoc(), "state", state.isActiveProb);
}
}
}
/**
* Write out each active region to the walker activeRegionOutStream
*
* @param region the region we're currently operating on
*/
@Requires("region != null")
private void writeActiveRegion(final ActiveRegion region) {
if( walker.activeRegionOutStream != null ) {
initializeOutputStreamsIfNecessary();
printIGVFormatRow(walker.activeRegionOutStream, region.getLocation().getStartLocation(),
"end-marker", 0.0);
printIGVFormatRow(walker.activeRegionOutStream, region.getLocation(),
"size=" + region.getLocation().size(), region.isActive ? 1.0 : -1.0);
}
}
// -------------------------------------------------------------------------------------
//
// Functions to process active regions that are ready for map / reduce calls
@ -349,26 +459,6 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
if ( ! walkerHasPresetRegions ) {
activityProfile.add(state);
}
if ( walker.activityProfileOutStream != null )
walker.activityProfileOutStream.printf("%s\t%d\t%d\t%.2f%n",
locus.getLocation().getContig(), locus.getLocation().getStart(), locus.getLocation().getStart(),
state.isActiveProb);
}
/**
* Write out each active region to the walker activeRegionOutStream
*
* @param walker
*/
private void writeActiveRegionsToStream( final ActiveRegionWalker<M, T> walker ) {
// Just want to output the active regions to a file, not actually process them
for( final ActiveRegion activeRegion : workQueue ) {
if( activeRegion.isActive ) {
walker.activeRegionOutStream.println( activeRegion.getLocation() );
}
}
}
/**
@ -384,23 +474,20 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
if ( ! activeRegions.isEmpty() && logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." );
}
if ( walker.activeRegionOutStream != null ) {
writeActiveRegionsToStream(walker);
return sum;
} else {
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
while( workQueue.peek() != null ) {
final ActiveRegion activeRegion = workQueue.peek();
if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) {
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + spanOfLastSeenRead());
sum = processActiveRegion( workQueue.remove(), sum, walker );
} else {
break;
}
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
while( workQueue.peek() != null ) {
final ActiveRegion activeRegion = workQueue.peek();
if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) {
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + spanOfLastSeenRead());
writeActivityProfile(activeRegion.getSupportingStates());
writeActiveRegion(activeRegion);
sum = processActiveRegion( workQueue.remove(), sum, walker );
} else {
break;
}
return sum;
}
return sum;
}
private T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {

View File

@ -61,10 +61,26 @@ import java.util.*;
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class})
@RemoveProgramRecords
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
@Output(fullName="activityProfileOut", shortName="APO", doc="Output the raw activity profile results bed file", required = false)
/**
* If provided, this walker will write out its activity profile (per bp probabilities of being active)
* to this file in the IGV formatted TAB deliminated output:
*
* http://www.broadinstitute.org/software/igv/IGV
*
* Intended to make debugging the activity profile calculations easier
*/
@Output(fullName="activityProfileOut", shortName="APO", doc="Output the raw activity profile results in IGV format", required = false)
public PrintStream activityProfileOutStream = null;
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
/**
* If provided, this walker will write out its active and inactive regions
* to this file in the IGV formatted TAB deliminated output:
*
* http://www.broadinstitute.org/software/igv/IGV
*
* Intended to make debugging the active region calculations easier
*/
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this IGV formatted file", required = false)
public PrintStream activeRegionOutStream = null;
@Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false)

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
@ -44,6 +45,7 @@ import java.util.ArrayList;
public class ActiveRegion implements HasGenomeLocation {
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
private final List<ActivityProfileState> supportingStates;
private final GenomeLoc activeRegionLoc;
private final GenomeLoc extendedLoc;
private final int extension;
@ -51,8 +53,9 @@ public class ActiveRegion implements HasGenomeLocation {
private final GenomeLocParser genomeLocParser;
public final boolean isActive;
public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) {
public ActiveRegion( final GenomeLoc activeRegionLoc, final List<ActivityProfileState> supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) {
this.activeRegionLoc = activeRegionLoc;
this.supportingStates = new ArrayList<ActivityProfileState>(supportingStates);
this.isActive = isActive;
this.genomeLocParser = genomeLocParser;
this.extension = extension;
@ -112,6 +115,8 @@ public class ActiveRegion implements HasGenomeLocation {
public GenomeLoc getExtendedLoc() { return extendedLoc; }
public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; }
public List<ActivityProfileState> getSupportingStates() { return supportingStates; }
public int getExtension() { return extension; }
public int size() { return reads.size(); }
public void clearReads() { reads.clear(); }

View File

@ -336,7 +336,9 @@ public class ActivityProfile {
return null;
// we need to create the active region, and clip out the states we're extracting from this profile
stateList.subList(0, offsetOfNextRegionEnd + 1).clear();
final List<ActivityProfileState> sub = stateList.subList(0, offsetOfNextRegionEnd + 1);
final List<ActivityProfileState> supportingStates = new ArrayList<ActivityProfileState>(sub);
sub.clear();
// update the start and stop locations as necessary
if ( stateList.isEmpty() ) {
@ -345,7 +347,7 @@ public class ActivityProfile {
regionStartLoc = stateList.get(0).getLoc();
}
final GenomeLoc regionLoc = parser.createGenomeLoc(first.getLoc().getContig(), first.getLoc().getStart(), first.getLoc().getStart() + offsetOfNextRegionEnd);
return new ActiveRegion(regionLoc, isActiveRegion, parser, activeRegionExtension);
return new ActiveRegion(regionLoc, supportingStates, isActiveRegion, parser, activeRegionExtension);
}
/**