TraverseActiveRegions now writes out very nice active region and activity profile IGV formatted files
This commit is contained in:
parent
8026199e4c
commit
09edc6baeb
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.traversals;
|
package org.broadinstitute.sting.gatk.traversals;
|
||||||
|
|
||||||
|
import com.google.java.contract.Ensures;
|
||||||
import com.google.java.contract.Requires;
|
import com.google.java.contract.Requires;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
|
@ -38,10 +39,12 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.activeregion.*;
|
import org.broadinstitute.sting.utils.activeregion.*;
|
||||||
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
|
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -79,26 +82,35 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
private GenomeLoc spanOfLastReadSeen = null;
|
private GenomeLoc spanOfLastReadSeen = null;
|
||||||
private ActivityProfile activityProfile = null;
|
private ActivityProfile activityProfile = null;
|
||||||
int maxReadsInMemory = 0;
|
int maxReadsInMemory = 0;
|
||||||
|
ActiveRegionWalker walker;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Have the debugging output streams been initialized already?
|
||||||
|
*
|
||||||
|
* We have to do lazy initialization because when the initialize() function is called
|
||||||
|
* the streams aren't yet initialized in the GATK walker.
|
||||||
|
*/
|
||||||
|
private boolean streamsInitialized = false;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) {
|
public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) {
|
||||||
super.initialize(engine, walker, progressMeter);
|
super.initialize(engine, walker, progressMeter);
|
||||||
|
|
||||||
final ActiveRegionWalker arWalker = (ActiveRegionWalker)walker;
|
this.walker = (ActiveRegionWalker)walker;
|
||||||
if ( arWalker.wantsExtendedReads() && ! arWalker.wantsNonPrimaryReads() ) {
|
if ( this.walker.wantsExtendedReads() && ! this.walker.wantsNonPrimaryReads() ) {
|
||||||
throw new IllegalArgumentException("Active region walker " + arWalker + " requested extended events but not " +
|
throw new IllegalArgumentException("Active region walker " + this.walker + " requested extended events but not " +
|
||||||
"non-primary reads, an inconsistent state. Please modify the walker");
|
"non-primary reads, an inconsistent state. Please modify the walker");
|
||||||
}
|
}
|
||||||
|
|
||||||
activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||||
maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
|
maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
|
||||||
walkerHasPresetRegions = arWalker.hasPresetActiveRegions();
|
walkerHasPresetRegions = this.walker.hasPresetActiveRegions();
|
||||||
|
|
||||||
activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser());
|
activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser());
|
||||||
if ( walkerHasPresetRegions ) {
|
if ( walkerHasPresetRegions ) {
|
||||||
// we load all of the preset locations into the
|
// we load all of the preset locations into the
|
||||||
for ( final GenomeLoc loc : arWalker.getPresetActiveRegions()) {
|
for ( final GenomeLoc loc : this.walker.getPresetActiveRegions()) {
|
||||||
workQueue.add(new ActiveRegion(loc, true, engine.getGenomeLocParser(), getActiveRegionExtension()));
|
workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -327,6 +339,104 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
&& read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop() );
|
&& read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Functions to write out activity profiles and active regions
|
||||||
|
//
|
||||||
|
// -------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize the debugging output streams (activity profile and active regions), if not done so already
|
||||||
|
*/
|
||||||
|
@Ensures("streamsInitialized == true")
|
||||||
|
private void initializeOutputStreamsIfNecessary() {
|
||||||
|
if ( ! streamsInitialized ) {
|
||||||
|
streamsInitialized = true;
|
||||||
|
if ( walker.activityProfileOutStream != null ) {
|
||||||
|
printIGVFormatHeader(walker.activityProfileOutStream, "line", "ActivityProfile");
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( walker.activeRegionOutStream != null ) {
|
||||||
|
printIGVFormatHeader(walker.activeRegionOutStream, "line", "ActiveRegions");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper function to write out a IGV formatted line to out, at loc, with values
|
||||||
|
*
|
||||||
|
* http://www.broadinstitute.org/software/igv/IGV
|
||||||
|
*
|
||||||
|
* @param out a non-null PrintStream where we'll write our line
|
||||||
|
* @param graphType the type of graph to show in IGV for this track
|
||||||
|
* @param columns the column names for this IGV track
|
||||||
|
*/
|
||||||
|
@Requires({
|
||||||
|
"out != null",
|
||||||
|
"graphType != null",
|
||||||
|
"columns.length > 0"
|
||||||
|
})
|
||||||
|
private void printIGVFormatHeader(final PrintStream out, final String graphType, final String ... columns ) {
|
||||||
|
out.printf("#track graphType=%s%n", graphType);
|
||||||
|
out.printf("Chromosome\tStart\tEnd\tFeature\t%s%n", Utils.join("\t", columns));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper function to write out a IGV formatted line to out, at loc, with values
|
||||||
|
*
|
||||||
|
* http://www.broadinstitute.org/software/igv/IGV
|
||||||
|
*
|
||||||
|
* @param out a non-null PrintStream where we'll write our line
|
||||||
|
* @param loc the location of values
|
||||||
|
* @param featureName string name of this feature (see IGV format)
|
||||||
|
* @param values the floating point values to associate with loc and feature name in out
|
||||||
|
*/
|
||||||
|
@Requires({
|
||||||
|
"out != null",
|
||||||
|
"loc != null",
|
||||||
|
"values.length > 0"
|
||||||
|
})
|
||||||
|
private void printIGVFormatRow(final PrintStream out, final GenomeLoc loc, final String featureName, final double ... values) {
|
||||||
|
// note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1
|
||||||
|
out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName);
|
||||||
|
for ( final double value : values )
|
||||||
|
out.print(String.format("\t%.3f", value));
|
||||||
|
out.println();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write out activity profile information, if requested by the walker
|
||||||
|
*
|
||||||
|
* @param states the states in the current activity profile
|
||||||
|
*/
|
||||||
|
@Requires("states != null")
|
||||||
|
private void writeActivityProfile(final List<ActivityProfileState> states) {
|
||||||
|
if ( walker.activityProfileOutStream != null ) {
|
||||||
|
initializeOutputStreamsIfNecessary();
|
||||||
|
for ( final ActivityProfileState state : states ) {
|
||||||
|
printIGVFormatRow(walker.activityProfileOutStream, state.getLoc(), "state", state.isActiveProb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write out each active region to the walker activeRegionOutStream
|
||||||
|
*
|
||||||
|
* @param region the region we're currently operating on
|
||||||
|
*/
|
||||||
|
@Requires("region != null")
|
||||||
|
private void writeActiveRegion(final ActiveRegion region) {
|
||||||
|
if( walker.activeRegionOutStream != null ) {
|
||||||
|
initializeOutputStreamsIfNecessary();
|
||||||
|
printIGVFormatRow(walker.activeRegionOutStream, region.getLocation().getStartLocation(),
|
||||||
|
"end-marker", 0.0);
|
||||||
|
printIGVFormatRow(walker.activeRegionOutStream, region.getLocation(),
|
||||||
|
"size=" + region.getLocation().size(), region.isActive ? 1.0 : -1.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------------------
|
// -------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// Functions to process active regions that are ready for map / reduce calls
|
// Functions to process active regions that are ready for map / reduce calls
|
||||||
|
|
@ -349,26 +459,6 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
if ( ! walkerHasPresetRegions ) {
|
if ( ! walkerHasPresetRegions ) {
|
||||||
activityProfile.add(state);
|
activityProfile.add(state);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( walker.activityProfileOutStream != null )
|
|
||||||
walker.activityProfileOutStream.printf("%s\t%d\t%d\t%.2f%n",
|
|
||||||
locus.getLocation().getContig(), locus.getLocation().getStart(), locus.getLocation().getStart(),
|
|
||||||
state.isActiveProb);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Write out each active region to the walker activeRegionOutStream
|
|
||||||
*
|
|
||||||
* @param walker
|
|
||||||
*/
|
|
||||||
private void writeActiveRegionsToStream( final ActiveRegionWalker<M, T> walker ) {
|
|
||||||
// Just want to output the active regions to a file, not actually process them
|
|
||||||
for( final ActiveRegion activeRegion : workQueue ) {
|
|
||||||
if( activeRegion.isActive ) {
|
|
||||||
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -384,23 +474,20 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
if ( ! activeRegions.isEmpty() && logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
if ( ! activeRegions.isEmpty() && logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( walker.activeRegionOutStream != null ) {
|
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
||||||
writeActiveRegionsToStream(walker);
|
while( workQueue.peek() != null ) {
|
||||||
return sum;
|
final ActiveRegion activeRegion = workQueue.peek();
|
||||||
} else {
|
if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) {
|
||||||
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + spanOfLastSeenRead());
|
||||||
while( workQueue.peek() != null ) {
|
writeActivityProfile(activeRegion.getSupportingStates());
|
||||||
final ActiveRegion activeRegion = workQueue.peek();
|
writeActiveRegion(activeRegion);
|
||||||
if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) {
|
sum = processActiveRegion( workQueue.remove(), sum, walker );
|
||||||
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + spanOfLastSeenRead());
|
} else {
|
||||||
sum = processActiveRegion( workQueue.remove(), sum, walker );
|
break;
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
private T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
|
private T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
|
||||||
|
|
|
||||||
|
|
@ -61,10 +61,26 @@ import java.util.*;
|
||||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class})
|
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class})
|
||||||
@RemoveProgramRecords
|
@RemoveProgramRecords
|
||||||
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
||||||
@Output(fullName="activityProfileOut", shortName="APO", doc="Output the raw activity profile results bed file", required = false)
|
/**
|
||||||
|
* If provided, this walker will write out its activity profile (per bp probabilities of being active)
|
||||||
|
* to this file in the IGV formatted TAB deliminated output:
|
||||||
|
*
|
||||||
|
* http://www.broadinstitute.org/software/igv/IGV
|
||||||
|
*
|
||||||
|
* Intended to make debugging the activity profile calculations easier
|
||||||
|
*/
|
||||||
|
@Output(fullName="activityProfileOut", shortName="APO", doc="Output the raw activity profile results in IGV format", required = false)
|
||||||
public PrintStream activityProfileOutStream = null;
|
public PrintStream activityProfileOutStream = null;
|
||||||
|
|
||||||
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
|
/**
|
||||||
|
* If provided, this walker will write out its active and inactive regions
|
||||||
|
* to this file in the IGV formatted TAB deliminated output:
|
||||||
|
*
|
||||||
|
* http://www.broadinstitute.org/software/igv/IGV
|
||||||
|
*
|
||||||
|
* Intended to make debugging the active region calculations easier
|
||||||
|
*/
|
||||||
|
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this IGV formatted file", required = false)
|
||||||
public PrintStream activeRegionOutStream = null;
|
public PrintStream activeRegionOutStream = null;
|
||||||
|
|
||||||
@Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false)
|
@Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false)
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
|
|
@ -44,6 +45,7 @@ import java.util.ArrayList;
|
||||||
public class ActiveRegion implements HasGenomeLocation {
|
public class ActiveRegion implements HasGenomeLocation {
|
||||||
|
|
||||||
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||||
|
private final List<ActivityProfileState> supportingStates;
|
||||||
private final GenomeLoc activeRegionLoc;
|
private final GenomeLoc activeRegionLoc;
|
||||||
private final GenomeLoc extendedLoc;
|
private final GenomeLoc extendedLoc;
|
||||||
private final int extension;
|
private final int extension;
|
||||||
|
|
@ -51,8 +53,9 @@ public class ActiveRegion implements HasGenomeLocation {
|
||||||
private final GenomeLocParser genomeLocParser;
|
private final GenomeLocParser genomeLocParser;
|
||||||
public final boolean isActive;
|
public final boolean isActive;
|
||||||
|
|
||||||
public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) {
|
public ActiveRegion( final GenomeLoc activeRegionLoc, final List<ActivityProfileState> supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) {
|
||||||
this.activeRegionLoc = activeRegionLoc;
|
this.activeRegionLoc = activeRegionLoc;
|
||||||
|
this.supportingStates = new ArrayList<ActivityProfileState>(supportingStates);
|
||||||
this.isActive = isActive;
|
this.isActive = isActive;
|
||||||
this.genomeLocParser = genomeLocParser;
|
this.genomeLocParser = genomeLocParser;
|
||||||
this.extension = extension;
|
this.extension = extension;
|
||||||
|
|
@ -112,6 +115,8 @@ public class ActiveRegion implements HasGenomeLocation {
|
||||||
public GenomeLoc getExtendedLoc() { return extendedLoc; }
|
public GenomeLoc getExtendedLoc() { return extendedLoc; }
|
||||||
public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; }
|
public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; }
|
||||||
|
|
||||||
|
public List<ActivityProfileState> getSupportingStates() { return supportingStates; }
|
||||||
|
|
||||||
public int getExtension() { return extension; }
|
public int getExtension() { return extension; }
|
||||||
public int size() { return reads.size(); }
|
public int size() { return reads.size(); }
|
||||||
public void clearReads() { reads.clear(); }
|
public void clearReads() { reads.clear(); }
|
||||||
|
|
|
||||||
|
|
@ -336,7 +336,9 @@ public class ActivityProfile {
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
// we need to create the active region, and clip out the states we're extracting from this profile
|
// we need to create the active region, and clip out the states we're extracting from this profile
|
||||||
stateList.subList(0, offsetOfNextRegionEnd + 1).clear();
|
final List<ActivityProfileState> sub = stateList.subList(0, offsetOfNextRegionEnd + 1);
|
||||||
|
final List<ActivityProfileState> supportingStates = new ArrayList<ActivityProfileState>(sub);
|
||||||
|
sub.clear();
|
||||||
|
|
||||||
// update the start and stop locations as necessary
|
// update the start and stop locations as necessary
|
||||||
if ( stateList.isEmpty() ) {
|
if ( stateList.isEmpty() ) {
|
||||||
|
|
@ -345,7 +347,7 @@ public class ActivityProfile {
|
||||||
regionStartLoc = stateList.get(0).getLoc();
|
regionStartLoc = stateList.get(0).getLoc();
|
||||||
}
|
}
|
||||||
final GenomeLoc regionLoc = parser.createGenomeLoc(first.getLoc().getContig(), first.getLoc().getStart(), first.getLoc().getStart() + offsetOfNextRegionEnd);
|
final GenomeLoc regionLoc = parser.createGenomeLoc(first.getLoc().getContig(), first.getLoc().getStart(), first.getLoc().getStart() + offsetOfNextRegionEnd);
|
||||||
return new ActiveRegion(regionLoc, isActiveRegion, parser, activeRegionExtension);
|
return new ActiveRegion(regionLoc, supportingStates, isActiveRegion, parser, activeRegionExtension);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue