Get gVCF to work without --dontTrimActiveRegions

Story:

https://www.pivotaltracker.com/story/show/65048706
https://www.pivotaltracker.com/story/show/65116908

Changes:

ActiveRegionTrimmer in now an argument collection and it returns not only the trimmed down active region but also the non-variant containing flanking regions
HaplotypeCaller code has been simplified significantly pushing some functionality two other classes like ActiveRegion and AssemblyResultSet.

Fixed a problem with the way the trimming was done causing some gVCF non-variant records no have conservative 0,0,0 PLs
This commit is contained in:
Valentin Ruano-Rubio 2014-02-03 18:26:22 -05:00
parent 1de7a27471
commit 98ffcf6833
9 changed files with 677 additions and 284 deletions

View File

@ -43,109 +43,492 @@
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/ */
package org.broadinstitute.sting.gatk.walkers.haplotypecaller; package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.LinkedList; import java.util.*;
import java.util.List;
import java.util.TreeSet;
/** /**
* Trim down an active region based on a set of variants found across the haplotypes within the region * Helper component to manage active region trimming
* *
* User: depristo * <p/>
* Date: 4/27/13 * It receives the user arguments that controls trimming and also performs the trimming region calculation.
* Time: 2:10 PM *
* @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.org&gt;
*/ */
class ActiveRegionTrimmer { class ActiveRegionTrimmer {
private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class);
private final boolean logTrimming;
private final int snpPadding, nonSnpPadding, maxDistanceInExtensionForGenotyping;
private final GenomeLocParser parser;
/** /**
* Create a new ActiveRegionTrimmer * Genome location parser use in order to create and manipulate genomic intervals.
*
* @param logTrimming should we log our trimming events?
* @param snpPadding how much bp context should we ensure around snps?
* @param nonSnpPadding how much bp context should we ensure around anything not a snp?
* @param maxDistanceInExtensionForGenotyping the max extent we are will to go into the extended region of the
* origin active region in order to properly genotype events in the
* non-extended active region?
* @param parser a genome loc parser so we can create genome locs
*/ */
ActiveRegionTrimmer(boolean logTrimming, int snpPadding, int nonSnpPadding, int maxDistanceInExtensionForGenotyping, GenomeLocParser parser) { private GenomeLocParser locParser;
if ( snpPadding < 0 ) throw new IllegalArgumentException("snpPadding must be >= 0 but got " + snpPadding);
if ( nonSnpPadding < 0 ) throw new IllegalArgumentException("nonSnpPadding must be >= 0 but got " + nonSnpPadding);
if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping);
if ( parser == null ) throw new IllegalArgumentException("parser cannot be null");
logger.debug("Trimmer created with parameters " + logTrimming + " " + snpPadding + " " + nonSnpPadding + " " + maxDistanceInExtensionForGenotyping); /**
this.logTrimming = logTrimming; * Holds the debug flag. If {@code true} the trimmer will output debugging messages into the log.
this.snpPadding = snpPadding; */
this.nonSnpPadding = nonSnpPadding; private boolean debug;
this.maxDistanceInExtensionForGenotyping = maxDistanceInExtensionForGenotyping;
this.parser = parser; /**
* Holds the extension to be used based on whether GGA mode is on or off.
*/
private int usableExtension;
/**
* Records whether the trimming intervals are going to be used to emit reference confidence, {@code true},
* or regular HC output {@code false}.
*/
private boolean emitReferenceConfidence;
@Hidden
@Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false)
protected boolean dontTrimActiveRegions = false;
/**
* the maximum extent into the full active region extension that we're willing to go in genotyping our events
*/
@Hidden
@Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false)
protected int discoverExtension = 25;
@Hidden
@Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false)
protected int ggaExtension = 300;
/**
* Include at least this many bases around an event for calling it
*/
@Hidden
@Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event for calling indels", required=false)
protected int indelPadding = 150;
@Hidden
@Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false)
protected int snpPadding = 20;
/**
* Holds a reference the trimmer logger.
*/
private final static Logger logger = Logger.getLogger(ActiveRegionTrimmer.class);
/**
* Initializes the trimmer.
*
* <p/>
* This method should be called once and only once before any trimming is performed.
*
*
* @param glp the genome-location-parser to be used when operating with genomic locations.
* @param debug whether to show extra debug log messages.
* @param isGGA whether the trimming region calculator should act as if we are in GGA mode or not.
* @param emitReferenceConfidence indicates whether we plan to use this trimmer to generate trimmed regions
* to be used for emitting reference confidence.
*
* @throws IllegalStateException if this trim calculator has already been initialized.
* @throws IllegalArgumentException if the input location parser is {@code null}.
* @throws UserException.BadArgumentValue if any of the user argument values is invalid.
*/
void initialize(final GenomeLocParser glp, final boolean debug, final boolean isGGA, final boolean emitReferenceConfidence) {
if (locParser != null)
throw new IllegalStateException(getClass().getSimpleName() + " instance initialized twice");
if (glp == null)
throw new IllegalArgumentException("input genome-loc-parser cannot be null");
checkUserArguments();
locParser = glp;
this.debug = debug;
usableExtension = isGGA ? ggaExtension : discoverExtension;
this.emitReferenceConfidence = emitReferenceConfidence;
} }
/** /**
* Trim down the active region to a region large enough to properly genotype the events found within the active * Checks user trimming argument values
* region span, excluding all variants that only occur within its extended span.
* *
* This function merely creates the region, but it doesn't populate the reads back into the region. * @throws UserException.BadArgumentValue if there is some problem with any of the arguments values.
*
* @param region our full active region
* @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position
* @param emitReferenceConfidence are we going to estimate the reference confidence with this active region?
* @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully
*/ */
public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet<VariantContext> allVariantsWithinExtendedRegion, final boolean emitReferenceConfidence) { private void checkUserArguments() {
if ( snpPadding < 0 ) throw new UserException.BadArgumentValue("paddingAroundSNPs","" + snpPadding + "< 0");
if ( indelPadding < 0 ) throw new UserException.BadArgumentValue("paddingAroundIndels","" + indelPadding + "< 0");
if ( discoverExtension < 0) throw new UserException.BadArgumentValue("maxDiscARExtension","" + discoverExtension + "< 0");
if ( ggaExtension < 0) throw new UserException.BadArgumentValue("maxGGAAREExtension","" + ggaExtension + "< 0");
}
if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region /**
return null; * Holds the result of trimming.
*
*
*
*/
public static class Result {
/**
* Indicates whether trimming is required per data and user request.
*/
protected final boolean needsTrimming;
/**
* Holds the input active region.
*/
protected final ActiveRegion originalRegion;
/**
* Holds the smaller range that contain all relevant callable variants in the
* input active region (not considering the extension).
*
*/
protected final GenomeLoc callableSpan;
/**
* Maximum available range for the trimmed variant region.
*/
protected final GenomeLoc maximumSpan;
/**
* The trimmed variant region span including the extension.
*/
protected final GenomeLoc extendedSpan;
/**
* The ideal trimmer variant region span including the extension.
*/
protected final GenomeLoc idealSpan;
/**
* Returns the ideal trimming span.
*
* <p/>
* The ideal span is the one containing all callable variation overlapping the original active region span
* (without extension) and the applicable padding {@link #getPadding()} in both sides.
*
*
* @return never {@code null}.
*/
@SuppressWarnings("unused")
public GenomeLoc getIdealSpan() {
return idealSpan;
}
/**
* Holds the flanking spans that do not contain the callable variants.
* <p/>
* The first element of the pair is the left (up-stream) non-variant flank, whereas the second element is
* the right (down-stream) non-variant flank.
*/
protected final Pair<GenomeLoc,GenomeLoc> nonVariantFlanks;
/**
* Holds the collection of callable events within the variant trimming region.
*/
protected final List<VariantContext> callableEvents;
/**
* Required padding around the variant trimming region.
*/
protected final int padding;
/**
* Returns the required padding around callable variation.
*
* <p/>
* Notice that due to the limiting span of the original active region (including its extension) it
* is possible that the resulting final trimmed variant region span does not satisfies the padding. However
* that should be rare.
*
* @return 0 or greater.
*/
@SuppressWarnings("unused")
public int getPadding() {
return padding;
}
/**
* Holds the maximum extension around the original active region span considered for the trimmed
* variation region.
*/
protected final int usableExtension;
/**
* Returns the maximum extension around the original active region span considered for the trimmed
* variation region.
*
* <p/>
* From time to time, the trimmed region may require a span beyond the input original active region's.
* For example when there is a callable event close ot one of its ends and the required padding makes it
* round beyond that limit.
*
* <p/>
* Notice that due to the limiting span of the original active region (including its extended region) it
* is possible that the resulting final trimmed variant region span goes beyond this extension including more of
* the original active region own extension.
*
* @return 0 or greater.
*/
@SuppressWarnings("unused")
public int getUsableExtension() {
return usableExtension;
}
/**
* Holds variant-containing callable region.
* <p/>
* This is lazy-initialized using {@link #callableSpan}.
*/
protected ActiveRegion callableRegion;
/**
* Non-variant left flank region.
* <p/>
* This is lazy-initialized using
* {@link #nonVariantFlanks}.{@link Pair#getFirst() getFirst()}.
*/
private ActiveRegion leftFlankRegion;
/**
* Non-variant right flank region.
* <p/>
* This is lazy-initialized using
* {@link #nonVariantFlanks}.{@link Pair#getFirst() getSecond()}.
*/
private ActiveRegion rightFlankRegion;
/**
* Whether the variant trimmed region is going to be used for emitting reference confidence records.
*/
private final boolean emitReferenceConfidence;
/**
* Creates a trimming result given all its properties.
*
* @param emitReferenceConfidence whether reference confidence output modes are on.
* @param needsTrimming whether there is any trimming needed at all.
* @param originalRegion the original active region.
* @param padding padding around contained callable variation events.
* @param extension the extension applied to the trimmed variant span.
* @param overlappingEvents contained callable variation events.
* @param nonVariantFlanks pair of non-variant flank spans around the variant containing span.
* @param extendedSpan final trimmed variant span including the extension.
* @param idealSpan the ideal span, that contains.
* @param maximumSpan maximum possible trimmed span based on the input original active region extended span.
* @param callableSpan variant containing span without padding.
*/
protected Result(final boolean emitReferenceConfidence, final boolean needsTrimming, final ActiveRegion originalRegion,
final int padding, final int extension,
final List<VariantContext> overlappingEvents, final Pair<GenomeLoc,GenomeLoc> nonVariantFlanks,
final GenomeLoc extendedSpan,
final GenomeLoc idealSpan,
final GenomeLoc maximumSpan,
final GenomeLoc callableSpan) {
this.emitReferenceConfidence = emitReferenceConfidence;
this.needsTrimming = needsTrimming;
this.originalRegion = originalRegion;
this.nonVariantFlanks = nonVariantFlanks;
this.padding = padding;
this.usableExtension = extension;
this.callableEvents = overlappingEvents;
this.callableSpan = callableSpan;
this.idealSpan = idealSpan;
this.maximumSpan = maximumSpan;
this.extendedSpan = extendedSpan;
if (!extendedSpan.isUnmapped() && !callableSpan.isUnmapped() && !extendedSpan.containsP(callableSpan))
throw new IllegalArgumentException("the extended callable span must include the callable span");
}
/**
* Checks whether there is any variation present in the target region.
*
* @return {@code true} if there is any variant, {@code false} otherwise.
*/
public boolean isVariationPresent() {
return ! callableEvents.isEmpty();
}
/**
* Checks whether the active region needs trimming.
*/
public boolean needsTrimming() {
return needsTrimming;
}
/**
* Returns the trimmed variant containing region
*
* @throws IllegalStateException if there is no variation detected.
*
* @return never {@code null}.
*/
public ActiveRegion getCallableRegion() {
if (callableRegion == null && !extendedSpan.isUnmapped())
//TODO this conditional is a patch to retain the current standard HC run behaviour
//TODO we should simply remove this difference between trimming with or without GVCF
//TODO embracing slight changes in the standard HC output
callableRegion = emitReferenceConfidence ? originalRegion.trim(callableSpan, extendedSpan) : originalRegion.trim(extendedSpan);
else if (extendedSpan.isUnmapped())
throw new IllegalStateException("there is no variation thus no variant region");
return callableRegion;
}
/**
* Checks whether there is a non-empty left flanking non-variant trimmed out region.
* @return {@code true} if there is a non-trivial left flank region, {@code false} otherwise.
*/
public boolean hasLeftFlankingRegion() {
return ! nonVariantFlanks.getFirst().isUnmapped();
}
/**
* Checks whether there is a non-empty right flanking non-variant trimmed out region.
* @return {@code true} if there is a non-trivial right flank region, {@code false} otherwise.
*/
public boolean hasRightFlankingRegion() {
return ! nonVariantFlanks.getSecond().isUnmapped();
}
/**
* Returns the trimmed out left non-variant region.
* <p/>
* Notice that in case of no variation, the whole original region is considered the left flanking region.
*
* @throws IllegalStateException if there is not such as left flanking region.
*/
public ActiveRegion nonVariantLeftFlankRegion() {
if (leftFlankRegion == null && ! nonVariantFlanks.getFirst().isUnmapped())
leftFlankRegion = originalRegion.trim(nonVariantFlanks.getFirst(),originalRegion.getExtension());
else if (nonVariantFlanks.getFirst().isUnmapped())
throw new IllegalStateException("there is no left flank non-variant trimmed out region");
return leftFlankRegion;
}
/**
* Returns the trimmed out right non-variant region.
*/
public ActiveRegion nonVariantRightFlankRegion() {
if (rightFlankRegion == null && ! nonVariantFlanks.getSecond().isUnmapped())
rightFlankRegion = originalRegion.trim(nonVariantFlanks.getSecond(),originalRegion.getExtension());
else if (nonVariantFlanks.getSecond().isUnmapped())
throw new IllegalStateException("there is no right flank non-variant trimmed out region");
return rightFlankRegion;
}
/**
* Creates a result indicating that there was no trimming to be done.
*/
protected static Result noTrimming(final boolean emitReferenceConfidence,
final ActiveRegion targetRegion, final int padding,
final int usableExtension,final List<VariantContext> events) {
final GenomeLoc targetRegionLoc = targetRegion.getLocation();
final Result result = new Result(emitReferenceConfidence,false,targetRegion,padding,usableExtension,events,new Pair<>(GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED),
targetRegionLoc,targetRegionLoc,targetRegionLoc,targetRegionLoc);
result.callableRegion = targetRegion;
return result;
}
/**
* Creates a result indicating that no variation was found.
*/
protected static Result noVariation(final boolean emitReferenceConfidence, final ActiveRegion targetRegion,
final int padding, final int usableExtension) {
final Result result = new Result(emitReferenceConfidence,false,targetRegion,padding,usableExtension,
Collections.<VariantContext>emptyList(),new Pair<>(targetRegion.getLocation(),GenomeLoc.UNMAPPED),
GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED);
result.leftFlankRegion = targetRegion;
return result;
}
}
/**
* Returns a trimming result object from which the variant trimmed region and flanking non-variant sections
* can be recovered latter.
*
* @param originalRegion the genome location range to trim.
* @param allVariantsWithinExtendedRegion list of variants contained in the trimming location. Variants therein
* not overlapping with {@code originalRegion} are simply ignored.
* @return never {@code null}.
*/
public Result trim(final ActiveRegion originalRegion,
final TreeSet<VariantContext> allVariantsWithinExtendedRegion) {
if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants,
return Result.noVariation(emitReferenceConfidence,originalRegion,snpPadding, usableExtension);
final List<VariantContext> withinActiveRegion = new LinkedList<>(); final List<VariantContext> withinActiveRegion = new LinkedList<>();
final GenomeLoc originalRegionRange = originalRegion.getLocation();
boolean foundNonSnp = false; boolean foundNonSnp = false;
GenomeLoc trimLoc = null; GenomeLoc variantSpan = null;
for ( final VariantContext vc : allVariantsWithinExtendedRegion ) { for ( final VariantContext vc : allVariantsWithinExtendedRegion ) {
final GenomeLoc vcLoc = parser.createGenomeLoc(vc); final GenomeLoc vcLoc = locParser.createGenomeLoc(vc);
if ( region.getLocation().overlapsP(vcLoc) ) { if ( originalRegionRange.overlapsP(vcLoc) ) {
if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding foundNonSnp = foundNonSnp || ! vc.isSNP();
foundNonSnp = true; variantSpan = variantSpan == null ? vcLoc : variantSpan.endpointSpan(vcLoc);
trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc);
withinActiveRegion.add(vc); withinActiveRegion.add(vc);
} }
} }
final int pad = ( emitReferenceConfidence || foundNonSnp ? nonSnpPadding : snpPadding ); final int padding = foundNonSnp ? indelPadding : snpPadding;
// we don't actually have anything in the region after removing variants that don't overlap the region's full location // we don't actually have anything in the region after skipping out variants that don't overlap
if ( trimLoc == null ) return null; // the region's full location
if ( variantSpan == null )
return Result.noVariation(emitReferenceConfidence,originalRegion,padding, usableExtension);
// final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping); if ( dontTrimActiveRegions)
// Try to have one kmer before and after any event. return Result.noTrimming(emitReferenceConfidence,originalRegion, padding, usableExtension, withinActiveRegion);
final GenomeLoc regionLoc = region.getLocation(); final GenomeLoc maximumSpan = locParser.createPaddedGenomeLoc(originalRegionRange, usableExtension);
final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping); final GenomeLoc idealSpan = locParser.createPaddedGenomeLoc(variantSpan, padding);
final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad); final GenomeLoc finalSpan = maximumSpan.intersect(idealSpan).union(variantSpan);
final GenomeLoc finalSpan = maxSpan.intersect(idealSpan);
final ActiveRegion trimmedRegion = region.trim(finalSpan); final Pair<GenomeLoc,GenomeLoc> nonVariantRegions = nonVariantTargetRegions(originalRegion, variantSpan);
if ( logTrimming ) {
if ( debug ) {
logger.info("events : " + withinActiveRegion); logger.info("events : " + withinActiveRegion);
logger.info("region : " + regionLoc); logger.info("region : " + originalRegion);
logger.info("trimLoc : " + trimLoc); logger.info("variantSpan : " + variantSpan);
logger.info("pad : " + pad); logger.info("pad : " + padding);
logger.info("idealSpan : " + idealSpan); logger.info("idealSpan : " + idealSpan);
logger.info("maxSpan : " + maxSpan); logger.info("maximumSpan : " + maximumSpan);
logger.info("finalSpan : " + finalSpan); logger.info("finalSpan : " + finalSpan);
logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size());
} }
return trimmedRegion;
return new Result(emitReferenceConfidence,true,originalRegion,padding, usableExtension,withinActiveRegion,nonVariantRegions,finalSpan,idealSpan,maximumSpan,variantSpan);
}
/**
* Calculates the list of region to trim away.
* @param targetRegion region for which to generate the flanking regions.
* @param variantSpan the span of the core region containing relevant variation and required padding.
* @return never {@code null}; 0, 1 or 2 element list.
*/
private Pair<GenomeLoc,GenomeLoc> nonVariantTargetRegions(final ActiveRegion targetRegion, final GenomeLoc variantSpan) {
final GenomeLoc targetRegionRange = targetRegion.getLocation();
final int finalStart = variantSpan.getStart();
final int finalStop = variantSpan.getStop();
final int targetStart = targetRegionRange.getStart();
final int targetStop = targetRegionRange.getStop();
final boolean preTrimmingRequired = targetStart < finalStart;
final boolean postTrimmingRequired = targetStop > finalStop;
if (preTrimmingRequired) {
final String contig = targetRegionRange.getContig();
return postTrimmingRequired ? new Pair<>(
locParser.createGenomeLoc(contig, targetStart, finalStart - 1),
locParser.createGenomeLoc(contig, finalStop + 1, targetStop)) :
new Pair<>(locParser.createGenomeLoc(contig, targetStart, finalStart - 1),GenomeLoc.UNMAPPED);
} else if (postTrimmingRequired)
return new Pair<>(locParser.createGenomeLoc(targetRegionRange.getContig(), finalStop + 1, targetStop),GenomeLoc.UNMAPPED);
else
return new Pair<>(GenomeLoc.UNMAPPED,GenomeLoc.UNMAPPED);
} }
} }

View File

@ -51,8 +51,10 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadT
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.collections.CountSet; import org.broadinstitute.sting.utils.collections.CountSet;
import org.broadinstitute.sting.utils.collections.CountSet; import org.broadinstitute.sting.utils.haplotype.EventMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.haplotype.HaplotypeSizeAndBaseComparator;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.io.StringWriter; import java.io.StringWriter;
@ -84,6 +86,9 @@ public class AssemblyResultSet {
private Haplotype refHaplotype; private Haplotype refHaplotype;
private boolean wasTrimmed = false; private boolean wasTrimmed = false;
private final CountSet kmerSizes; private final CountSet kmerSizes;
private TreeSet<VariantContext> variationEvents;
private boolean debug;
private static Logger logger = Logger.getLogger(AssemblyResultSet.class);
/** /**
* Constructs a new empty assembly result set. * Constructs a new empty assembly result set.
@ -95,21 +100,29 @@ public class AssemblyResultSet {
kmerSizes = new CountSet(4); kmerSizes = new CountSet(4);
} }
/**
* Change the debug status for this assembly-result-set.
* @param newValue new value for the debug status.
*/
void setDebug(final boolean newValue) {
debug = newValue;
}
/** /**
* Trims an assembly result set down based on a new set of trimmed haplotypes. * Trims an assembly result set down based on a new set of trimmed haplotypes.
* *
* @param originalByTrimmedHaplotypes map from trimmed to original haplotypes.
* @param trimmedActiveRegion the trimmed down active region. * @param trimmedActiveRegion the trimmed down active region.
* *
* @throws NullPointerException if any argument in {@code null} or * @throws NullPointerException if any argument in {@code null} or
* if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys. * if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys.
* @throws IllegalArgumentException if there is no reference haplotype amongst the trimmed ones. * @throws IllegalArgumentException if there is no reference haplotype amongst the trimmed ones.
* *
*
* @return never {@code null}, a new trimmed assembly result set. * @return never {@code null}, a new trimmed assembly result set.
*/ */
public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion, public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion) {
final Map<Haplotype,Haplotype> originalByTrimmedHaplotypes) {
final Map<Haplotype,Haplotype> originalByTrimmedHaplotypes = calculateOriginalByTrimmedHaplotypes(trimmedActiveRegion);
if (refHaplotype == null) throw new IllegalStateException(); if (refHaplotype == null) throw new IllegalStateException();
if (trimmedActiveRegion == null) throw new NullPointerException(); if (trimmedActiveRegion == null) throw new NullPointerException();
final AssemblyResultSet result = new AssemblyResultSet(); final AssemblyResultSet result = new AssemblyResultSet();
@ -131,6 +144,53 @@ public class AssemblyResultSet {
return result; return result;
} }
private Map<Haplotype, Haplotype> calculateOriginalByTrimmedHaplotypes(final ActiveRegion trimmedActiveRegion) {
if ( debug ) logger.info("Trimming active region " + getRegionForGenotyping() + " with " + getHaplotypeCount() + " haplotypes");
final List<Haplotype> haplotypeList = getHaplotypeList();
// trim down the haplotypes
final Map<Haplotype,Haplotype> originalByTrimmedHaplotypes = new HashMap<>();
for ( final Haplotype h : haplotypeList ) {
final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
if ( trimmed != null ) {
if (originalByTrimmedHaplotypes.containsKey(trimmed)) {
if (trimmed.isReference()) {
originalByTrimmedHaplotypes.remove(trimmed);
originalByTrimmedHaplotypes.put(trimmed, h);
}
} else
originalByTrimmedHaplotypes.put(trimmed,h);
} else if (h.isReference())
throw new IllegalStateException("trimming eliminates the reference haplotype");
else if ( debug ) {
logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() +
" because it starts with or ends with an insertion or deletion when trimmed to " +
trimmedActiveRegion.getExtendedLoc());
}
}
// create the final list of trimmed haplotypes
final List<Haplotype> trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet());
// resort the trimmed haplotypes.
Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator());
final Map<Haplotype,Haplotype> sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size());
for (final Haplotype trimmed : trimmedHaplotypes)
sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed));
if ( debug ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " +
trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " +
haplotypeList.size() + " to only " + trimmedHaplotypes.size());
if ( debug )
for ( final Haplotype remaining: trimmedHaplotypes )
logger.info("Remains: " + remaining + " cigar " + remaining.getCigar());
return sortedOriginalByTrimmedHaplotypes;
}
/** /**
* Query the reference haplotype in the result set. * Query the reference haplotype in the result set.
* @return {@code null} if none wasn't yet added, otherwise a reference haplotype. * @return {@code null} if none wasn't yet added, otherwise a reference haplotype.
@ -463,4 +523,21 @@ public class AssemblyResultSet {
throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different"); throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different");
} }
/**
* Returns a sorted set of variant events that best explain the haplotypes found by the assembly
* across kmerSizes.
*
* <p/>
* The result is sorted incrementally by location.
*
* @return never {@code null}, but perhaps an empty collection.
*/
public TreeSet<VariantContext> getVariationEvents() {
if (variationEvents == null) {
final List<Haplotype> haplotypeList = getHaplotypeList();
EventMap.buildEventMapsForHaplotypes(haplotypeList,fullReferenceWithPadding,paddedReferenceLoc,debug);
variationEvents = EventMap.getAllVariantContexts(haplotypeList);
}
return variationEvents;
}
} }

View File

@ -214,6 +214,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
private double log10GlobalReadMismappingRate; private double log10GlobalReadMismappingRate;
/**
* Active region trimmer reference.
*/
@ArgumentCollection
protected ActiveRegionTrimmer trimmer = new ActiveRegionTrimmer();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; } public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
/** /**
@ -446,10 +452,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false)
protected boolean debugGraphTransformations = false; protected boolean debugGraphTransformations = false;
@Hidden
@Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false)
protected boolean dontTrimActiveRegions = false;
@Hidden @Hidden
@Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false) @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, we will not analyze soft clipped bases in the reads", required = false)
protected boolean dontUseSoftClippedBases = false; protected boolean dontUseSoftClippedBases = false;
@ -479,28 +481,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false) @Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false)
protected int minObservationsForKmerToBeSolid = 20; protected int minObservationsForKmerToBeSolid = 20;
/**
* the maximum extent into the full active region extension that we're willing to go in genotyping our events
*/
@Hidden
@Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false)
protected int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25;
@Hidden
@Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false)
protected int MAX_GGA_ACTIVE_REGION_EXTENSION = 300;
/**
* Include at least this many bases around an event for calling it
*/
@Hidden
@Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event for calling indels", required=false)
protected int PADDING_AROUND_OTHERS_FOR_CALLING = 150;
@Hidden
@Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false)
protected int PADDING_AROUND_SNPS_FOR_CALLING = 20;
/** /**
* Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base * Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base
* insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator). * insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator).
@ -533,8 +513,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// reference base padding size // reference base padding size
private static final int REFERENCE_PADDING = 500; private static final int REFERENCE_PADDING = 500;
private ActiveRegionTrimmer trimmer = null;
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
@ -675,9 +653,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader());
} }
trimmer = new ActiveRegionTrimmer(DEBUG, PADDING_AROUND_SNPS_FOR_CALLING, PADDING_AROUND_OTHERS_FOR_CALLING, trimmer.initialize(getToolkit().getGenomeLocParser(), DEBUG,
UAC.GenotypingMode.equals(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ? MAX_GGA_ACTIVE_REGION_EXTENSION : MAX_DISCOVERY_ACTIVE_REGION_EXTENSION, UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES,emitReferenceConfidence());
getToolkit().getGenomeLocParser());
} }
private void initializeReferenceConfidenceModel(final Set<String> samples, final Set<VCFHeaderLine> headerInfo) { private void initializeReferenceConfidenceModel(final Set<String> samples, final Set<VCFHeaderLine> headerInfo) {
@ -691,7 +668,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) {
throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER); throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER);
} }
SCAC.STANDARD_CONFIDENCE_FOR_EMITTING = 0.0;
try { try {
vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands); vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands);
@ -799,10 +775,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
return NO_CALLS; return NO_CALLS;
if( !originalActiveRegion.isActive() ) { if( !originalActiveRegion.isActive() )
// Not active so nothing to do! // Not active so nothing to do!
return referenceModelForNoVariation(originalActiveRegion, true); return referenceModelForNoVariation(originalActiveRegion, true);
}
final List<VariantContext> activeAllelesToGenotype = new ArrayList<>(); final List<VariantContext> activeAllelesToGenotype = new ArrayList<>();
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
@ -819,20 +794,41 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
} }
// run the local assembler, getting back a collection of information on how we should proceed // run the local assembler, getting back a collection of information on how we should proceed
final AssemblyResultSet assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); final AssemblyResultSet untrimmedAssemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
final TreeSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents();
// TODO - line bellow might be unecessary : it might be that assemblyResult will always have those alleles anyway
// TODO - so check and remove if that is the case:
allVariationEvents.addAll(activeAllelesToGenotype);
final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents);
if (!trimmingResult.isVariationPresent())
return referenceModelForNoVariation(originalActiveRegion,false);
final AssemblyResultSet assemblyResult =
trimmingResult.needsTrimming() ? untrimmedAssemblyResult.trimTo(trimmingResult.getCallableRegion()) : untrimmedAssemblyResult;
final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
// abort early if something is out of the acceptable range
if( ! assemblyResult.isVariationPresent() ) {
return referenceModelForNoVariation(originalActiveRegion, false);
} // only the reference haplotype remains so nothing else to do!
if (dontGenotype) return NO_CALLS; // user requested we not proceed
// filter out reads from genotyping which fail mapping quality based criteria // filter out reads from genotyping which fail mapping quality based criteria
//TODO - why don't do this before any assembly is done? Why not just once at the beginning of this method
//TODO - on the originalActiveRegion?
//TODO - if you move this up you might have to consider to change referenceModelForNoVariation
//TODO - that does also filter reads.
final Collection<GATKSAMRecord> filteredReads = filterNonPassingReads( regionForGenotyping ); final Collection<GATKSAMRecord> filteredReads = filterNonPassingReads( regionForGenotyping );
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads ); final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
// abort early if something is out of the acceptable range
// TODO is this ever true at this point??? perhaps GGA. Need to check.
if( ! assemblyResult.isVariationPresent() )
return referenceModelForNoVariation(originalActiveRegion, false);
// For sure this is not true if gVCF is on.
if (dontGenotype) return NO_CALLS; // user requested we not proceed
// TODO is this ever true at this point??? perhaps GGA. Need to check.
if( regionForGenotyping.size() == 0 ) { if( regionForGenotyping.size() == 0 ) {
// no reads remain after filtering so nothing else to do! // no reads remain after filtering so nothing else to do!
return referenceModelForNoVariation(originalActiveRegion, false); return referenceModelForNoVariation(originalActiveRegion, false);
@ -881,10 +877,20 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
if ( !containsCalls(calledHaplotypes) ) { if ( !containsCalls(calledHaplotypes) ) {
// no called all of the potential haplotypes // no called all of the potential haplotypes
return referenceModelForNoVariation(originalActiveRegion, false); return referenceModelForNoVariation(originalActiveRegion, false);
} else } else {
return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(), final List<VariantContext> result = new LinkedList<>();
// output left-flanking non-variant section:
if (trimmingResult.hasLeftFlankingRegion())
result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantLeftFlankRegion(),false));
// output variant containing region.
result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(),
calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
stratifiedReadMap, calledHaplotypes.getCalls()); stratifiedReadMap, calledHaplotypes.getCalls()));
// output right-flanking non-variant section:
if (trimmingResult.hasRightFlankingRegion())
result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantRightFlankRegion(),false));
return result;
}
} else { } else {
return calledHaplotypes.getCalls(); return calledHaplotypes.getCalls();
} }
@ -925,17 +931,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
try { try {
final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector );
assemblyResultSet.debugDump(logger); assemblyResultSet.debugDump(logger);
return assemblyResultSet;
if ( ! dontTrimActiveRegions ) {
final ActiveRegion trimmedActiveRegion = trimActiveRegion(assemblyResultSet,activeAllelesToGenotype);
if (trimmedActiveRegion != null)
return trimAssemblyResultSet(assemblyResultSet, trimmedActiveRegion);
else {
assemblyResultSet.resetVariationPresent();
return assemblyResultSet;
}
} else
return assemblyResultSet;
} catch ( final Exception e ) { } catch ( final Exception e ) {
// Capture any exception that might be thrown, and write out the assembly failure BAM if requested // Capture any exception that might be thrown, and write out the assembly failure BAM if requested
if ( captureAssemblyFailureBAM ) { if ( captureAssemblyFailureBAM ) {
@ -969,17 +966,20 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
*/ */
private List<VariantContext> referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) { private List<VariantContext> referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) {
if ( emitReferenceConfidence() ) { if ( emitReferenceConfidence() ) {
if ( needsToBeFinalized ) finalizeActiveRegion(region); //TODO - why the activeRegion cannot manage its own one-time finalization and filtering?
filterNonPassingReads(region); // TODO -- remove when filtering is done in finalizeActiveRegion //TODO - perhaps we can remove the last parameter of this method and the three lines bellow?
if ( needsToBeFinalized )
finalizeActiveRegion(region);
filterNonPassingReads(region);
final GenomeLoc paddedLoc = region.getExtendedLoc(); final GenomeLoc paddedLoc = region.getExtendedLoc();
final Haplotype refHaplotype = createReferenceHaplotype(region, paddedLoc); final Haplotype refHaplotype = createReferenceHaplotype(region, paddedLoc);
final List<Haplotype> haplotypes = Collections.singletonList(refHaplotype); final List<Haplotype> haplotypes = Collections.singletonList(refHaplotype);
return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes, return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes,
paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region), paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region),
Collections.<VariantContext>emptyList()); Collections.<VariantContext>emptyList());
} else { } else
return NO_CALLS; return NO_CALLS;
}
} }
/** /**
@ -1006,91 +1006,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
return map; return map;
} }
private ActiveRegion trimActiveRegion(final AssemblyResultSet resultSet, final Collection<VariantContext> activeAllelesToGenotype) {
if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes");
final List<Haplotype> haplotypeList = resultSet.getHaplotypeList();
final ActiveRegion originalGenotypingRegion = resultSet.getRegionForGenotyping();
EventMap.buildEventMapsForHaplotypes(haplotypeList, resultSet.getFullReferenceWithPadding(), resultSet.getPaddedReferenceLoc(), DEBUG);
final TreeSet<VariantContext> allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypeList);
allVariantsWithinFullActiveRegion.addAll(activeAllelesToGenotype);
final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalGenotypingRegion, allVariantsWithinFullActiveRegion,false);
if ( trimmedActiveRegion == null ) {
// there were no variants found within the active region itself, so just return null
if ( DEBUG ) logger.info("No variation found within the active region, skipping the region :-)");
return null;
}
// trim down the reads and add them to the trimmed active region
final List<GATKSAMRecord> trimmedReads = new ArrayList<>(originalGenotypingRegion.getReads().size());
for( final GATKSAMRecord read : originalGenotypingRegion.getReads() ) {
final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read,
trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() );
if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 )
trimmedReads.add(clippedRead);
}
trimmedActiveRegion.clearReads();
trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads));
return trimmedActiveRegion;
}
/**
* Trims a assembly result set according to the active-region trimming.
*
* @param resultSet the original assembly result set.
* @param trimmedActiveRegion the trimmed active region to trim to.
* @return the assembly result set trimmed.
*/
private AssemblyResultSet trimAssemblyResultSet(final AssemblyResultSet resultSet, final ActiveRegion trimmedActiveRegion) {
if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes");
final List<Haplotype> haplotypeList = resultSet.getHaplotypeList();
// trim down the haplotypes
final Map<Haplotype,Haplotype> originalByTrimmedHaplotypes = new HashMap<>();
for ( final Haplotype h : haplotypeList ) {
final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
if ( trimmed != null ) {
if (originalByTrimmedHaplotypes.containsKey(trimmed)) {
if (trimmed.isReference()) {
originalByTrimmedHaplotypes.remove(trimmed);
originalByTrimmedHaplotypes.put(trimmed, h);
}
} else
originalByTrimmedHaplotypes.put(trimmed,h);
} else if (h.isReference())
throw new IllegalStateException("trimming eliminates the reference haplotype");
else if ( DEBUG ) {
logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() +
" because it starts with or ends with an insertion or deletion when trimmed to " +
trimmedActiveRegion.getExtendedLoc());
}
}
// create the final list of trimmed haplotypes
final List<Haplotype> trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet());
// resort the trimmed haplotypes.
Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator());
final Map<Haplotype,Haplotype> sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size());
for (final Haplotype trimmed : trimmedHaplotypes)
sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed));
if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " +
trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " +
haplotypeList.size() + " to only " + trimmedHaplotypes.size());
if ( DEBUG )
for ( final Haplotype remaining: trimmedHaplotypes )
logger.info("Remains: " + remaining + " cigar " + remaining.getCigar());
return resultSet.trimTo(trimmedActiveRegion,sortedOriginalByTrimmedHaplotypes);
}
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
// reduce // reduce

View File

@ -46,8 +46,6 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller; package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.sun.istack.internal.NotNull;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.haplotype.Haplotype;
@ -82,6 +80,7 @@ public class KmerSequence implements List<Kmer> {
* @param hap the haplotype to represent as a sequence of kmers. * @param hap the haplotype to represent as a sequence of kmers.
* @param kmerSize the kmer size. * @param kmerSize the kmer size.
*/ */
@SuppressWarnings("unused")
public KmerSequence(final Haplotype hap, final int kmerSize) { public KmerSequence(final Haplotype hap, final int kmerSize) {
this(hap.getBases(), kmerSize); this(hap.getBases(), kmerSize);
} }
@ -96,7 +95,6 @@ public class KmerSequence implements List<Kmer> {
this(sequence,0,Math.max(0,sequence.length - kmerSize + 1),kmerSize, sequence.length); this(sequence,0,Math.max(0,sequence.length - kmerSize + 1),kmerSize, sequence.length);
} }
/** /**
* Creates a kmer sequence out of a range of a byte array * Creates a kmer sequence out of a range of a byte array
* *
@ -186,7 +184,6 @@ public class KmerSequence implements List<Kmer> {
} }
@Override @Override
@NotNull
public Iterator<Kmer> iterator() { public Iterator<Kmer> iterator() {
return new Iterator<Kmer>() { return new Iterator<Kmer>() {
@ -209,16 +206,14 @@ public class KmerSequence implements List<Kmer> {
}; };
} }
@NotNull
@Override @Override
public Object[] toArray() { public Object[] toArray() {
return toArray(new Kmer[size()]); return toArray(new Kmer[size()]);
} }
@Override @Override
@NotNull
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public <T> T[] toArray(@NotNull final T[] a) { public <T> T[] toArray(final T[] a) {
if (a == null) { if (a == null) {
throw new IllegalArgumentException(); throw new IllegalArgumentException();
} else if (!a.getClass().getComponentType().isAssignableFrom(Kmer.class)) { } else if (!a.getClass().getComponentType().isAssignableFrom(Kmer.class)) {
@ -261,17 +256,17 @@ public class KmerSequence implements List<Kmer> {
} }
@Override @Override
public boolean addAll(final int index, @NotNull final Collection<? extends Kmer> c) { public boolean addAll(final int index, final Collection<? extends Kmer> c) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override @Override
public boolean removeAll(@NotNull final Collection<?> c) { public boolean removeAll(final Collection<?> c) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override @Override
public boolean retainAll(@NotNull final Collection<?> c) { public boolean retainAll(final Collection<?> c) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@ -352,19 +347,16 @@ public class KmerSequence implements List<Kmer> {
} }
@Override @Override
@NotNull
public ListIterator<Kmer> listIterator() { public ListIterator<Kmer> listIterator() {
return new MyListIterator(0); return new MyListIterator(0);
} }
@Override @Override
@NotNull
public ListIterator<Kmer> listIterator(final int index) { public ListIterator<Kmer> listIterator(final int index) {
return new MyListIterator(index); return new MyListIterator(index);
} }
@Override @Override
@NotNull
public List<Kmer> subList(final int fromIndex, final int toIndex) { public List<Kmer> subList(final int fromIndex, final int toIndex) {
return subsequence(fromIndex,toIndex); return subsequence(fromIndex,toIndex);
} }
@ -373,7 +365,6 @@ public class KmerSequence implements List<Kmer> {
* Returns the byte array representation of the kmer sequence. * Returns the byte array representation of the kmer sequence.
* @return never {@code null}. * @return never {@code null}.
*/ */
@NotNull
public byte[] getBytes() { public byte[] getBytes() {
if (start == 0 && rawLength == sequence.length) if (start == 0 && rawLength == sequence.length)
return sequence; return sequence;

View File

@ -46,7 +46,6 @@
package org.broadinstitute.sting.utils.collections; package org.broadinstitute.sting.utils.collections;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import com.sun.istack.internal.NotNull;
import java.lang.reflect.Array; import java.lang.reflect.Array;
import java.util.*; import java.util.*;
@ -264,7 +263,7 @@ public class CountSet implements Cloneable, Set<Integer> {
* @return <code>true</code> if <code>value</code> is inside the set, <code>false</code> otherwise. * @return <code>true</code> if <code>value</code> is inside the set, <code>false</code> otherwise.
*/ */
public boolean contains(final int value) { public boolean contains(final int value) {
return Arrays.binarySearch(elements,0,size,value) >= 0; return Arrays.binarySearch(elements, 0, size, value) >= 0;
} }
/** /**
@ -300,14 +299,13 @@ public class CountSet implements Cloneable, Set<Integer> {
return false; //To change body of implemented methods use File | Settings | File Templates. return false; //To change body of implemented methods use File | Settings | File Templates.
} }
@Override @Override
@NotNull
public Iterator<Integer> iterator() { public Iterator<Integer> iterator() {
return new MyIterator(); return new MyIterator();
} }
@Override @Override
@NotNull
public Object[] toArray() { public Object[] toArray() {
final Integer[] result = new Integer[size]; final Integer[] result = new Integer[size];
for (int i = 0; i < size; i++) for (int i = 0; i < size; i++)
@ -316,7 +314,6 @@ public class CountSet implements Cloneable, Set<Integer> {
} }
@Override @Override
@NotNull
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public <T> T[] toArray(final T[] a) { public <T> T[] toArray(final T[] a) {
if (a == null) if (a == null)
@ -328,7 +325,7 @@ public class CountSet implements Cloneable, Set<Integer> {
throw new ArrayStoreException(); throw new ArrayStoreException();
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
final T[] dest = (a.length < size) ? (T[]) (Object[]) Array.newInstance(componentClass, size) : a; final T[] dest = (a.length < size) ? (T[]) Array.newInstance(componentClass, size) : a;
for (int i = 0; i < size; i++) for (int i = 0; i < size; i++)
dest[i] = (T) (Integer) elements[i]; dest[i] = (T) (Integer) elements[i];
@ -339,7 +336,6 @@ public class CountSet implements Cloneable, Set<Integer> {
* Copies the content of the set into an integer array. The result can be freely modified by the invoker. * Copies the content of the set into an integer array. The result can be freely modified by the invoker.
* @return never <code>null</code> but a zero-length array if the set is empty. * @return never <code>null</code> but a zero-length array if the set is empty.
*/ */
@NotNull
public int[] toIntArray() { public int[] toIntArray() {
return Arrays.copyOfRange(elements,0,size); return Arrays.copyOfRange(elements,0,size);
} }
@ -470,7 +466,6 @@ public class CountSet implements Cloneable, Set<Integer> {
* Returns a copy of this set which can be changed without modifying the original one. * Returns a copy of this set which can be changed without modifying the original one.
* @return never {@code null}. * @return never {@code null}.
*/ */
@NotNull
@SuppressWarnings("all") @SuppressWarnings("all")
public CountSet clone() { public CountSet clone() {
return new CountSet(this); return new CountSet(this);

View File

@ -146,7 +146,7 @@ public class AssemblyResultSetUnitTest extends BaseTest
for (final Haplotype h : haplotypesAndResultSets.keySet()) for (final Haplotype h : haplotypesAndResultSets.keySet())
originalHaplotypesByTrimmed.put(h.trim(newRegion.getExtendedLoc()), h); originalHaplotypesByTrimmed.put(h.trim(newRegion.getExtendedLoc()), h);
final AssemblyResultSet trimmed = subject.trimTo(newRegion, originalHaplotypesByTrimmed); final AssemblyResultSet trimmed = subject.trimTo(newRegion);
Assert.assertFalse(subject.wasTrimmed()); Assert.assertFalse(subject.wasTrimmed());
Assert.assertTrue(trimmed.wasTrimmed()); Assert.assertTrue(trimmed.wasTrimmed());

View File

@ -66,11 +66,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data // this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "53aa13711a1ceec1453f21c705723f04"}); tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "53aa13711a1ceec1453f21c705723f04"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "1dfe1fc8079938adf1565450671094d4"}); tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7735be71f57e62929947c289cd48bb9c"});
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "d98245380500a6decfc26dcaadb2c4d2"}); tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "f0a761c310519133ed4f3ad465d986fc"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"}); tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "c4c28e74eda133f99e0864ad16c965c4"}); tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "aa7c0e3bec4ac307068f85f58d48625f"});
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "e4d36f165b2ddbb923d3c9a402e96f1b"}); tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "cf2167a563f86af4df26733e2aa6ced6"});
return tests.toArray(new Object[][]{}); return tests.toArray(new Object[][]{});
} }

View File

@ -45,8 +45,6 @@
*/ */
package org.broadinstitute.sting.utils; package org.broadinstitute.sting.utils;
import com.sun.istack.internal.NotNull;
import java.util.Random; import java.util.Random;
/** /**
@ -70,6 +68,7 @@ public class RandomDNA {
* described in {@link Random} documentation. * described in {@link Random} documentation.
* </p> * </p>
*/ */
@SuppressWarnings("unused")
public RandomDNA() { public RandomDNA() {
random = new Random(); random = new Random();
} }
@ -115,7 +114,6 @@ public class RandomDNA {
* @throws IllegalArgumentException if {@code size} is negative. * @throws IllegalArgumentException if {@code size} is negative.
* @return never {@code null}. * @return never {@code null}.
*/ */
@NotNull
public byte[] nextBases(final int size) { public byte[] nextBases(final int size) {
if (size < 0) throw new IllegalArgumentException("the size cannot be negative"); if (size < 0) throw new IllegalArgumentException("the size cannot be negative");
final byte[] result = new byte[size]; final byte[] result = new byte[size];

View File

@ -32,7 +32,9 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*; import java.util.*;
@ -407,30 +409,42 @@ public class ActiveRegion implements HasGenomeLocation {
} }
/** /**
* Trim this active to just the newExtent, producing a new active region without any reads that has only * Trim this active to just the span, producing a new active region without any reads that has only
* the extent of newExtend intersected with the current extent * the extent of newExtend intersected with the current extent
* @param newExtent the new extend of the active region we want * @param span the new extend of the active region we want
* @param newExtension the extension size we want for the newly trimmed active region * @param extension the extension size we want for the newly trimmed active region
* @return a non-null, empty active region * @return a non-null, empty active region
*/ */
public ActiveRegion trim(final GenomeLoc newExtent, final int newExtension) { public ActiveRegion trim(final GenomeLoc span, final int extension) {
if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); if ( span == null ) throw new IllegalArgumentException("Active region extent cannot be null");
if ( extension < 0) throw new IllegalArgumentException("the extension size must be 0 or greater");
final int extendStart = Math.max(1,span.getStart() - extension);
final int maxStop = genomeLocParser.getContigs().getSequence(span.getContigIndex()).getSequenceLength();
final int extendStop = Math.min(span.getStop() + extension, maxStop);
final GenomeLoc extendedSpan = genomeLocParser.createGenomeLoc(span.getContig(), extendStart, extendStop);
return trim(span, extendedSpan);
final GenomeLoc subLoc = getLocation().intersect(newExtent); //TODO - Inconsiste support of substates trimming. Check lack of consistency!!!!
final int subStart = subLoc.getStart() - getLocation().getStart(); // final GenomeLoc subLoc = getLocation().intersect(span);
final int subEnd = subStart + subLoc.size(); // final int subStart = subLoc.getStart() - getLocation().getStart();
final List<ActivityProfileState> subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd); // final int subEnd = subStart + subLoc.size();
return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, newExtension ); // final List<ActivityProfileState> subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd);
// return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, extension );
}
public ActiveRegion trim(final GenomeLoc span) {
return trim(span,span);
} }
/** /**
* Trim this active to no more than the newExtent, producing a new active region without any reads that * Trim this active to no more than the span, producing a new active region with properly trimmed reads that
* attempts to provide the best possible representation of this active region covering the newExtent. * attempts to provide the best possible representation of this active region covering the span.
* *
* The challenge here is that newExtent may (1) be larger than can be represented by this active region * The challenge here is that span may (1) be larger than can be represented by this active region
* + its original extension and (2) the extension must be symmetric on both sides. This algorithm * + its original extension and (2) the extension must be symmetric on both sides. This algorithm
* therefore determines how best to represent newExtent as a subset of the span of this * therefore determines how best to represent span as a subset of the span of this
* region with a padding value that captures as much of the newExtent as possible. * region with a padding value that captures as much of the span as possible.
* *
* For example, suppose this active region is * For example, suppose this active region is
* *
@ -442,18 +456,37 @@ public class ActiveRegion implements HasGenomeLocation {
* The overall constraint is that the active region can never exceed the original active region, and * The overall constraint is that the active region can never exceed the original active region, and
* the extension is chosen to maximize overlap with the desired region * the extension is chosen to maximize overlap with the desired region
* *
* @param newExtent the new extend of the active region we want * @param span the new extend of the active region we want
* @return a non-null, empty active region * @return a non-null, empty active region
*/ */
public ActiveRegion trim(final GenomeLoc newExtent) { public ActiveRegion trim(final GenomeLoc span, final GenomeLoc extendedSpan) {
if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); if ( span == null ) throw new IllegalArgumentException("Active region extent cannot be null");
if ( extendedSpan == null ) throw new IllegalArgumentException("Active region extended span cannot be null");
if ( ! extendedSpan.containsP(span))
throw new IllegalArgumentException("The requested extended must fully contain the requested span");
final GenomeLoc subActive = getLocation().intersect(newExtent); final GenomeLoc subActive = getLocation().intersect(span);
final int requiredOnRight = Math.max(newExtent.getStop() - subActive.getStop(), 0); final int requiredOnRight = Math.max(extendedSpan.getStop() - subActive.getStop(), 0);
final int requiredOnLeft = Math.max(subActive.getStart() - newExtent.getStart(), 0); final int requiredOnLeft = Math.max(subActive.getStart() - extendedSpan.getStart(), 0);
final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension()); final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension());
return new ActiveRegion( subActive, Collections.<ActivityProfileState>emptyList(), isActive, genomeLocParser, requiredExtension ); final ActiveRegion result = new ActiveRegion( subActive, Collections.<ActivityProfileState>emptyList(), isActive, genomeLocParser, requiredExtension );
final List<GATKSAMRecord> myReads = getReads();
final GenomeLoc resultExtendedLoc = result.getExtendedLoc();
final int resultExtendedLocStart = resultExtendedLoc.getStart();
final int resultExtendedLocStop = resultExtendedLoc.getStop();
final List<GATKSAMRecord> trimmedReads = new ArrayList<>(myReads.size());
for( final GATKSAMRecord read : myReads ) {
final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion(read,
resultExtendedLocStart, resultExtendedLocStop);
if( result.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 )
trimmedReads.add(clippedRead);
}
result.clearReads();
result.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads));
return result;
} }
public void setFinalized(final boolean value) { public void setFinalized(final boolean value) {
@ -463,4 +496,5 @@ public class ActiveRegion implements HasGenomeLocation {
public boolean isFinalized() { public boolean isFinalized() {
return hasBeenFinalized; return hasBeenFinalized;
} }
} }