samplePileups = pileup.getPileupsForSamples(samples.keySet());
@@ -99,7 +107,11 @@ final class IntervalStratification extends AbstractStratification {
sampleStratification.addLocus(context.getLocation(), samplePileup);
}
+ gcCount += (ref.getBase() == 'G' || ref.getBase() == 'C') ? 1 : 0;
+ }
+ public double gcContent() {
+ return (double) gcCount / interval.size();
}
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java
index a6cbc1da3..b088951e5 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java
@@ -77,7 +77,7 @@ final class ThresHolder {
* If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE
*/
@Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false)
- public int maximumCoverage = 700;
+ public int maximumCoverage = Integer.MAX_VALUE / 2;
/**
* If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be reported as BAD_MATE
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java
index 9296cc89b..63c35fd65 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java
@@ -47,29 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.missing;
/**
- * Short one line description of the walker.
- *
- *
- * [Long description of the walker]
- *
- *
- *
- * Input
- *
- * [Description of the Input]
- *
- *
- * Output
- *
- * [Description of the Output]
- *
- *
- * Examples
- *
- * java
- * -jar GenomeAnalysisTK.jar
- * -T [walker name]
- *
+ * Metrics class for the QualifyMissingInterval walker
*
* @author Mauricio Carneiro
* @since 5/1/13
@@ -81,6 +59,8 @@ final class Metrics {
private int reads;
private int refs;
+ public Metrics() {}
+
void reads(int reads) {this.reads = reads;}
void refs(int refs) {this.refs = refs;}
@@ -108,4 +88,13 @@ final class Metrics {
return this;
}
+
+ // Test-related constructor (for unit tests)
+ protected Metrics(double gccontent, double baseQual, double mapQual, int reads, int refs) {
+ this.gccontent = gccontent;
+ this.baseQual = baseQual;
+ this.mapQual = mapQual;
+ this.reads = reads;
+ this.refs = refs;
+ }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java
index eabcf20c1..54fc6e97e 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java
@@ -47,16 +47,15 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.missing;
import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Gather;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
-import org.broadinstitute.sting.gatk.walkers.By;
-import org.broadinstitute.sting.gatk.walkers.DataSource;
-import org.broadinstitute.sting.gatk.walkers.LocusWalker;
-import org.broadinstitute.sting.gatk.walkers.NanoSchedulable;
+import org.broadinstitute.sting.gatk.report.GATKReportGatherer;
+import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@@ -76,10 +75,12 @@ import java.util.List;
*
* - Average Base Quality
* - Average Mapping Quality
+ * - Average Depth
* - GC Content
- * - Position in the target
- * - Coding Sequence / Intron
- * - Length of the uncovered area
+ * - Position in the target (Integer.MIN_VALUE if no overlap)
+ * - Length of the overlapping target (-1 if no overlap)
+ * - Coding Sequence / Intron (optional)
+ * - Length of the uncovered interval
*
*
* Input
@@ -89,7 +90,7 @@ import java.util.List;
*
* Output
*
- * GC content calculations per interval.
+ * GC content, distance from the end of the target, coding sequence intersection, mapping and base quality averages and average depth per "missing" interval.
*
*
* Example
@@ -107,19 +108,82 @@ import java.util.List;
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@By(DataSource.REFERENCE)
+@PartitionBy(PartitionType.INTERVAL)
public final class QualifyMissingIntervals extends LocusWalker implements NanoSchedulable {
+ /**
+ * A single GATKReport table with the qualifications on why the intervals passed by the -L argument were missing.
+ */
+ @Gather(GATKReportGatherer.class)
@Output
protected PrintStream out;
+ /**
+ * List of targets used in the experiment. This file will be used to calculate the distance your missing
+ * intervals are to the targets (usually exons). Typically this is your hybrid selection targets file
+ * (e.g. Agilent exome target list)
+ */
@Argument(shortName = "targets", required = true)
public String targetsFile;
- @Argument(shortName = "cds", required = false)
- public String cdsFile = null;
+ /**
+ * List of baits to distinguish untargeted intervals from those that are targeted but not covered
+ */
+ @Argument(shortName = "baits", required = false)
+ public String baitsFile = null;
+
+ /**
+ * Intervals with GC content below this value or above (1 - this value) are considered too extreme to be
+ * sequenced. This is only applied if there was not enough data in the interval.
+ */
+ @Argument(doc = "upper and lower bound for an interval to be considered high/low GC content",
+ shortName = "gc", required = false)
+ public double gcThreshold = 0.3;
+
+ /**
+ * The coverage of a missing interval may determine whether or not an interval is sequenceable. A low coverage will
+ * trigger gc content, mapping, base qualities and other checks to figure out why this interval was deemed
+ * unsequenceable.
+ */
+ @Argument(doc = "minimum coverage to be considered sequenceable",
+ shortName = "cov", required = false)
+ public int coverageThreshold = 20;
+
+ /**
+ * An average mapping quality above this value will determine the interval to be mappable.
+ */
+ @Argument(doc = "minimum mapping quality for it to be considered usable",
+ shortName = "mmq", required = false)
+ public byte mappingThreshold = 20;
+
+ /**
+ * An average base quality above this value will rule out the possibility of context specific problems with the
+ * sequencer.
+ */
+ @Argument(doc = "minimum base quality for it to be considered usable",
+ shortName = "mbq", required = false)
+ public byte qualThreshold = 20;
+
+ /**
+ * Intervals that are too small bias the analysis. For example, an interval of size 1 will have GC content
+ * 1 or 0. To avoid misinterpreting small intervals, all intervals below this threshold will be ignored in the
+ * interpretation.
+ */
+ @Argument(doc = "minimum interval length to be considered",
+ shortName = "size", required = false)
+ public byte intervalSizeThreshold = 10;
+
+ enum Interpretation {
+ UNKNOWN,
+ UNMAPPABLE,
+ UNSEQUENCEABLE,
+ GCCONTENT,
+ NO_DATA,
+ SMALL_INTERVAL
+ }
GATKReport simpleReport;
- GenomeLocSortedSet target;
- GenomeLocSortedSet cds;
+ GenomeLocSortedSet targets;
+ GenomeLocSortedSet baits;
public boolean isReduceByInterval() {
return true;
@@ -127,13 +191,13 @@ public final class QualifyMissingIntervals extends LocusWalker
public void initialize() {
- // if cds file is not provided, just use the targets file (no harm done)
+ // if the baits file is not provided, just use the targets file (no harm done)
- if (cdsFile == null)
- cdsFile = targetsFile;
+ if (baitsFile == null)
+ baitsFile = targetsFile;
- simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "DP", "TP", "CD", "LN");
+ simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "INTERVAL", "GC", "BQ", "MQ", "DP", "POS_IN_TARGET", "TARGET_SIZE", "BAITED", "MISSING_SIZE", "INTERPRETATION");
final GenomeLocParser parser = getToolkit().getGenomeLocParser();
- target = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, targetsFile));
- cds = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, cdsFile));
+ targets = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, targetsFile));
+ baits = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, baitsFile));
}
public Metrics reduceInit() {
@@ -174,29 +238,90 @@ public final class QualifyMissingIntervals extends LocusWalker
public void onTraversalDone(List> results) {
for (Pair r : results) {
- GenomeLoc interval = r.getFirst();
- Metrics metrics = r.getSecond();
+ final GenomeLoc interval = r.getFirst();
+ final Metrics metrics = r.getSecond();
+ final List overlappingIntervals = targets.getOverlapping(interval);
+
simpleReport.addRow(
interval.toString(),
metrics.gccontent(),
metrics.baseQual(),
metrics.mapQual(),
metrics.depth(),
- getPositionInTarget(interval),
- cds.overlaps(interval),
- interval.size()
+ getPositionInTarget(interval, overlappingIntervals),
+ getTargetSize(overlappingIntervals),
+ baits.overlaps(interval),
+ interval.size(),
+ interpret(metrics, interval)
);
}
simpleReport.print(out);
out.close();
}
- private int getPositionInTarget(GenomeLoc interval) {
- final List hits = target.getOverlapping(interval);
- int result = 0;
- for (GenomeLoc hit : hits) {
- result = interval.getStart() - hit.getStart(); // if there are multiple hits, we'll get the last one.
+ protected static int getPositionInTarget(final GenomeLoc interval, final List targets) {
+ if (targets.size() > 0) {
+ final GenomeLoc target = targets.get(0);
+
+ // interval is larger on both ends than the target -- return the maximum distance to either side as a negative number. (min of 2 negative numbers)
+ if (interval.getStart() < target.getStart() && interval.getStop() > target.getStop())
+ return Math.min(target.getStart() - interval.getStart(), target.getStop() - interval.getStop());
+
+ // interval is a left overlap -- return a negative number representing the distance between the two starts
+ else if (interval.getStart() < target.getStart())
+ return interval.getStart() - target.getStart();
+
+ // interval is a right overlap -- return a negative number representing the distance between the two stops
+ else if (interval.getStop() > target.getStop())
+ return target.getStop() - interval.getStop();
+
+ // interval is fully contained -- return the smallest distance to the edge of the target (left or right) as a positive number
+ return Math.min(interval.getStart() - target.getStart(), target.getStop() - interval.getStop());
}
- return result;
+ // if there is no overlapping interval, return int min value.
+ return Integer.MIN_VALUE;
}
+
+ private int getTargetSize(final List overlappingIntervals) {
+ return overlappingIntervals.size() > 0 ? overlappingIntervals.get(0).size() : -1;
+ }
+
+ String interpret(final Metrics metrics, final GenomeLoc interval) {
+ if (interval.size() < intervalSizeThreshold) {
+ return Interpretation.SMALL_INTERVAL.toString();
+ }
+ else if (metrics.depth() == 0.0) {
+ return Interpretation.NO_DATA.toString();
+ }
+ return trim(checkMappability(metrics) + checkGCContent(metrics) + checkContext(metrics));
+ }
+
+ String checkMappability(Metrics metrics) {
+ return metrics.depth() >= coverageThreshold && metrics.mapQual() < mappingThreshold ?
+ Interpretation.UNMAPPABLE + ", " : "";
+ }
+
+ String checkGCContent(Metrics metrics) {
+ return metrics.depth() < coverageThreshold && (metrics.gccontent() < gcThreshold || metrics.gccontent() > 1.0-gcThreshold) ?
+ Interpretation.GCCONTENT + ", " : "";
+ }
+
+ String checkContext(Metrics metrics) {
+ return metrics.depth() < coverageThreshold && metrics.baseQual() < qualThreshold ?
+ Interpretation.UNSEQUENCEABLE + ", " : "";
+ }
+
+ String trim (String s) {
+ if (s.isEmpty())
+ return Interpretation.UNKNOWN.toString();
+
+ s = s.trim();
+ if (s.endsWith(","))
+ s = s.substring(0, s.length() - 1);
+ return s;
+ }
+
+
+
+
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java
index ddf47805f..6f16a704f 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java
@@ -76,16 +76,13 @@ public class ConsensusAlleleCounter {
private final int minIndelCountForGenotyping;
private final boolean doMultiAllelicCalls;
private final double minFractionInOneSample;
- private final GenomeLocParser locParser;
- public ConsensusAlleleCounter(final GenomeLocParser locParser,
- final boolean doMultiAllelicCalls,
+ public ConsensusAlleleCounter(final boolean doMultiAllelicCalls,
final int minIndelCountForGenotyping,
final double minFractionInOneSample) {
this.minIndelCountForGenotyping = minIndelCountForGenotyping;
this.doMultiAllelicCalls = doMultiAllelicCalls;
this.minFractionInOneSample = minFractionInOneSample;
- this.locParser = locParser;
}
/**
@@ -289,7 +286,7 @@ public class ConsensusAlleleCounter {
if (vcs.isEmpty())
return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion
- final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
+ final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false);
return mergedVC.getAlleles();
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
index 9c4694955..3cee8f2d8 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
@@ -108,7 +108,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
final List allAllelesToUse){
- List alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true);
+ List alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, UAC,true);
if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE)
alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE);
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
index 0f3f7739d..4a3231b3e 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@@ -89,9 +89,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
protected static List computeConsensusAlleles(final ReferenceContext ref,
final Map contexts,
final AlignmentContextUtils.ReadOrientation contextType,
- final GenomeLocParser locParser,
final UnifiedArgumentCollection UAC) {
- ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
+ ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
return counter.computeConsensusAlleles(ref, contexts, contextType);
}
@@ -113,7 +112,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// starting a new site: clear allele list
haplotypeMap.clear();
perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods
- alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels);
+ alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, UAC, ignoreSNPAllelesWhenGenotypingIndels);
if (alleleList.isEmpty())
return null;
}
@@ -212,7 +211,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
final ReferenceContext ref,
final Map contexts,
final AlignmentContextUtils.ReadOrientation contextType,
- final GenomeLocParser locParser,
final UnifiedArgumentCollection UAC,
final boolean ignoreSNPAllelesWhenGenotypingIndels) {
@@ -244,7 +242,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
} else {
- alleles = computeConsensusAlleles(ref, contexts, contextType, locParser, UAC);
+ alleles = computeConsensusAlleles(ref, contexts, contextType, UAC);
}
return alleles;
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index 4fae3d6e3..04c5587c3 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -95,7 +95,11 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
public int MIN_BASE_QUALTY_SCORE = 17;
- @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
+ /**
+ * If the fraction of reads with deletions spanning a locus is greater than this value, the site will not be considered callable and will be skipped.
+ * To disable the use of this parameter, set its value to >1.
+ */
+ @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
index ec31e1f2f..5c6e9dc01 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@@ -374,7 +374,7 @@ public class UnifiedGenotyperEngine {
final VariantContext vc,
final GenotypeLikelihoodsCalculationModel.Model model,
final Map perReadAlleleLikelihoodMap) {
- return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false,perReadAlleleLikelihoodMap);
+ return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap);
}
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java
index 063e3b218..f1db5bcd7 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java
@@ -86,6 +86,7 @@ class ActiveRegionTrimmer {
if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping);
if ( parser == null ) throw new IllegalArgumentException("parser cannot be null");
+ logger.debug("Trimmer created with parameters " + logTrimming + " " + snpPadding + " " + nonSnpPadding + " " + maxDistanceInExtensionForGenotyping);
this.logTrimming = logTrimming;
this.snpPadding = snpPadding;
this.nonSnpPadding = nonSnpPadding;
@@ -101,28 +102,35 @@ class ActiveRegionTrimmer {
*
* @param region our full active region
* @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position
+ * @param emitReferenceConfidence are we going to estimate the reference confidence with this active region?
* @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully
*/
- public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion) {
+ public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion, final boolean emitReferenceConfidence) {
+
if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region
return null;
- final List withinActiveRegion = new LinkedList();
- int pad = snpPadding;
+ final List withinActiveRegion = new LinkedList<>();
+ boolean foundNonSnp = false;
GenomeLoc trimLoc = null;
for ( final VariantContext vc : allVariantsWithinExtendedRegion ) {
final GenomeLoc vcLoc = parser.createGenomeLoc(vc);
if ( region.getLocation().overlapsP(vcLoc) ) {
if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding
- pad = nonSnpPadding;
+ foundNonSnp = true;
trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc);
withinActiveRegion.add(vc);
}
}
+ final int pad = ( emitReferenceConfidence || foundNonSnp ? nonSnpPadding : snpPadding );
// we don't actually have anything in the region after removing variants that don't overlap the region's full location
if ( trimLoc == null ) return null;
+// final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping);
+ // Try to have one kmer before and after any event.
+
+ final GenomeLoc regionLoc = region.getLocation();
final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping);
final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad);
final GenomeLoc finalSpan = maxSpan.intersect(idealSpan);
@@ -130,6 +138,7 @@ class ActiveRegionTrimmer {
final ActiveRegion trimmedRegion = region.trim(finalSpan);
if ( logTrimming ) {
logger.info("events : " + withinActiveRegion);
+ logger.info("region : " + regionLoc);
logger.info("trimLoc : " + trimLoc);
logger.info("pad : " + pad);
logger.info("idealSpan : " + idealSpan);
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java
index f07dbb392..658ffc10e 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java
@@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph;
/**
* Result of assembling, with the resulting graph and status
@@ -57,6 +58,7 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
*/
public class AssemblyResult {
private final Status status;
+ private ReadThreadingGraph threadingGraph;
private final SeqGraph graph;
/**
@@ -72,9 +74,25 @@ public class AssemblyResult {
this.graph = graph;
}
+ /**
+ * Sets the threading-graph associated with this assembly-result.
+ */
+ public void setThreadingGraph(final ReadThreadingGraph threadingGraph) {
+ this.threadingGraph = threadingGraph;
+ }
+
+ public ReadThreadingGraph getThreadingGraph() {
+ return threadingGraph;
+ }
+
public Status getStatus() { return status; }
public SeqGraph getGraph() { return graph; }
+ public int getKmerSize() {
+ return graph.getKmerSize();
+ }
+
+
/**
* Status of the assembly result
*/
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java
new file mode 100644
index 000000000..091c09e8d
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java
@@ -0,0 +1,466 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
+import org.broadinstitute.sting.utils.collections.CountSet;
+import org.broadinstitute.sting.utils.collections.CountSet;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
+
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.*;
+
+/**
+ * Collection of read assembly using several kmerSizes.
+ *
+ *
+ * There could be a different assembly per each kmerSize. In turn, haplotypes are result of one of those
+ * assemblies.
+ *
+ *
+ *
+ * Where there is more than one possible kmerSize that generates a haplotype we consider the smaller one.
+ *
+ *
+ * @author Valentin Ruano-Rubio <valentin@broadinstitute.com>
+ */
+public class AssemblyResultSet {
+
+ private final Map assemblyResultByKmerSize;
+ private final Set haplotypes;
+ private final Map assemblyResultByHaplotype;
+ private ActiveRegion regionForGenotyping;
+ private byte[] fullReferenceWithPadding;
+ private GenomeLoc paddedReferenceLoc;
+ private boolean variationPresent;
+ private Haplotype refHaplotype;
+ private boolean wasTrimmed = false;
+ private final CountSet kmerSizes;
+
+ /**
+ * Constructs a new empty assembly result set.
+ */
+ public AssemblyResultSet() {
+ assemblyResultByKmerSize = new LinkedHashMap<>(4);
+ haplotypes = new LinkedHashSet<>(10);
+ assemblyResultByHaplotype = new LinkedHashMap<>(10);
+ kmerSizes = new CountSet(4);
+ }
+
+ /**
+ * Trims an assembly result set down based on a new set of trimmed haplotypes.
+ *
+ * @param originalByTrimmedHaplotypes map from trimmed to original haplotypes.
+ * @param trimmedActiveRegion the trimmed down active region.
+ *
+ * @throws NullPointerException if any argument is {@code null} or
+ * if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys.
+ * @throws IllegalArgumentException if there is no reference haplotype amongst the trimmed ones.
+ *
+ *
+ * @return never {@code null}, a new trimmed assembly result set.
+ */
+ public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion,
+ final Map originalByTrimmedHaplotypes) {
+ if (refHaplotype == null) throw new IllegalStateException();
+ if (trimmedActiveRegion == null) throw new NullPointerException();
+ final AssemblyResultSet result = new AssemblyResultSet();
+
+ for (final Haplotype trimmed : originalByTrimmedHaplotypes.keySet()) {
+ final Haplotype original = originalByTrimmedHaplotypes.get(trimmed);
+ if (original == null)
+ throw new NullPointerException("all trimmed haplotypes must have an original one");
+ final AssemblyResult as = assemblyResultByHaplotype.get(original);
+ if (as == null) result.add(trimmed); else result.add(trimmed, as);
+ }
+
+ result.setRegionForGenotyping(trimmedActiveRegion);
+ result.setFullReferenceWithPadding(this.fullReferenceWithPadding);
+ result.setPaddedReferenceLoc(this.paddedReferenceLoc);
+ if (result.refHaplotype == null)
+ throw new IllegalStateException("missing reference haplotype in the trimmed set");
+ result.wasTrimmed = true;
+ return result;
+ }
+
+ /**
+ * Query the reference haplotype in the result set.
+ * @return {@code null} if no reference haplotype has been added yet, otherwise the reference haplotype.
+ */
+ public Haplotype getReferenceHaplotype() {
+ return refHaplotype;
+ }
+
+ /**
+ * Checks whether there is any variation present in the assembly result set.
+ *
+ *
+ * This requires both that a non-reference haplotype was added and that there is more than one haplotype overall.
+ *
+ *
+ * @return {@code true} if there is variation present, {@code false} otherwise.
+ */
+ public boolean isVariationPresent() {
+ return variationPresent && haplotypes.size() > 1;
+ }
+
+ /**
+ * Dumps debugging information into a print-writer.
+ *
+ * @param pw where to dump the information.
+ *
+ * @throws NullPointerException if {@code pw} is {@code null}.
+ */
+ public void debugDump(final PrintWriter pw) {
+ if (getHaplotypeList().size() == 0) {
+ return;
+ }
+ pw.println("Active Region " + this.regionForGenotyping.getLocation());
+ pw.println("Extended Act Region " + this.getRegionForGenotyping().getExtendedLoc());
+ pw.println("Ref haplotype coords " + getHaplotypeList().get(0).getGenomeLocation());
+ pw.println("Haplotype count " + haplotypes.size());
+ final Map kmerSizeToCount = new HashMap<>();
+
+ for (final Map.Entry e : assemblyResultByHaplotype.entrySet()) {
+ final AssemblyResult as = e.getValue();
+ final int kmerSize = as.getGraph().getKmerSize();
+ if (kmerSizeToCount.containsKey(kmerSize)) {
+ kmerSizeToCount.put(kmerSize,kmerSizeToCount.get(kmerSize) + 1);
+ } else {
+ kmerSizeToCount.put(kmerSize,1);
+ }
+ }
+ pw.println("Kmer sizes count " + kmerSizeToCount.entrySet().size() );
+ Integer[] kmerSizes = new Integer[kmerSizeToCount.size()];
+ kmerSizes = kmerSizeToCount.keySet().toArray(kmerSizes);
+ Arrays.sort(kmerSizes);
+ pw.println("Kmer sizes values " + Arrays.toString(kmerSizes));
+ for (int size : kmerSizes) {
+ pw.println("Kmer size " + size + " count " + kmerSizeToCount.get(size));
+ }
+ }
+
+ /**
+ * Adds a haplotype to the result set without indicating a generating assembly result.
+ *
+ *
+ * It is possible to call this method with the same haplotype several times. In that case, the second and further
+ * calls won't have any effect (thus returning {@code false}).
+ *
+ *
+ * @param h the haplotype to add to the assembly result set.
+ *
+ * @throws NullPointerException if {@code h} is {@code null}
+ * @throws IllegalArgumentException if {@code h} does not have a genome location.
+ *
+ * @return {@code true} if the assembly result set has been modified as a result of this call.
+ */
+ public boolean add(final Haplotype h) {
+ if (h == null) throw new NullPointerException("input haplotype cannot be null");
+ if (h.getGenomeLocation() == null)
+ throw new IllegalArgumentException("the haplotype provided must have a genomic location");
+ if (haplotypes.contains(h))
+ return false;
+ haplotypes.add(h);
+ updateReferenceHaplotype(h);
+ return true;
+ }
+
+ /**
+ * Adds simultaneously a haplotype and the generating assembly-result.
+ *
+ *
+ * Haplotypes and their assembly-result can be added multiple times although just the first call will have
+ * any effect (return value is {@code true}).
+ *
+ *
+ *
+ * @param h haplotype to add.
+ * @param ar assembly-result that is assumed to have given rise to that haplotype.
+ *
+ * @throws NullPointerException if {@code h} or {@code ar} is {@code null}.
+ * @throws IllegalArgumentException if {@code h} has not defined genome location.
+ *
+ * @return {@code true} iff this call changes the assembly result set.
+ */
+ public boolean add(final Haplotype h, final AssemblyResult ar) {
+ if (h == null) throw new NullPointerException("input haplotype cannot be null");
+ if (ar == null) throw new NullPointerException("input assembly-result cannot be null");
+ if (h.getGenomeLocation() == null)
+ throw new IllegalArgumentException("the haplotype provided must have a genomic location");
+
+ final boolean assemblyResultAdditionReturn = add(ar);
+
+ if (haplotypes.contains(h)) {
+ final AssemblyResult previousAr = assemblyResultByHaplotype.get(h);
+ if (previousAr == null) {
+ assemblyResultByHaplotype.put(h, ar);
+ return true;
+ } else if (!previousAr.equals(ar))
+ throw new IllegalStateException("there is already a different assembly result for the input haplotype");
+ else
+ return assemblyResultAdditionReturn;
+ } else {
+ haplotypes.add(h);
+ assemblyResultByHaplotype.put(h,ar);
+ updateReferenceHaplotype(h);
+ if (h.isNonReference()) variationPresent = true;
+ return true;
+ }
+ }
+
+ /**
+ * Add an assembly-result object.
+ *
+ * @param ar the assembly result to add.
+ *
+ * @throws NullPointerException if {@code ar} is {@code null}.
+ * @throws IllegalStateException if there is an assembly result with the same kmerSize.
+ * @return {@code true} iff this addition changed the assembly result set.
+ */
+ public boolean add(final AssemblyResult ar) {
+ if (ar == null)
+ throw new NullPointerException();
+ final int kmerSize = ar.getKmerSize();
+ if (assemblyResultByKmerSize.containsKey(kmerSize)) {
+ if (!assemblyResultByKmerSize.get(kmerSize).equals(ar))
+ throw new IllegalStateException("a different assembly result with the same kmerSize was already added");
+ return false;
+ } else {
+ assemblyResultByKmerSize.put(kmerSize, ar);
+ kmerSizes.add(kmerSize);
+ return true;
+ }
+ }
+
+ /**
+ * Returns the current region for genotyping.
+ *
+ * @return might be {@code null}.
+ */
+ public ActiveRegion getRegionForGenotyping() {
+ return regionForGenotyping;
+ }
+
+ /**
+ * Sets the region for genotyping.
+ *
+ * @param regionForGenotyping the new value.
+ */
+ public void setRegionForGenotyping(final ActiveRegion regionForGenotyping) {
+ this.regionForGenotyping = regionForGenotyping;
+ }
+
+ /**
+ * Returns the current full reference with padding.
+ *
+ * @return might be {@code null}.
+ */
+ public byte[] getFullReferenceWithPadding() {
+ return fullReferenceWithPadding;
+ }
+
+ /**
+ * Sets the full reference with padding base sequence.
+ *
+ * @param fullReferenceWithPadding the new value.
+ */
+ public void setFullReferenceWithPadding(final byte[] fullReferenceWithPadding) {
+ this.fullReferenceWithPadding = fullReferenceWithPadding;
+ }
+
+ /**
+ * Returns the padded reference location.
+ *
+ * @return might be {@code null}
+ */
+ public GenomeLoc getPaddedReferenceLoc() {
+ return paddedReferenceLoc;
+ }
+
+ /**
+ * Changes the padded reference location.
+ * @param paddedReferenceLoc the new value.
+ */
+ public void setPaddedReferenceLoc(final GenomeLoc paddedReferenceLoc) {
+ this.paddedReferenceLoc = paddedReferenceLoc;
+ }
+
+ /**
+ * Returns the number of haplotypes in the assembly result set.
+ * @return {@code 0} or greater.
+ */
+ public int getHaplotypeCount() {
+ return haplotypes.size();
+ }
+
+ /**
+ * Returns the haplotypes as a list.
+ *
+ *
+ * The result is unmodifiable.
+ *
+ *
+ * @return never {@code null}, but perhaps an empty list if no haplotype was generated during assembly.
+ */
+ public List getHaplotypeList() {
+ return Arrays.asList(haplotypes.toArray(new Haplotype[haplotypes.size()]));
+ }
+
+ /**
+ * Returns the maximum kmerSize available.
+ *
+ * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize.
+ *
+ * @return greater than 0.
+ */
+ public int getMaximumKmerSize() {
+ if (kmerSizes.size() == 0)
+ throw new IllegalStateException("there is yet no kmerSize in this assembly result set");
+ return kmerSizes.max();
+ }
+
+ /**
+ * Indicates whether there are more than one kmerSize in the set.
+ *
+ * @return {@code true} iff there is more than one kmerSize assembly in the set.
+ */
+ public boolean hasMultipleKmerSizes() {
+ return kmerSizes.size() > 1;
+ }
+
+ /**
+ * Returns the minimum kmerSize available.
+ *
+ * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize.
+ *
+ * @return greater than 0.
+ */
+ public int getMinimumKmerSize() {
+ if (kmerSizes.size() == 0)
+ throw new IllegalStateException("there is yet no kmerSize in this assembly result set");
+ return kmerSizes.min();
+ }
+
+ /**
+ * Returns a read-threading graph in the assembly set that has a particular kmerSize.
+ *
+ * @param kmerSize the requested kmerSize.
+ *
+ * @return {@code null} if there is no read-threading-graph amongst assembly results with that kmerSize.
+ */
+ public ReadThreadingGraph getUniqueReadThreadingGraph(final int kmerSize) {
+ final AssemblyResult assemblyResult = assemblyResultByKmerSize.get(kmerSize);
+ if (assemblyResult == null) return null;
+ return assemblyResult.getThreadingGraph();
+ }
+
+ /**
+ * Checks whether this assembly result set was trimmed.
+ *
+ * @return {@code true} iff this assembly result set was trimmed.
+ */
+ public boolean wasTrimmed() {
+ return wasTrimmed;
+ }
+
+ /**
+ * Marks the assembly as not having variation even if it has more than one haplotype.
+ */
+ public void resetVariationPresent() {
+ variationPresent = false;
+ }
+
+ /**
+ * Dumps debugging information into a logger.
+ *
+ * @param logger where to dump the information.
+ *
+ * @throws NullPointerException if {@code logger} is {@code null}.
+ */
+ public void debugDump(final Logger logger) {
+ final StringWriter sw = new StringWriter();
+ final PrintWriter pw = new PrintWriter(sw);
+ debugDump(pw);
+ final String str = sw.toString();
+ final String[] lines = str.split("\n");
+ for (final String line : lines) {
+ if (line.isEmpty()) {
+ continue;
+ }
+ logger.debug(line);
+ }
+ }
+
+ /**
+ * Checks whether a new haplotype that has already been added to the {@link #haplotypes} collection is the
+ * reference haplotype and updates {@link #refHaplotype} accordingly.
+ *
+ *
+ * This method assumes that the calling code has verified that the haplotype was not already in {@link #haplotypes}
+ * I.e. that it is really a new one. Otherwise it will result in an exception if it happens to be a reference
+ * haplotype and this has already been set. This is the case even if the new haplotype and the current reference
+ * are equal.
+ *
+ *
+ * @param newHaplotype the new haplotype.
+ * @throws NullPointerException if {@code newHaplotype} is {@code null}.
+ * @throws IllegalStateException if there is already a reference haplotype.
+ */
+ private void updateReferenceHaplotype(final Haplotype newHaplotype) {
+ if (!newHaplotype.isReference()) return;
+ if (refHaplotype == null)
+ refHaplotype = newHaplotype;
+ else // assumes that we have checked whether the haplotype is already in the collection, so there is no need to check equality.
+ throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different");
+ }
+
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java
new file mode 100644
index 000000000..be7305085
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java
@@ -0,0 +1,169 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
+
+import java.util.*;
+
+/**
+ * Represents an event block in the graph.
+ *
+ *
+ * Event block is defined as the non-trivial section of the haplotype-graph between two vertices along the
+ * reference route, that has at least one alternative route between those two vertices.
+ *
+ *
+ * @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
+ */
+public class EventBlock {
+
+ private final HaplotypeGraph graph;
+
+ private final MultiDeBruijnVertex source;
+
+ private final int sourcePosition;
+
+ private final MultiDeBruijnVertex sink;
+
+ private final int sinkPosition;
+
+ private Set> routesAcross;
+
+
+ /**
+ * Constructs an event block given the base haplotype graph and the source and sink vertices (both included in the block).
+ * @param graph the base haplotype graph.
+ * @param source the starting vertex.
+ * @param sink the ending vertex.
+ *
+ * @throws NullPointerException if any of the input is {@code null}.
+ * @throws IllegalArgumentException if {@code source} or {@code sink} are not part of the graph's reference route,
+ * such a route does not exist, any of the vertices is not part of such a route, or they are out of order.
+ */
+ public EventBlock(final HaplotypeGraph graph, final MultiDeBruijnVertex source, final MultiDeBruijnVertex sink) {
+ if (graph == null) throw new NullPointerException("the graph cannot be null");
+ if (source == null) throw new NullPointerException("the source vertex is null");
+ if (sink == null) throw new NullPointerException("the sink node is null");
+ this.graph = graph;
+ this.source = source;
+ this.sink = sink;
+ final HaplotypeRoute route = graph.getReferenceRoute();
+ if (route == null)
+ throw new IllegalArgumentException("there is no reference route in the graph");
+ this.sourcePosition = route.getVertexPosition(source);
+ this.sinkPosition = route.getVertexPosition(sink);
+ if (sourcePosition == -1)
+ throw new IllegalArgumentException("the source vertex does not belong to the reference route");
+ if (sinkPosition == -1)
+ throw new IllegalArgumentException("the sink vertex does not belong to the reference route");
+ if (sourcePosition > sinkPosition)
+ throw new IllegalArgumentException("source and sink vertices are out of order in reference route");
+ }
+
+ /**
+ * Returns a reference to the event block graph.
+ *
+ * @return never {@code null}.
+ */
+ public HaplotypeGraph getGraph() {
+ return graph;
+ }
+
+ /**
+ * Returns a reference to the block starting vertex.
+ *
+ * @return never {@code null}.
+ */
+ public MultiDeBruijnVertex getSource() {
+ return source;
+ }
+
+ /**
+ * Returns a reference to the end block vertex.
+ *
+ * @return never {@code null}.
+ */
+ public MultiDeBruijnVertex getSink() {
+ return sink;
+ }
+
+ /**
+ * Returns all possible routes between the event block start and end vertices.
+ * @return never {@code null}; an unmodifiable set of routes.
+ */
+ public Set> getRoutesAcross() {
+ // caching: return the memoized result if already computed.
+ if (routesAcross != null) return routesAcross;
+
+ final Set> result = new HashSet<>(10); // 10 is rather generous.
+
+ // breadth-first iterative search for all paths.
+ final Queue> queue = new LinkedList<>();
+
+ queue.add(new Route<>(source, graph)); // the seed is the empty route at the start vertex.
+
+ final HaplotypeRoute referenceRoute = graph.getReferenceRoute();
+
+ while (!queue.isEmpty()) {
+ final Route route = queue.remove();
+ final MultiDeBruijnVertex routeEndVertex = route.getLastVertex();
+
+ if (routeEndVertex == sink) // bingo!!!
+ result.add(route);
+ else { // only queue promising extension of this route.
+ final int routeEndPosition = referenceRoute.getVertexPosition(routeEndVertex);
+ if (routeEndPosition == -1 || (routeEndPosition >= sourcePosition && routeEndPosition < sinkPosition))
+ for (final MultiSampleEdge e : graph.outgoingEdgesOf(routeEndVertex))
+ queue.add(new Route<>(route, e));
+ }
+ }
+ return routesAcross = Collections.unmodifiableSet(result);
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java
new file mode 100644
index 000000000..ca4985cef
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java
@@ -0,0 +1,287 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+import com.google.java.contract.Requires;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
+import org.broadinstitute.sting.utils.collections.CountSet;
+import org.broadinstitute.sting.utils.collections.CountSet;
+import org.broadinstitute.sting.utils.collections.Pair;
+
+import java.util.*;
+
+/**
+ * Encapsulates the graph traversals needed to find event-blocks.
+ *
+ * @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
+ */
+public class EventBlockFinder {
+
+ private final HaplotypeGraph graph;
+
+ private final Map,EventBlock> eventBlockCache;
+
+ /**
+ * Constructs a new engine.
+ *
+ * @param graph the base haplotype graph to iterate over.
+ */
+ public EventBlockFinder(final HaplotypeGraph graph) {
+ if (graph == null) throw new NullPointerException();
+ this.graph = graph;
+ eventBlockCache = new HashMap<>(20);
+ }
+
+ /**
+ * Create a new traversal object based on a read anchoring.
+ * @param anchoring the read anchoring that delimits the traversal; cannot be {@code null}.
+ * @return never {@code null}.
+ */
+ public Traversal traversal(final ReadAnchoring anchoring) {
+ if (anchoring == null) throw new NullPointerException();
+ return new Traversal(anchoring);
+ }
+
+
+ public class Traversal implements Iterable {
+
+ private final ReadAnchoring anchoring;
+
+ private EventBlock lastEventBlock;
+
+
+ private Traversal(final ReadAnchoring anchoring) {
+ this.anchoring = anchoring;
+ lastEventBlock = findLastEventBlock(anchoring);
+ }
+
+ @Override
+ public java.util.Iterator iterator() {
+ return lastEventBlock == null ? Collections.EMPTY_SET.iterator() : new Iterator();
+ }
+
+ private class Iterator implements java.util.Iterator {
+
+ private MultiDeBruijnVertex currentVertex;
+
+ private Iterator() {
+ currentVertex = anchoring.leftAnchorVertex;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return currentVertex != null;
+ }
+
+ @Override
+ public EventBlock next() {
+ final EventBlock result;
+ if (currentVertex == null)
+ throw new NoSuchElementException("going beyond last event block");
+ else if (currentVertex == lastEventBlock.getSource()) {
+ result = lastEventBlock;
+ currentVertex = null;
+ } else {
+ final EventBlock candidate = findEventBlock(anchoring,false,currentVertex,lastEventBlock.getSource());
+ if (candidate == null) {
+ result = findEventBlock(anchoring,false,currentVertex,anchoring.rightAnchorVertex);
+ currentVertex = null;
+ } else {
+ result = candidate;
+ currentVertex = candidate.getSink();
+ }
+ }
+ return result;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+ }
+
+ /**
+ * Finds the last event block.
+ *
+ * It searches forward from the anchoring's left anchor vertex to its right anchor vertex.
+ *
+ *
+ * @param anchoring target read anchoring information.
+ * @return {@code null} if there is no event block between the anchoring's left and right anchor vertices.
+ */
+ private EventBlock findLastEventBlock(
+ final ReadAnchoring anchoring) {
+ return findEventBlock(anchoring,true,anchoring.leftAnchorVertex,anchoring.rightAnchorVertex);
+ }
+
+ /**
+ * Finds an event block forward or backwards along the reference route.
+ * @param anchoring the read anchoring information.
+ * @param backwards true if the block should be constructed from right to left.
+ * @param leftVertex the left vertex
+ * @param rightVertex the right vertex
+ * @return {@code null} if there is no such event block between these coordinates.
+ */
+ private EventBlock findEventBlock(
+ final ReadAnchoring anchoring, final boolean backwards,
+ final MultiDeBruijnVertex leftVertex, final MultiDeBruijnVertex rightVertex) {
+
+ MultiDeBruijnVertex currentVertex = backwards ? rightVertex : leftVertex;
+ boolean foundEvent = false;
+ final CountSet pathSizes = new CountSet(10); // typically more than enough.
+ pathSizes.setTo(0);
+
+ // Map between reference vertices where there is some expected open alternative path rejoining and the
+ // predicted length of paths rejoining at that point counting from the beginning of the block.
+ final Map expectedAlternativePathRejoins = new HashMap<>(4);
+
+ // Keeps record of possible left-clipping vertices; those that are located before any event path bifurcation
+ // has been found. The value indicates the blockLength at the time we traverse that node.
+ final Deque> possibleClippingPoints = new LinkedList<>();
+
+ // We keep the distance from the beginning of the block (leftVertex).
+ int blockLength = 0;
+ while (currentVertex != null) {
+ int openingDegree = backwards ? graph.outDegreeOf(currentVertex) : graph.inDegreeOf(currentVertex);
+ if (openingDegree > 1) {
+ final CountSet joiningPathLengths = expectedAlternativePathRejoins.remove(currentVertex);
+ if (joiningPathLengths != null)
+ pathSizes.addAll(joiningPathLengths);
+ }
+ final boolean isValidBlockEnd = isValidBlockEnd(anchoring, currentVertex, expectedAlternativePathRejoins);
+ if (foundEvent && isValidBlockEnd) // !gotcha we found a valid block end.
+ break;
+ else if (!foundEvent && isValidBlockEnd) // if no event has been found yet, still is a good clipping point.
+ possibleClippingPoints.addLast(new Pair<>(currentVertex, blockLength));
+
+ // We reached the end:
+ if (currentVertex == (backwards ? leftVertex : rightVertex))
+ break;
+
+ // process next vertices, the next one on the reference and also possible start of alternative paths,
+ // updates traversal structures accordingly.
+ currentVertex = advanceOnReferencePath(anchoring, backwards, currentVertex, pathSizes, expectedAlternativePathRejoins);
+ foundEvent |= expectedAlternativePathRejoins.size() > 0;
+ pathSizes.incAll(1);
+ blockLength++;
+ }
+
+ // we have not found an event, thus there is no block to report:
+ if (!foundEvent)
+ return null;
+
+ // We try to clip off as much as we can from the beginning of the block before any event, but at least
+ // leaving enough block length to meet the shortest path unless all paths have the same size (SNPs only)
+ final int maxClipping = pathSizes.size() <= 1 ? blockLength : pathSizes.min();
+ MultiDeBruijnVertex clippingEnd = backwards ? anchoring.rightAnchorVertex : anchoring.leftAnchorVertex;
+ while (!possibleClippingPoints.isEmpty()) {
+ final Pair candidate = possibleClippingPoints.removeLast();
+ if (candidate.getSecond() <= maxClipping) {
+ clippingEnd = candidate.getFirst();
+ break;
+ }
+ }
+
+ return resolveEventBlock(backwards ? new Pair<>(currentVertex, clippingEnd) : new Pair<>(clippingEnd, currentVertex));
+ }
+
+ /**
+ * Gets or constructs an event-block through the cache.
+ * @param borders the source and sink vertex pair for the requested event block.
+ * @return never {@code null}
+ */
+ @Requires("borders != null && borders.getFirst() != null && borders.getSecond() != null")
+ private EventBlock resolveEventBlock(final Pair borders) {
+ EventBlock result = eventBlockCache.get(borders);
+ if (result == null)
+ eventBlockCache.put(borders,result = new EventBlock(graph, borders.getFirst(),borders.getSecond()));
+ return result;
+ }
+
+ /**
+ * Move one vertex along the reference path checking for the presence of new opening alternative paths.
+ *
+ * @param anchoring anchoring information on the targeted read.
+ * @param backwards whether we are extending the block backwards or forwards.
+ * @param currentVertex the current vertex.
+ * @param pathSizes current block path sizes.
+ * @param expectedAlternativePathRejoins information about location of vertices along the reference path where open alternative paths will rejoin.
+ * @return the next current-vertex, never {@code null} unless there is a bug.
+ */
+ private MultiDeBruijnVertex advanceOnReferencePath(final ReadAnchoring anchoring, final boolean backwards, final MultiDeBruijnVertex currentVertex, final CountSet pathSizes, final Map expectedAlternativePathRejoins) {
+ final Set nextEdges = backwards ? graph.incomingEdgesOf(currentVertex) : graph.outgoingEdgesOf(currentVertex);
+ MultiDeBruijnVertex nextReferenceVertex = null;
+ for (final MultiSampleEdge e : nextEdges) {
+ final MultiDeBruijnVertex nextVertex = backwards ? graph.getEdgeSource(e) : graph.getEdgeTarget(e);
+ if (e.isRef())
+ nextReferenceVertex = nextVertex;
+ else {
+ final CountSet pathSizesPlusOne = pathSizes.clone();
+ pathSizesPlusOne.incAll(1);
+ graph.calculateRejoins(nextVertex, expectedAlternativePathRejoins, anchoring.referenceWithinAnchorsMap.keySet(), pathSizesPlusOne, true, backwards);
+ }
+ }
+ return nextReferenceVertex;
+ }
+
+ /**
+ * Check whether the current vertex is a valid block end.
+ *
+ * @param anchoring reads anchoring information necessary to make the evaluation.
+ * @param currentVertex target potential block end
+ * @param expectedAlternativePathRejoins traversal states regarding open alternative paths.
+ *
+ * @return {@code true} iff so.
+ */
+ private boolean isValidBlockEnd(final ReadAnchoring anchoring, final MultiDeBruijnVertex currentVertex, final Map expectedAlternativePathRejoins) {
+ final boolean isUniqueKmer = anchoring.uniqueKmerOffsets.containsKey(currentVertex);
+ final boolean isAnchorable = graph.getAnchorableVertices().contains(currentVertex) && isUniqueKmer && expectedAlternativePathRejoins.size() == 0;
+ return isUniqueKmer && isAnchorable;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
index 82029b872..697d162fd 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
@@ -58,6 +58,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.EventMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
@@ -182,11 +183,12 @@ public class GenotypingEngine {
final List priorityList = makePriorityList(eventsAtThisLoc);
// Merge the event to find a common reference representation
- final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
+ final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false);
if( mergedVC == null ) { continue; }
if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) {
- throw new ReviewedStingException("Record size mismatch! Something went wrong in the merging of alleles.");
+ // this is possible in GGA mode when the same event is represented in multiple input records
+ throw new UserException("The same event (although possibly represented differently) is present in multiple input records at location " + loc + " and this is not something we can handle at this time. You will need to remove one of the records in order to proceed with your input file(s).");
}
final Map mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
@@ -335,7 +337,7 @@ public class GenotypingEngine {
for( final String sample : alleleReadMap.keySet() ) {
final int numHaplotypes = mergedVC.getAlleles().size();
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
- final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true);
+ final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true);
int glIndex = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java
new file mode 100644
index 000000000..8a35ccb05
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java
@@ -0,0 +1,158 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
+import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
+import org.broadinstitute.sting.utils.pairhmm.FlexibleHMM;
+import org.broadinstitute.sting.utils.pairhmm.FastLoglessPairHMM;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.PrintWriter;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Read likelihood calculation engine based on applying heuristics to the assembly graph.
+ */
+public class GraphBasedLikelihoodCalculationEngine implements LikelihoodCalculationEngine {
+
+ private static Logger logger = Logger.getLogger(GraphBasedLikelihoodCalculationEngine.class);
+
+ /**
+ * Gap extension penalty in Phred scale.
+ */
+ private byte gcpHMM;
+
+ /**
+ * Fast-hmm implementation reused across active regions.
+ */
+ private FlexibleHMM hmm;
+
+ /**
+ * The worst reference vs best-alternative haplotype ratio for any read. The reference haplotype likelihood
+     * is changed to meet this maximum if needed.
+ */
+ private double log10GlobalReadMismappingRate;
+
+ /**
+     * How we resolve cases where we have haplotypes coming from different kmer sizes.
+ */
+ private HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution;
+
+ private enum DebugMode { NONE, DEBUG, EXTRA_DEBUG };
+
+ private DebugMode debugMode;
+
+ /**
+ * Creates a new likelihood engine.
+ *
+ * @param gapExtensionPenalty the gap extension penalty Phred scale.
+ * @param log10GlobalReadMismappingRate the global read mismapping rate.
+     * @param heterogeneousKmerSizeResolution how to resolve assemblies with haplotypes generated from different kmer sizes.
+ * @param debug whether to output some debug messages.
+ * @param debugHaplotypeGraphAndLikelihoods whether to generate haplotype graph and likelihood files, please only use with small intervals.
+ */
+ public GraphBasedLikelihoodCalculationEngine(final int gapExtensionPenalty, final double log10GlobalReadMismappingRate,
+ final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution,
+ final boolean debug, final boolean debugHaplotypeGraphAndLikelihoods) {
+ gcpHMM = (byte) gapExtensionPenalty;
+ hmm = new FastLoglessPairHMM(gcpHMM);
+ this.log10GlobalReadMismappingRate = log10GlobalReadMismappingRate;
+ this.heterogeneousKmerSizeResolution = heterogeneousKmerSizeResolution;
+ debugMode = debugHaplotypeGraphAndLikelihoods ? DebugMode.EXTRA_DEBUG : debug ? DebugMode.DEBUG : DebugMode.NONE;
+ }
+
+
+ @Override
+ public Map computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map> perSampleReadList) {
+ final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine =
+ new GraphBasedLikelihoodCalculationEngineInstance(assemblyResultSet,
+ hmm,log10GlobalReadMismappingRate,heterogeneousKmerSizeResolution);
+ final List haplotypes = assemblyResultSet.getHaplotypeList();
+ final List supportedHaplotypes = graphLikelihoodEngine.getHaplotypeList();
+ if (supportedHaplotypes.size() != haplotypes.size()) logger.warn("Some haplotypes were drop due to missing route on the graph (supported / all): " + supportedHaplotypes.size() + "/" + haplotypes.size());
+ final Map result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes,
+ perSampleReadList );
+ if (debugMode != DebugMode.NONE) graphLikelihoodDebugDumps(assemblyResultSet.getRegionForGenotyping(), graphLikelihoodEngine,result);
+ return result;
+ }
+
+ /**
+ * A few debug messages associated with the GraphBased likelihoods engine.
+ */
+ private void graphLikelihoodDebugDumps(final ActiveRegion originalActiveRegion, final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine,
+ final Map result) {
+ if (graphLikelihoodEngine.hasCycles())
+ logger.debug("Resulting haplotype graph combining several kmer sizes has cycles");
+ else if (graphLikelihoodEngine.haplotypeGraph.hasNonReferenceEnds())
+ logger.debug("Resulting haplotype graph has ends that do not belong to the reference: " + originalActiveRegion.getLocation());
+ else if (!graphLikelihoodEngine.hasVariation())
+ logger.debug("Resulting haplotype graph does not contain any alternative haplotype path");
+ if (debugMode == DebugMode.EXTRA_DEBUG) {
+ graphLikelihoodEngine.printGraph(originalActiveRegion.getLocation() + "-" + graphLikelihoodEngine.getKmerSize() + "-haplotypeGraph.dot");
+ final SeqGraph sq = graphLikelihoodEngine.haplotypeGraph.convertToSequenceGraph();
+ sq.simplifyGraph();
+ sq.printGraph(new File(originalActiveRegion.getLocation() + "-" + graphLikelihoodEngine.getKmerSize() + "-haplotypeSeqGraph.dot"), 10000);
+ try {
+ FileWriter fw = new FileWriter(new File(originalActiveRegion.getLocation() + "-likelihoods.txt"));
+ PrintWriter pw = new PrintWriter(fw);
+ //Note: we only output the first sample likelihoods, perhaps should output all of them but for debugging this is normally what is needed.
+ pw.println(result.entrySet().iterator().next().getValue().toString());
+ pw.close();
+ fw.close();
+ } catch (Exception ex) {
+ throw new StingException("", ex);
+ }
+ }
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java
new file mode 100644
index 000000000..9d861a445
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java
@@ -0,0 +1,911 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.collections.CountSet;
+import org.broadinstitute.sting.utils.collections.CountSet;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
+import org.broadinstitute.sting.utils.pairhmm.FlexibleHMM;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.variant.variantcontext.Allele;
+
+import java.util.*;
+
+/**
+ * Fast pseudo-likelihood calculation engine based on the assembly haplotype graph.
+ *
+ *
+ * An instance is good for one active region. {@link GraphBasedLikelihoodCalculationEngine} instantiates them on demand
+ * as requested by the {@code HaplotypeCaller} code.
+ *
+ */
+public class GraphBasedLikelihoodCalculationEngineInstance {
+
+ private final static Logger logger = Logger.getLogger(GraphBasedLikelihoodCalculationEngineInstance.class);
+
+
+ /**
+ * Unified kmer size used for the Haplotype graph.
+ */
+ protected final int kmerSize;
+
+ /**
+ * Reference to the haplotype graph.
+ */
+ protected final HaplotypeGraph haplotypeGraph;
+
+ /**
+ * Haplotypes included in the haplotype graph.
+ */
+ private final List haplotypes;
+
+ /**
+ * Whether there is some variation present in the haplotype assembly.
+ */
+ private final boolean hasVariation;
+
+
+ /**
+     * Count of reads that anchored somewhere.
+ *
+ * Used for debugging purposes
+ */
+ private int anchoredReads = 0;
+
+ /**
+     * Count of reads that didn't anchor anywhere.
+ *
+ * Used for debugging purposes
+ */
+ private int nonAnchoredReads = 0;
+
+ /**
+ * Pair-hmm implementation to use to calculate read likelihoods.
+ */
+ private final FlexibleHMM hmm;
+
+ /**
+ * Maximum likelihood difference between the reference haplotype and the best alternative haplotype.
+ *
+     * If the difference is greater for a read, the reference haplotype likelihood is increased in order to not go
+ * beyond this limit
+ */
+ protected final double log10globalReadMismappingRate;
+
+ protected final EventBlockFinder eventBlockSearchEngine;
+
+
+ /**
+ * Constructs a new engine based on the results of the assembly.
+ *
+ * @param assemblyResultSet assembly-result set
+ * @param hmm fast-hmm implementation to use.
+ * @param log10globalReadMismappingRate maximum cost for the reference haplotype vs the best alternative available.
+ * @param heterogeneousKmerSizeResolution multi-kmersize dataset resolution.
+ * @throws NullPointerException if any argument is null.
+ * @throws IllegalArgumentException if log10globalReadMismappingRate >= 0.
+ */
+ public GraphBasedLikelihoodCalculationEngineInstance(final AssemblyResultSet assemblyResultSet, final FlexibleHMM hmm, final double log10globalReadMismappingRate, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) {
+ if (heterogeneousKmerSizeResolution == null) throw new NullPointerException("the kmerSize resolution cannot be null");
+ if (assemblyResultSet == null) throw new NullPointerException("the assembly result set cannot be null");
+ if (hmm == null) throw new NullPointerException("the fast-hmm component cannot be null");
+ if (log10globalReadMismappingRate >= 0)
+ throw new IllegalArgumentException("the global reading mismapping rate cannot be positive or zero");
+
+ this.hmm = hmm;
+ this.log10globalReadMismappingRate = log10globalReadMismappingRate;
+
+ haplotypes = new ArrayList<>(assemblyResultSet.getHaplotypeList());
+ Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR);
+
+ // make sure that kmerSize is not bigger than the smallest haplotype. It can well happen when there are cycles and kmerSize inflates.
+ final Haplotype referenceHaplotype = assemblyResultSet.getReferenceHaplotype();
+ int minHaplotypeLength = referenceHaplotype.length();
+ for (final Haplotype h : haplotypes)
+ if (minHaplotypeLength > h.length())
+ minHaplotypeLength = h.length();
+
+ // Determine the kmerSize to use for the unified haplotype assembly graph
+
+ kmerSize = Math.min(minHaplotypeLength,
+ heterogeneousKmerSizeResolution.useMaximum() ? assemblyResultSet.getMaximumKmerSize() : assemblyResultSet.getMinimumKmerSize());
+
+ haplotypeGraph = new HaplotypeGraph(kmerSize,haplotypes);
+
+
+ if (haplotypeGraph.hasCycles())
+ Utils.warnUser(logger, "cycle caused at merging haplotypes with different kmerSizes: active region " + assemblyResultSet.getRegionForGenotyping() + " will be skipped");
+
+ //TODO haplpotypeGraph.getReferenceSourceVertex() == null
+ //TODO Is a quick patch to ignore cases where the trimming has rendered kmerSize so big that is bigger than the haplotype
+ //TODO and reduction to the minimum haplotype size result in no unique kmers.
+        //TODO the actual solution: we need to impose a maximum trimming at least for Graph-based HC runs as we are losing
+ //TODO a bit of sensitivity as trimming results in lack of unique kmers.
+ if (haplotypeGraph.hasCycles() || haplotypeGraph.getReferenceHaplotype() == null) {
+ hasVariation = false;
+ eventBlockSearchEngine = null;
+ return;
+ }
+
+ haplotypeGraph.mergeCommonChains();
+ //TODO recover dangling ends. Did not work the last time I tried but may be worth to retry.
+ //haplotypeGraph.recoverDanglingTails(-1);
+ logger.debug("using haplotype graph with kmerSize " + haplotypeGraph.getKmerSize());
+
+ hasVariation = !haplotypeGraph.hasCycles() && haplotypeGraph.getHaplotypes().size() > 1;
+
+ eventBlockSearchEngine = new EventBlockFinder(haplotypeGraph);
+ }
+
+ /**
+     * Determines whether, based on the assembly result and the relevant user options, we can reuse the existing read-threading graph.
+ *
+ * @param assemblyResultSet assembly result set.
+ * @param kmerSize intended kmerSize for the haplotype graph.
+     * @param heterogeneousKmerSizeResolution user instruction as to how to resolve situations where we have haplotypes coming from different kmer sizes.
+ * @return {@code true} iff we can reuse an existing read-threading graph with that kmerSize in the assembly result set.
+ */
+ @SuppressWarnings("unused")
+ private static boolean canReuseReadThreadingGraphAsHaplotypeGraph(final AssemblyResultSet assemblyResultSet, final int kmerSize, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) {
+ return !assemblyResultSet.wasTrimmed() && (!assemblyResultSet.hasMultipleKmerSizes() || heterogeneousKmerSizeResolution.combinesKmerSizes()) &&
+ assemblyResultSet.getUniqueReadThreadingGraph(kmerSize) != null;
+ }
+
+ /**
+ * Checks whether the underlying haplotype graph assembly contains any variation worth analyzing.
+ *
+ * @return {@code true} iff so.
+ */
+ public boolean hasVariation() {
+ return hasVariation;
+ }
+
+ /**
+ * Calculates the likelihood of reads across many samples evaluated against haplotypes resulting from the
+ * active region assembly process.
+ *
+ * @param haplotypes to evaluate.
+ * @param perSampleReadList the input read sets stratified per sample.
+ *
+ * @throws NullPointerException if either parameter is {@code null}.
+ *
+ * @return never {@code null}, and with at least one entry per input sample (keys in {@code perSampleReadList}).
+ * The value maps can be potentially empty though.
+ */
+ public Map computeReadLikelihoods(
+ final List haplotypes,
+ final Map> perSampleReadList) {
+ // General preparation on the input haplotypes:
+ Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR);
+ final Map alleleVersions = new LinkedHashMap<>(haplotypes.size());
+ for (final Haplotype haplotype : haplotypes)
+ alleleVersions.put(haplotype, Allele.create(haplotype,haplotype.isReference()));
+
+ // The actual work:
+ final HashMap result = new HashMap<>(perSampleReadList.size());
+ for (final Map.Entry> e : perSampleReadList.entrySet()) {
+ final String sample = e.getKey();
+ final List reads = e.getValue();
+ final Set mayNeedAdjustment = new HashSet<>(reads.size());
+ // Get the cost/likelihood of each read at relevant subpaths on the tree:
+ final Map> costsByEndingVertex = calculatePathCostsByRead(reads, mayNeedAdjustment);
+ // Create the resulting per-read maps:
+ final PerReadAlleleLikelihoodMap prallm = calculatePerReadAlleleLikelihoodMap(haplotypes, costsByEndingVertex, alleleVersions);
+ result.put(sample, prallm);
+ }
+ logger.debug("Likelihood analysis summary: reads anchored " + anchoredReads + "/" + (anchoredReads + nonAnchoredReads) + "");
+ return result;
+ }
+
+
+ /**
+ * Prints a graph into a dot file.
+ *
+ * @param fileName name of the output file.
+ */
+ public void printGraph(final String fileName) {
+ if (haplotypeGraph != null)
+ haplotypeGraph.printGraph(fileName);
+ }
+
+ /**
+ * Returns the kmerSize the engine is using to match read vs graph kmers thus reducing computation.
+ *
+ * @return greater than 0.
+ */
+ public int getKmerSize() {
+ return kmerSize;
+ }
+
+ /**
+ * Tells whether the underlying haplotype graph contained cycles.
+ *
+ * @return {@code true} iff so.
+ */
+ public boolean hasCycles() {
+ // It is set to null if it contained cycles.
+ return haplotypeGraph == null;
+ }
+
+
+ /**
+ * Builds the result per-read allele likelihood map.
+ *
+ * @param haplotypes haplotypes to process.
+ * @param costsEndingByVertex Read vs haplotype graph subpaths cost indexed by ending vertex.
+ * @param alleleVersions map between haplotypes and the corresponding allele.
+ * @return never {@code null} although perhaps empty.
+ */
+ protected PerReadAlleleLikelihoodMap calculatePerReadAlleleLikelihoodMap(
+ final Collection haplotypes,
+ final Map> costsEndingByVertex, final Map alleleVersions) {
+
+ final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap();
+ if (haplotypeGraph == null)
+ return result;
+ final Map maxAlleleLogLk = new HashMap<>(anchoredReads + nonAnchoredReads + 10);
+ final Set supportedHaplotypes = new LinkedHashSet<>(haplotypeGraph.getHaplotypes());
+ supportedHaplotypes.retainAll(haplotypes);
+ for (final Haplotype haplotype : supportedHaplotypes)
+ calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex);
+
+ //TODO Does not seem to be needed in practice:
+ //TODO further testing/evaluation required before removing it completely.
+ //makeLikelihoodAdjustment(alleleVersions, result, maxAlternativeAlleleLogLk.keySet(), maxAlternativeAlleleLogLk);
+ applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk);
+ return result;
+ }
+
+ /**
+ * Work done per haplotype to build the result per-read allele likelihood map.
+ *
+ *
+ * Basically for each haplotype we go through its path in the graph collecting all the read cost that we find
+ * on the way. For each read present we add up all its cost resulting in a single value per read, i.e. its
+ * "likelihood".
+ *
+ *
+ * @param haplotype the target haplotype
+ * @param alleleVersions allele version of the haplotypes. These are the ones to be used in the final output.
+ * @param result target where to add the read-vs-haplotype likelihoods.
+ * @param maxAlleleLogLk where to place the maximum likelihood achieve on any haplotype for each read.
+ * @param costsEndingByVertex read costs assorted by their end vertex.
+ */
+ private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final Haplotype haplotype,
+ final Map alleleVersions,
+ final PerReadAlleleLikelihoodMap result,
+ final Map maxAlleleLogLk,
+ final Map> costsEndingByVertex) {
+ final HaplotypeRoute haplotypeRoute = haplotypeGraph.getHaplotypeRoute(haplotype);
+ final Set haplotypeVertices = haplotypeRoute.vertexSet();
+ final Map readCostByRead = new HashMap<>();
+ final Set visitedVertices = new HashSet<>(haplotypeVertices.size());
+ final List edgeList = haplotypeRoute.getEdges();
+ MultiDeBruijnVertex currentVertex = haplotypeRoute.getFirstVertex();
+ Route pathSoFar = new Route<>(currentVertex, haplotypeGraph);
+ final Iterator edgeIterator = edgeList.iterator();
+ while (true) {
+ visitedVertices.add(currentVertex);
+ final Set finishingAtElementCostSet = costsEndingByVertex.get(currentVertex);
+ updateReadCosts(readCostByRead, visitedVertices, pathSoFar, finishingAtElementCostSet);
+ if (!edgeIterator.hasNext()) break;
+ final MultiSampleEdge nextEdge = edgeIterator.next();
+ pathSoFar = new Route<>(pathSoFar, nextEdge);
+ currentVertex = pathSoFar.getLastVertex();
+ }
+
+ final List readCosts = new ArrayList<>(readCostByRead.values());
+ Collections.sort(readCosts, ReadCost.COMPARATOR);
+ for (final ReadCost rc : readCosts)
+ result.add(rc.read, alleleVersions.get(haplotype), rc.cost);
+
+ for (final ReadCost rc : readCosts) {
+ final Double currentMax = maxAlleleLogLk.get(rc.read);
+ if (currentMax == null || currentMax < rc.cost)
+ maxAlleleLogLk.put(rc.read, rc.cost);
+ }
+ }
+
+ /**
+ * Update the read cost based on the path cost found at a vertex.
+ *
+ * @param readCosts collection of read costs so far
+ * @param visitedVertices visited vertices collection.
+ * @param pathSoFar the haplotype path visited so far.
+ * @param finishingAtElementCostSet collection of path cost to process
+ */
+ private void updateReadCosts(final Map readCosts,
+ final Set visitedVertices,
+ final Route pathSoFar,
+ final Set finishingAtElementCostSet) {
+ if (finishingAtElementCostSet != null) {
+ for (final ReadSegmentCost pc : finishingAtElementCostSet) {
+ if (!visitedVertices.contains(pc.path.getFirstVertex()))
+ continue;
+ if (!pathSoFar.isSuffix(pc.path))
+ continue;
+ ReadCost rc = readCosts.get(pc.read);
+ if (rc == null)
+ readCosts.put(pc.read, rc = new ReadCost(pc.read));
+ rc.cost += pc.cost;
+ }
+ }
+ }
+
+ /**
+ * Likelihood penalty for unreported haplotype vs read likelihood with respect to the worst reported one.
+ */
+ private static final int UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY = -3;
+
+ /**
+ * Re-scales all haplotype vs read likelihoods so that, for each read, the best haplotype has likelihood 0.
+ *
+ * @param alleleVersions map between input haplotypes and output alleles.
+ * @param result where to change the likelihoods.
+ * @param mayNeedAdjustment set of read that might need adjustment. Others might be ignored.
+ * @param maxAlternative map from each read and the maximum alternative haplotype likelihood.
+ */
+ @SuppressWarnings("unused")
+ private void makeLikelihoodAdjustment(final Map alleleVersions,
+ final PerReadAlleleLikelihoodMap result,
+ final Set mayNeedAdjustment,
+ final Map maxAlternative) {
+ final Map> map = result.getLikelihoodReadMap();
+
+ for (final GATKSAMRecord read : mayNeedAdjustment) {
+ final Map existingLikelihoods = map.get(read);
+ if (existingLikelihoods != null) {
+ Allele bestAllele = null;
+ double worstRelativeLikelihood = 0;
+ double bestRelativeLikelihood = Double.NEGATIVE_INFINITY;
+ for (final Map.Entry entry : map.get(read).entrySet()) {
+ final double candidateRelativeLikelihood = entry.getValue();
+ if (candidateRelativeLikelihood > bestRelativeLikelihood) {
+ bestAllele = entry.getKey();
+ bestRelativeLikelihood = candidateRelativeLikelihood;
+ }
+ if (!Double.isInfinite(candidateRelativeLikelihood) && worstRelativeLikelihood > candidateRelativeLikelihood)
+ worstRelativeLikelihood = candidateRelativeLikelihood;
+ }
+
+ worstRelativeLikelihood += UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY;
+ if (bestAllele == null)
+ throw new IllegalStateException("No best allele for read " + read.getReadName());
+ final double bestLikelihood = 0.0; // the best becomes zero.
+ maxAlternative.put(read, bestLikelihood);
+ for (final Map.Entry entry : alleleVersions.entrySet()) {
+ final Allele a = entry.getValue();
+ final Double relativeLikelihoodO = existingLikelihoods.get(a);
+ final double relativeLikelihood = relativeLikelihoodO == null ? worstRelativeLikelihood : relativeLikelihoodO;
+ final double likelihood = relativeLikelihood - bestRelativeLikelihood + bestLikelihood;
+ if (likelihood > 0)
+ throw new IllegalStateException("Likelihood larger than 1 with read " + read.getReadName());
+ existingLikelihoods.put(a, likelihood);
+ }
+ }
+ }
+ }
+
+ /**
+ * Makes sure that the reference allele likelihood is not too much smaller that the best alternative allele.
+ * The justification of this constraint is explained in
+ * {@link PairHMMLikelihoodCalculationEngine#computeDiploidHaplotypeLikelihoods}.
+ *
+ * @param alleleVersions correspondence between input haplotypes and output alleles.
+ * @param result the target result map.
+ * @param maxAlleleLogLk for each read indicates the likelihood of the best alternative allele.
+ */
+ private void applyGlobalReadMismappingRate(final Map alleleVersions,
+ final PerReadAlleleLikelihoodMap result,
+ final Map maxAlleleLogLk) {
+ if (!Double.isNaN(log10globalReadMismappingRate) && !Double.isInfinite(log10globalReadMismappingRate)) {
+ final Allele referenceAllele = alleleVersions.get(haplotypeGraph.getReferenceHaplotype());
+ for (final Map.Entry> entry : result.getLikelihoodReadMap().entrySet()) {
+ final GATKSAMRecord read = entry.getKey();
+ final Map likelihoods = entry.getValue();
+ final Double maxLogLk = maxAlleleLogLk.get(read);
+ if (maxLogLk == null) continue; // fix: guard the per-read lookup, not the map itself, else unboxing below NPEs
+ final Double referenceLogLk = likelihoods.get(referenceAllele);
+ final Double minReferenceLogLk = maxLogLk + log10globalReadMismappingRate;
+ if (referenceLogLk == null || referenceLogLk < minReferenceLogLk)
+ likelihoods.put(referenceAllele, minReferenceLogLk);
+ }
+ }
+ }
+
+ /**
+ * Calculates path costs for a set of reads.
+ *
+ *
+ * The resulting map has one entry per read, where the read is the key and the value list of path-cost sets.
+ * Each element in that list corresponds to an event block. Each path cost in one of those sets indicate the
+ * likelihood (cost) of traversing a possible path across the event block using that read.
+ *
+ *
+ * @param reads reads to analyze.
+ * @param mayNeedAdjustment set where to add reads whose likelihood might need adjustment.
+ * @return never {@code null}.
+ */
+ protected Map> calculatePathCostsByRead(
+ final List reads, final Set mayNeedAdjustment) {
+ final Map> result = new HashMap<>(reads.size());
+ if (!hasVariation)
+ return Collections.emptyMap();
+ for (final GATKSAMRecord r : reads) {
+ calculatePathCostsByRead(r, mayNeedAdjustment, result);
+ }
+ return result;
+ }
+
+ /**
+ * Calculates path cost for a single read.
+ *
+ * @param read target read.
+ * @param mayNeedAdjustment set where to add read whose likelihood might need adjustment.
+ * @param result map where to add the result.
+ */
+ private void calculatePathCostsByRead(final GATKSAMRecord read, final Set mayNeedAdjustment,
+ final Map> result) {
+
+ final ReadAnchoring anchoring = new ReadAnchoring(read,haplotypeGraph);
+ // cannot anchor so go the traditional pair-HMM way.
+ hmm.loadRead(read);
+ if (!anchoring.isAnchoredSomewhere()) {
+ defaultToRegularPairHMM(anchoring, result);
+ nonAnchoredReads++;
+ return;
+ }
+
+ calculateReadSegmentCosts(anchoring, hmm, result);
+
+ if (!anchoring.isPerfectAnchoring()) danglingEndPathCosts(anchoring, hmm, result);
+ mayNeedAdjustment.add(read);
+ anchoredReads++;
+ }
+
+ /**
+ * Calculates read vs haplotype likelihoods using the classic PairHMM approach.
+ *
+ *
+ * It basically compares the read with each haplotype full path without short cuts.
+ *
+ *
+ * @param anchoring anchoring information of the read.
+ * @param destination where to leave the results indexed by ending vertex.
+ */
+ private void defaultToRegularPairHMM(final ReadAnchoring anchoring, final Map> destination) {
+
+ for (final Map.Entry entry : haplotypeGraph.getHaplotypeRouteMap().entrySet()) {
+ if (entry.getValue() == null) continue;
+ final byte[] haplotypeBases = entry.getKey().getBases();
+ hmm.loadHaplotypeBases(haplotypeBases);
+ final double cost = hmm.calculateLocalLikelihood(0, anchoring.read.getReadLength(), 0, haplotypeBases.length, false);
+ final ReadSegmentCost readSegmentCost = new ReadSegmentCost(anchoring.read, entry.getValue(), cost);
+ addReadSegmentCost(destination, readSegmentCost);
+ }
+ }
+
+ /**
+ * Add a new read-segment-cost to an ending vertex indexed map.
+ * @param destination where to add the read-segment-cost.
+ * @param cost the read-segment-cost to add.
+ */
+ private void addReadSegmentCost(final Map> destination, final ReadSegmentCost cost) {
+ final MultiDeBruijnVertex endVertex = cost.path.getLastVertex();
+ Set vpcSet = destination.get(endVertex);
+ if (vpcSet == null)
+ destination.put(endVertex, vpcSet = new HashSet<>(10));
+ vpcSet.add(cost);
+ }
+
+ /**
+ * Calculate the likelihood cost of path section of a read across the graph.
+ *
+ *
+ * Given a read, its anchors and other unique kmers mappable to the reference path, we can divide the graph
+ * into event blocks: a set of one or more variations and the possible path across that block.
+ *
+ *
+ *
+ * The result value will have one element for each block. Each element is the set of all path costs (likelihoods)
+ * to traverse the block using all possible paths (different haplotypes).
+ *
+ *
+ *
+ * The current implementation has some added complexity in order to avoid a situation in where the last part
+ * of the anchored section of the read is thrown out. We first determine the last event block boundaries and we
+ * make sure that we won't run over its left limit when covering for earlier event blocks.
+ *
+ *
+ * @param anchoring target read graph anchoring information.
+ * @param hmm the pair-hmm calculation engine. It must have been loaded with the same {@code read} already.
+ * @param destination where to add the costs.
+ */
+ private void calculateReadSegmentCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map> destination) {
+
+ final EventBlockFinder.Traversal traversal = eventBlockSearchEngine.traversal(anchoring);
+
+ for (final EventBlock eventBlock : traversal) {
+
+ // final Set> acrossBlockPaths =
+ // calculateAllPathsBetweenVertices(anchoring,
+ // eventBlock.getSource(), eventBlock.getSink());//eventBlock.getRoutesAcross();
+
+ final Set> acrossBlockPaths = eventBlock.getRoutesAcross();
+
+ int leftBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSource());
+ int rightBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSink());
+ calculateCostForPathSet(anchoring.read, acrossBlockPaths, hmm, leftBlockBoundaryIndex, rightBlockBoundaryIndex, true, false, null, null, destination);
+ }
+ }
+
+ /**
+ * Calculate path cost for a set of paths across a event block.
+ *
+ * @param read the target read.
+ * @param acrossBlockPaths event block paths to evaluate.
+ * @param hmm pair-hmm engine to use to calculate likelihoods.
+ * @param beforeBlockReadOffset kmer offset on the read for the vertex kmer before the block.
+ * @param afterBlockReadOffset kmer offset on the read for the vertex kmer after the block.
+ * @param doClipping whether to perform any clipping in order to save cpu time.
+ * @param prependVertex if not null, the end cost path with be prepended with this vertex.
+ * @param appendVertex if not null, the end cost path will be appended with this vertex.
+ * @param includePathEnds whether to include or exclude the vertices at the very end or beginning of the paths.
+ */
+ private void calculateCostForPathSet(
+ final GATKSAMRecord read, final Set> acrossBlockPaths,
+ final FlexibleHMM hmm, final int beforeBlockReadOffset, final int afterBlockReadOffset,
+ final boolean doClipping, final boolean includePathEnds,
+ final MultiDeBruijnVertex prependVertex,
+ final MultiDeBruijnVertex appendVertex,
+ final Map> destination) {
+
+
+ final Set readSegmentCosts = new TreeSet<>(ReadSegmentComparator.INSTANCE);
+
+ final int readStart = beforeBlockReadOffset + kmerSize;
+ final int readEnd = Math.max(readStart, afterBlockReadOffset + kmerSize - 1);
+ final byte[][] pathBases = new byte[acrossBlockPaths.size()][];
+ final CountSet pathSizes = new CountSet(acrossBlockPaths.size());
+ int nextPath = 0;
+
+ // Complete the read segment cost with the corresponding path bases
+ for (final Route p : acrossBlockPaths) {
+ final ReadSegmentCost readSegmentCost = new ReadSegmentCost(read, p, Double.NaN);
+ pathBases[nextPath++] = readSegmentCost.bases = eventBlockPathBases(p, includePathEnds);
+ pathSizes.add(readSegmentCost.bases.length);
+ readSegmentCosts.add(readSegmentCost);
+ }
+
+ // Add the read 'path size'.
+ pathSizes.add(readEnd - readStart);
+
+ final byte[] readBases = hmm.getReadBases();
+
+ // Perform right clipping of bases that are common to all paths and read.
+ int rightClipping = !doClipping ? 0 : calculateRightClipping(readEnd, pathBases, readBases,pathSizes);
+
+ // Calculate the costs.
+ for (final ReadSegmentCost readSegmentCost : readSegmentCosts) {
+ hmm.loadHaplotypeBases(readSegmentCost.bases);
+ readSegmentCost.cost = hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false);
+ if (prependVertex != null)
+ readSegmentCost.path = new Route<>(prependVertex,readSegmentCost.path);
+ if (appendVertex != null)
+ readSegmentCost.path = new Route<>(readSegmentCost.path,appendVertex);
+ addReadSegmentCost(destination,readSegmentCost);
+ }
+
+
+ }
+
+ /**
+ * Determines how much we can clip away from the right side of a set of paths without losing accuracy when comparing
+ * likelihood vs the read.
+ *
+ * @param readEnd exclusive position right after the last one of the region considered.
+ * @param pathBases bases of possible path in the same event block.
+ * @param readBases full length read bases.
+ * @param pathSizes path size set.
+ *
+ * @return 0 or greater.
+ */
+ private int calculateRightClipping(final int readEnd, final byte[][] pathBases,
+ final byte[] readBases, final CountSet pathSizes) {
+ final int maxClipping = pathSizes.size() > 1 ? 0 : Math.min(pathSizes.min(), kmerSize - 1);
+ int rightClipping = 0;
+ while (rightClipping < maxClipping) {
+ final byte readBase = readBases[readEnd - rightClipping - 1];
+ boolean dontGoFurther = false;
+ for (int i = 0; !dontGoFurther && i < pathBases.length; i++)
+ if (pathBases[i][pathBases[i].length - rightClipping - 1] != readBase)
+ dontGoFurther = true;
+ if (dontGoFurther)
+ break;
+ rightClipping++;
+ }
+ return rightClipping;
+ }
+
+ /**
+ * Calculates a graph path bases.
+ *
+ *
+ * When the path starts on a source vertex, all its sequence is considered as part of the path bases. For regular
+ * vertices start only the suffix (last) base is considered.
+ *
+ *
+ * @param path the targeted path.
+ * @param includePathEnds whether the bases included in the first and last vertex of the path should be included or excluded.
+ * @return never {@code null} but perhaps a zero-length base array if the final requested path length is zero.
+ */
+ //TODO this method could be moved to the Path class, but require consider how to make the API more concise.
+ private byte[] eventBlockPathBases(final Path path,
+ final boolean includePathEnds) {
+ // We first calculate the size of the return.
+ final List vertices = path.getVertices();
+ final boolean pathStartsAtSource = haplotypeGraph.isSource(path.getFirstVertex());
+ final int resultLength = includePathEnds
+ ? vertices.size() + (pathStartsAtSource ? path.getFirstVertex().getSequence().length - 1 : 0)
+ : vertices.size() - 2;
+ // Trivial empty return cases:
+ if (resultLength <= 0)
+ return new byte[0];
+ final byte[] result = new byte[resultLength];
+ if (result.length == 0) {
+ return result;
+ }
+ // General return cases:
+ final ListIterator it = vertices.listIterator(includePathEnds ? 0 : 1); // skip the vertex (exclusive)
+ for (int i = 0; i < resultLength; i++) { // i < resultLength implicitly skips the last vertex (exclusive).
+ final MultiDeBruijnVertex vertex = it.next();
+ if (i == 0 && includePathEnds && pathStartsAtSource) {
+ System.arraycopy(vertex.getSequence(), 0, result, 0, kmerSize);
+ i = kmerSize - 1;
+ } else
+ result[i] = vertex.getSuffix();
+ }
+ return result;
+ }
+
+ /**
+ * Calculate the path cost of dangling ends.
+ *
+ *
+ * A dangling end is the section of the read that falls before the left anchor or after the right anchor.
+ *
+ *
+ * @param anchoring anchoring information of the read vs the haplotype assembly graph.
+ * @param hmm the PairHMM engine to use to calculate likelihoods.
+ * @param destination cost destination.
+ */
+ private void danglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map> destination) {
+ if (anchoring.leftAnchorIndex > 0 || anchoring.leftAnchorIndex == 0
+ && anchoring.leftAnchorVertex.hasAmbiguousSequence())
+ leftDanglingEndPathCosts(anchoring, hmm,destination);
+
+ if (anchoring.rightAnchorIndex < anchoring.read.getReadLength() - kmerSize)
+ rightDanglingEndPathCosts(anchoring, hmm, destination);
+ }
+
+ /**
+ * Generates all relevant right dangling end path costs.
+ *
+ * @param anchoring the anchoring information for the read under analysis.
+ * @param hmm pair-hmm implementation to use to calculate likelihoods. It is assumed to be loaded with
+ * the same read as {@code anchoring} refers to.
+ * @param destination where to place the resulting read-segment-costs.
+ */
+ private void rightDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm,
+ final Map> destination) {
+ final int readStart = anchoring.rightAnchorIndex;
+ final int readEnd = anchoring.read.getReadLength() - kmerSize + 1;
+ final Set> haplotypeRoutes =
+ extendsHaplotypeRoutesForwards(anchoring.rightAnchorVertex);
+ if (haplotypeRoutes.size() >= 2)
+ calculateCostForPathSet(anchoring.read,
+ haplotypeRoutes, hmm, readStart, readEnd, false, true,anchoring.rightAnchorVertex,null,destination);
+
+ }
+
+ /**
+ * Generates all relevant left dangling end path costs.
+ *
+ * @param anchoring the anchoring information for the read under analysis.
+ * @param hmm pair-hmm implementation to use to calculate likelihoods. It is assumed to be loaded with
+ * the same read as {@code anchoring} refers to.
+ * @param destination where to place the resulting read-segment-costs.
+ */
+ private void leftDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm,
+ final Map> destination) {
+ final int readStart = -kmerSize;
+ final int readEnd = anchoring.leftAnchorIndex;
+ final Set> haplotypeRoutes =
+ extendsHaplotypeRoutesBackwards(anchoring.leftAnchorVertex);
+ if (haplotypeRoutes.size() >= 2) // if there is just one haplotype route there is no relevant variation in the dangling end.
+ calculateCostForPathSet(anchoring.read, haplotypeRoutes, hmm,
+ readStart, readEnd, false, true, null, anchoring.leftAnchorVertex, destination);
+ }
+
+ /**
+ * Construct haplotype routes prefixes to an anchor vertex.
+ *
+ *
+ * The output should contain a route for each haplotype that includes the input anchor vertex.
+ * This route would be the prefix of the haplotype that finishes at that vertex.
+ *
+ *
+ * @param anchorVertex the target anchor vertex.
+ * @return never {@code null}.
+ */
+ private Set> extendsHaplotypeRoutesBackwards(
+ final MultiDeBruijnVertex anchorVertex) {
+ final Set> result = new HashSet<>(haplotypes.size());
+ for (final MultiDeBruijnVertex parent : haplotypeGraph.incomingVerticesOf(anchorVertex))
+ extendsHaplotypeRoutesFrom(parent, result, false);
+ return result;
+ }
+
+ /**
+ * Construct haplotype routes suffix from an anchor vertex.
+ *
+ *
+ * The output should contain a route for each haplotype that includes the input anchor vertex.
+ * This route would be the suffix of the haplotype that starts at that vertex.
+ *
+ *
+ * @param anchorVertex the target anchor vertex.
+ * @return never {@code null}.
+ */
+ private Set> extendsHaplotypeRoutesForwards(
+ final MultiDeBruijnVertex anchorVertex) {
+ final Set> result = new HashSet<>(haplotypes.size());
+ for (final MultiDeBruijnVertex parent : haplotypeGraph.outgoingVerticesOf(anchorVertex))
+ extendsHaplotypeRoutesFrom(parent, result, true);
+ return result;
+ }
+
+ /**
+ * Extends from a vertex considering path furcations that are part of some valid haplotype
+ *
+ *
+ * In other words, it will ignore subpaths that are not valid part of an assembled haplotype.
+ *
+ *
+ * @param start start seed vertex.
+ * @param result destination for found extensions.
+ * @param forward whether to traverse edges forward or backwards.
+ */
+ private void extendsHaplotypeRoutesFrom(final MultiDeBruijnVertex start, final Set> result, final boolean forward) {
+ final Set validHaplotypeRoutes = haplotypeGraph.getEnclosingHaplotypeRoutes(start);
+ if (validHaplotypeRoutes.size() == 0) return;
+ final Deque, Set>> queue = new LinkedList<>();
+ queue.add(new Pair<>(new Route<>(start, haplotypeGraph), validHaplotypeRoutes));
+ while (!queue.isEmpty()) {
+ final Pair, Set> current = queue.remove();
+ final Route path = current.getFirst();
+ final MultiDeBruijnVertex vertex = forward ? path.getLastVertex() : path.getFirstVertex();
+ final Set validRoutes = current.getSecond();
+ for (final HaplotypeRoute hr : validRoutes) {
+ final MultiDeBruijnVertex routeEndVertex = forward ? hr.getLastVertex() : hr.getFirstVertex();
+ if (vertex.equals(routeEndVertex)) {
+ result.add(path);
+ break;
+ }
+ }
+ final Set nextVertices = forward ? haplotypeGraph.outgoingVerticesOf(vertex) :
+ haplotypeGraph.incomingVerticesOf(vertex);
+ for (final MultiDeBruijnVertex candidate : nextVertices) {
+ extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex(forward, queue, path, validRoutes, candidate);
+ }
+ }
+ }
+
+ /**
+ * Check on a candidate vertex to extend a path.
+ *
+ *
+ * This method updates the traversal queue accordingly.
+ *
+ *
+ * @param forward whether the extension is forward, or backwards.
+ * @param queue queue with open paths yet to be explored.
+ * @param path path extension to evaluate.
+ * @param validRoutes collection of valid haplotype routes used to discard non-informative extensions.
+ * @param candidate the candidate extending vertex.
+ */
+ private void extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex(
+ final boolean forward,
+ final Deque, Set>> queue,
+ final Route path,
+ final Set validRoutes, final MultiDeBruijnVertex candidate) {
+ final Set parentValidHaplotypes = haplotypeGraph.getEnclosingHaplotypeRoutes(candidate);
+ switch (parentValidHaplotypes.size()) {
+ case 0:
+ return;
+ case 1:
+ if (validRoutes.containsAll(parentValidHaplotypes))
+ queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes));
+ else
+ return;
+ break;
+ default:
+ if (parentValidHaplotypes.size() == validRoutes.size() && parentValidHaplotypes.containsAll(validRoutes)) {
+ queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes));
+ } else {
+ final Set newValidHaplotypeRoutes = new HashSet<>(validRoutes.size());
+ for (final HaplotypeRoute hr : validRoutes)
+ if (parentValidHaplotypes.contains(hr))
+ newValidHaplotypeRoutes.add(hr);
+ if (newValidHaplotypeRoutes.size() == 0)
+ return;
+ queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), newValidHaplotypeRoutes));
+ }
+ }
+ }
+
+ public List getHaplotypeList() {
+ return new ArrayList<>(haplotypeGraph.getHaplotypes());
+ }
+
+ /**
+ * Returns the haplotype graph associated with this instance.
+ * @return never {@code null}
+ */
+ public HaplotypeGraph getHaplotypeGraph() {
+ return haplotypeGraph;
+ }
+}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 0b95ed07e..82015d153 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -92,6 +92,7 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.sting.utils.variant.GATKVCFIndexType;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
@@ -154,6 +155,17 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
@Output(doc="File to which variants should be written")
protected VariantContextWriter vcfWriter = null;
+ @Hidden
+ @Advanced
+ @Argument(fullName="likelihoodCalculationEngine",shortName="likelihoodEngine",
+ doc="what likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false)
+ protected LikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = LikelihoodCalculationEngine.Implementation.PairHMM;
+
+ @Hidden
+ @Advanced
+ @Argument(fullName="heterogeneousKmerSizeResolution",shortName="hksr",doc="how to solve heterogeneous kmer situations using the fast method",required=false)
+ protected HeterogeneousKmerSizeResolution heterogeneousKmerSizeResultion = HeterogeneousKmerSizeResolution.COMBO_MIN;
+
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false)
protected PrintStream graphWriter = null;
@@ -200,6 +212,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
*/
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
+ private double log10GlobalReadMismappingRate;
+
public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; }
/**
@@ -290,7 +304,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
* B <= X < C
* X >= C
*
- * The default bands give the following GQ blocks:
+ * The default bands with (1, 10, 20, 30, 40, 50) give the following GQ blocks:
*
* [0, 0]
* (0, 10]
@@ -304,7 +318,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
*/
@Advanced
@Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false)
- protected List GVCFGQBands = Arrays.asList(1, 10, 20, 30, 40, 50);
+ protected List GVCFGQBands = Arrays.asList(5, 20, 60);
/**
* This parameter determines the maximum size of an indel considered as potentially segregating in the
@@ -321,6 +335,13 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// general advanced arguments to control haplotype caller behavior
// -----------------------------------------------------------------------------------------------
+ /**
+ * Users should be aware that this argument can really affect the results of the variant calling and should exercise caution.
+ * Using a prune factor of 1 (or below) will prevent any pruning from the graph which is generally not ideal; it can make the
+ * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph). Higher values
+ * tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher
+ * depth to produce calls).
+ */
@Advanced
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 2;
@@ -475,10 +496,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
/**
* Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base
* insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator).
+ * VERY IMPORTANT: when using PCR-free sequencing data we definitely recommend setting this argument to NONE.
*/
@Advanced
@Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false)
- public LikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
+ public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
// -----------------------------------------------------------------------------------------------
// done with Haplotype caller parameters
@@ -497,10 +519,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// the genotyping engine
private GenotypingEngine genotypingEngine = null;
- private VariantAnnotatorEngine annotationEngine = null;
-
// fasta reference reader to supplement the edges of the reference sequence
- private CachingIndexedFastaSequenceFile referenceReader;
+ protected CachingIndexedFastaSequenceFile referenceReader;
// reference base padding size
private static final int REFERENCE_PADDING = 500;
@@ -524,6 +544,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
ReferenceConfidenceModel referenceConfidenceModel = null;
+ // as determined experimentally Nov-Dec 2013
+ protected final static GATKVCFIndexType OPTIMAL_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR;
+ protected final static int OPTIMAL_GVCF_INDEX_PARAMETER = 128000;
+
//---------------------------------------------------------------------------------------------------------------
//
// initialize
@@ -541,7 +565,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
samplesList.addAll( samples );
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
- // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this?
+ // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine
UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
@@ -553,16 +577,16 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.CONTAMINATION_FRACTION = 0.0;
- simpleUAC.CONTAMINATION_FRACTION_FILE=null;
+ simpleUAC.CONTAMINATION_FRACTION_FILE = null;
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
- if( UAC.CONTAMINATION_FRACTION_FILE !=null) {
+ if( UAC.CONTAMINATION_FRACTION_FILE != null ) {
UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger));
}
// initialize the output VCF header
- annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
+ final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
Set headerInfo = new HashSet<>();
@@ -589,6 +613,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently");
headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines());
if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
+ // a kluge to enforce the use of this indexing strategy
+ if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE ||
+ getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) {
+ throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER);
+ }
+
try {
vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands);
} catch ( IllegalArgumentException e ) {
@@ -623,7 +653,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1;
// configure the global mismapping rate
- final double log10GlobalReadMismappingRate;
if ( phredScaledGlobalReadMismappingRate < 0 ) {
log10GlobalReadMismappingRate = - Double.MAX_VALUE;
} else {
@@ -632,7 +661,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
}
// create our likelihood calculation engine
- likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel );
+ likelihoodCalculationEngine = createLikelihoodCalculationEngine();
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes();
@@ -650,6 +679,26 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
getToolkit().getGenomeLocParser());
}
+ /**
+ * Instantiates the appropriate likelihood calculation engine.
+ *
+ * @return never {@code null}.
+ */
+ private LikelihoodCalculationEngine createLikelihoodCalculationEngine() {
+ switch (likelihoodEngineImplementation) {
+ case PairHMM:
+ return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel );
+ case GraphBased:
+ return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,DEBUG,debugGraphTransformations);
+ case Random:
+ return new RandomLikelihoodCalculationEngine();
+ default:
+ //Note: we deliberately omit 'Random' from the error-message list as it is of no general public interest.
+ throw new UserException("Unsupported likelihood calculation engine '" + likelihoodCalculationEngine +
+ "'. Please use one of the following instead: 'PairHMM' and 'GraphBased'.");
+ }
+ }
+
//---------------------------------------------------------------------------------------------------------------
//
// isActive
@@ -747,7 +796,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
}
// run the local assembler, getting back a collection of information on how we should proceed
- final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
+ final AssemblyResultSet assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
+ final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
// abort early if something is out of the acceptable range
if( ! assemblyResult.isVariationPresent() ) {
@@ -757,17 +807,26 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
if (dontGenotype) return NO_CALLS; // user requested we not proceed
// filter out reads from genotyping which fail mapping quality based criteria
- final Collection filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
+ final Collection filteredReads = filterNonPassingReads( regionForGenotyping );
final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads );
- if( assemblyResult.regionForGenotyping.size() == 0 ) {
+ if( regionForGenotyping.size() == 0 ) {
// no reads remain after filtering so nothing else to do!
return referenceModelForNoVariation(originalActiveRegion, false);
}
// evaluate each sample's reads against all haplotypes
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
- final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) );
+ final List haplotypes = assemblyResult.getHaplotypeList();
+ final Map> reads = splitReadsBySample( regionForGenotyping.getReads() );
+
+ // Calculate the likelihoods: CPU intensive part.
+ final Map stratifiedReadMap =
+ likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads);
+
+
+
+
// Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there
// was a bad interaction between that selection and the marginalization that happens over each event when computing
@@ -776,12 +835,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
- assemblyResult.haplotypes,
+ haplotypes,
stratifiedReadMap,
perSampleFilteredReadList,
- assemblyResult.fullReferenceWithPadding,
- assemblyResult.paddedReferenceLoc,
- assemblyResult.regionForGenotyping.getLocation(),
+ assemblyResult.getFullReferenceWithPadding(),
+ assemblyResult.getPaddedReferenceLoc(),
+ regionForGenotyping.getLocation(),
getToolkit().getGenomeLocParser(),
metaDataTracker,
activeAllelesToGenotype );
@@ -789,9 +848,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
if ( bamWriter != null ) {
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(
- assemblyResult.haplotypes,
- assemblyResult.paddedReferenceLoc,
- assemblyResult.haplotypes,
+ haplotypes,
+ assemblyResult.getPaddedReferenceLoc(),
+ haplotypes,
calledHaplotypes.getCalledHaplotypes(),
stratifiedReadMap);
}
@@ -803,50 +862,14 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// no called all of the potential haplotypes
return referenceModelForNoVariation(originalActiveRegion, false);
} else
- return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getRefHaplotype(),
- calledHaplotypes.getCalledHaplotypes(), assemblyResult.paddedReferenceLoc, assemblyResult.regionForGenotyping,
+ return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(),
+ calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
stratifiedReadMap, calledHaplotypes.getCalls());
} else {
return calledHaplotypes.getCalls();
}
}
- private final static class AssemblyResult {
- final List haplotypes;
- final ActiveRegion regionForGenotyping;
- final byte[] fullReferenceWithPadding;
- final GenomeLoc paddedReferenceLoc;
- final boolean variationPresent;
- final Haplotype refHaplotype;
-
- private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) {
- this.haplotypes = haplotypes;
- this.regionForGenotyping = regionForGenotyping;
- this.fullReferenceWithPadding = fullReferenceWithPadding;
- this.paddedReferenceLoc = paddedReferenceLoc;
- this.variationPresent = variationPresent;
-
- Haplotype firstRefHaplotype = null;
- for ( final Haplotype h : haplotypes ) {
- if ( h.isReference() ) {
- if ( firstRefHaplotype != null ) throw new IllegalArgumentException("Found two haplotypes marked as reference " + firstRefHaplotype + " and " + h);
- firstRefHaplotype = h;
- }
- }
-
- if ( firstRefHaplotype == null ) throw new IllegalArgumentException("Couldn't find a reference haplotype in " + haplotypes);
- this.refHaplotype = firstRefHaplotype;
- }
-
- public Haplotype getRefHaplotype() {
- return refHaplotype;
- }
-
- public boolean isVariationPresent() {
- return variationPresent && haplotypes.size() > 1;
- }
- }
-
/**
* High-level function that runs the assembler on the active region reads,
* returning a data structure with the resulting information needed
@@ -856,7 +879,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
* @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty)
* @return the AssemblyResult describing how to proceed with genotyping
*/
- protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) {
+ protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) {
// Create the reference haplotype which is the bases from the reference that make up the active region
finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails
@@ -867,17 +890,23 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
// Create ReadErrorCorrector object if requested - will be used within assembly engine.
ReadErrorCorrector readErrorCorrector = null;
if (errorCorrectReads)
- readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG,fullReferenceWithPadding);
+ readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding);
try {
- final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector );
- if ( ! emitReferenceConfidence() && ! dontTrimActiveRegions ) {
- return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc);
- } else {
- // we don't want to trim active regions, so go ahead and use the old one
- return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
- }
- } catch ( Exception e ) {
+ final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector );
+ assemblyResultSet.debugDump(logger);
+
+ if ( ! dontTrimActiveRegions ) {
+ final ActiveRegion trimmedActiveRegion = trimActiveRegion(assemblyResultSet,activeAllelesToGenotype);
+ if (trimmedActiveRegion != null)
+ return trimAssemblyResultSet(assemblyResultSet, trimmedActiveRegion);
+ else {
+ assemblyResultSet.resetVariationPresent();
+ return assemblyResultSet;
+ }
+ } else
+ return assemblyResultSet;
+ } catch ( final Exception e ) {
// Capture any exception that might be thrown, and write out the assembly failure BAM if requested
if ( captureAssemblyFailureBAM ) {
final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5);
@@ -947,73 +976,89 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
return map;
}
- /**
- * Trim down the active region to just enough to properly genotype the events among the haplotypes
- *
- * @param originalActiveRegion our full active region
- * @param haplotypes the list of haplotypes we've created from assembly
- * @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty)
- * @param fullReferenceWithPadding the reference bases over the full padded location
- * @param paddedReferenceLoc the span of the reference bases
- * @return an AssemblyResult containing the trimmed active region with all of the reads we should use
- * trimmed down as well, and a revised set of haplotypes. If trimming down the active region results
- * in only the reference haplotype over the non-extended active region, returns null.
- */
- private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion,
- final List haplotypes,
- final List activeAllelesToGenotype,
- final byte[] fullReferenceWithPadding,
- final GenomeLoc paddedReferenceLoc) {
- if ( DEBUG ) logger.info("Trimming active region " + originalActiveRegion + " with " + haplotypes.size() + " haplotypes");
-
- EventMap.buildEventMapsForHaplotypes(haplotypes, fullReferenceWithPadding, paddedReferenceLoc, DEBUG);
- final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypes);
+ private ActiveRegion trimActiveRegion(final AssemblyResultSet resultSet, final Collection activeAllelesToGenotype) {
+ if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes");
+ final List haplotypeList = resultSet.getHaplotypeList();
+ final ActiveRegion originalGenotypingRegion = resultSet.getRegionForGenotyping();
+ EventMap.buildEventMapsForHaplotypes(haplotypeList, resultSet.getFullReferenceWithPadding(), resultSet.getPaddedReferenceLoc(), DEBUG);
+ final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypeList);
allVariantsWithinFullActiveRegion.addAll(activeAllelesToGenotype);
- final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, allVariantsWithinFullActiveRegion);
+ final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalGenotypingRegion, allVariantsWithinFullActiveRegion,false);
if ( trimmedActiveRegion == null ) {
// there were no variants found within the active region itself, so just return null
if ( DEBUG ) logger.info("No variation found within the active region, skipping the region :-)");
- return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, false);
+ return null;
}
- // trim down the haplotypes
- final Set haplotypeSet = new HashSet<>(haplotypes.size());
- for ( final Haplotype h : haplotypes ) {
- final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
- if ( trimmed != null ) {
- haplotypeSet.add(trimmed);
- } else if ( DEBUG ) {
- logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + " because it starts with or ends with an insertion or deletion when trimmed to " + trimmedActiveRegion.getExtendedLoc());
- }
- }
-
- // create the final list of trimmed haplotypes
- final List trimmedHaplotypes = new ArrayList<>(haplotypeSet);
-
- // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
- Collections.sort( trimmedHaplotypes, new HaplotypeSizeAndBaseComparator() );
-
- if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
- if ( DEBUG ) {
- for ( final Haplotype remaining: trimmedHaplotypes ) {
- logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar());
- }
- }
-
-
// trim down the reads and add them to the trimmed active region
- final List trimmedReads = new ArrayList<>(originalActiveRegion.getReads().size());
- for( final GATKSAMRecord read : originalActiveRegion.getReads() ) {
- final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() );
- if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
+ final List trimmedReads = new ArrayList<>(originalGenotypingRegion.getReads().size());
+ for( final GATKSAMRecord read : originalGenotypingRegion.getReads() ) {
+ final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read,
+ trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() );
+ if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 )
trimmedReads.add(clippedRead);
- }
}
trimmedActiveRegion.clearReads();
trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads));
- return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
+ return trimmedActiveRegion;
+ }
+
+
+ /**
+ * Trims an assembly result set according to the active-region trimming.
+ *
+ * @param resultSet the original assembly result set.
+ * @param trimmedActiveRegion the trimmed active region to trim to.
+ * @return the assembly result set trimmed.
+ */
+ private AssemblyResultSet trimAssemblyResultSet(final AssemblyResultSet resultSet, final ActiveRegion trimmedActiveRegion) {
+ if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes");
+
+ final List haplotypeList = resultSet.getHaplotypeList();
+
+ // trim down the haplotypes
+ final Map originalByTrimmedHaplotypes = new HashMap<>();
+
+ for ( final Haplotype h : haplotypeList ) {
+ final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
+
+ if ( trimmed != null ) {
+ if (originalByTrimmedHaplotypes.containsKey(trimmed)) {
+ if (trimmed.isReference()) {
+ originalByTrimmedHaplotypes.remove(trimmed);
+ originalByTrimmedHaplotypes.put(trimmed, h);
+ }
+ } else
+ originalByTrimmedHaplotypes.put(trimmed,h);
+ } else if (h.isReference())
+ throw new IllegalStateException("trimming eliminates the reference haplotype");
+ else if ( DEBUG ) {
+ logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() +
+ " because it starts with or ends with an insertion or deletion when trimmed to " +
+ trimmedActiveRegion.getExtendedLoc());
+ }
+ }
+
+ // create the final list of trimmed haplotypes
+ final List trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet());
+
+ // resort the trimmed haplotypes.
+ Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator());
+ final Map sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size());
+ for (final Haplotype trimmed : trimmedHaplotypes)
+ sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed));
+
+
+ if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " +
+ trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " +
+ haplotypeList.size() + " to only " + trimmedHaplotypes.size());
+ if ( DEBUG )
+ for ( final Haplotype remaining: trimmedHaplotypes )
+ logger.info("Remains: " + remaining + " cigar " + remaining.getCigar());
+
+ return resultSet.trimTo(trimmedActiveRegion,sortedOriginalByTrimmedHaplotypes);
}
//---------------------------------------------------------------------------------------------------------------
@@ -1039,7 +1084,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
public void onTraversalDone(Integer result) {
if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it
referenceConfidenceModel.close();
- likelihoodCalculationEngine.close();
+ //TODO remove the need to call close here for debugging, the likelihood output stream should be managed
+ //TODO (open & close) at the walker, not the engine.
+ //likelihoodCalculationEngine.close();
logger.info("Ran local assembly on " + result + " active regions");
}
@@ -1050,6 +1097,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
//---------------------------------------------------------------------------------------------------------------
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
+ if (activeRegion.isFinalized()) return;
+
if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
// Loop through the reads hard clipping the adaptor and low quality tails
@@ -1094,9 +1143,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
activeRegion.clearReads();
activeRegion.addAll(downsampledReads);
+ activeRegion.setFinalized(true);
}
- private Set filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
+ private Set filterNonPassingReads( final ActiveRegion activeRegion ) {
final Set readsToRemove = new LinkedHashSet<>();
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
@@ -1107,7 +1157,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
return readsToRemove;
}
- private GenomeLoc getPaddedLoc( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
+ private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) {
final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1);
final int padRight = Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength());
return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight);
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java
new file mode 100644
index 000000000..5887864e3
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java
@@ -0,0 +1,129 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
+
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
+
+import java.util.*;
+
+/**
+ * Graph route that represents a haplotype on the haplotype assembly graph.
+ *
+ * @author Valentin Ruano-Rubio <valentin@broadinstitute.com>
+ */
+public class HaplotypeRoute extends Route {
+
+ protected final Set vertexSet;
+
+ protected final Map vertexOrder;
+
+ protected final Set forkAndJoins;
+
+ /**
+ * Constructs a HaplotypeRoute given its route.
+ *
+ * @param route the haplotype route.
+ */
+ public HaplotypeRoute(final Route route) {
+ super(route);
+ vertexOrder = new LinkedHashMap<>(route.length() + 1);
+ int nextOrder = 0;
+ vertexOrder.put(getFirstVertex(),nextOrder++);
+ for (final MultiSampleEdge edge : edgesInOrder)
+ vertexOrder.put(graph.getEdgeTarget(edge), nextOrder++);
+ Route currentRoute = this;
+ forkAndJoins = new HashSet<>(route.length());
+ while (currentRoute != null) {
+ if (currentRoute.lastVertexIsForkOrJoin())
+ forkAndJoins.add(currentRoute.getLastVertex());
+ currentRoute = currentRoute.getPrefixRouteWithLastVertexThatIsForkOrJoin();
+ }
+ vertexSet = Collections.unmodifiableSet(new HashSet<>(vertexOrder.keySet()));
+ }
+
+
+
+ @SuppressWarnings("unused")
+ public Route