From 3dbb86b05253ce407b540cb8fe6d1cd66cb92a0d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 26 Apr 2013 23:29:25 -0400 Subject: [PATCH] Outputting missing intervals in DiagnoseTargets Problem ------ Diagnose Targets identifies holes in the coverage of a targetted experiment, but it only reports them doesn't list the actual missing loci Solution ------ This commit implements an optional intervals file output listing the exact loci that did not pass filters Itemized changes -------------- * Cache callable statuses (to avoid recalculation) * Add functionality to output missing intervals * Implement new tool to qualify the missing intervals (QualifyMissingIntervals) by gc content, size, type of missing coverage and origin (coding sequence, intron, ...) --- .../AbstractStratification.java | 2 +- .../diagnosetargets/DiagnoseTargets.java | 79 ++++++++++++++++--- .../IntervalStratification.java | 9 ++- .../diagnosetargets/SampleStratification.java | 2 +- .../diagnosetargets/ThresHolder.java | 5 ++ 5 files changed, 85 insertions(+), 12 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java index 8b7f3dbf2..ceccdcb2e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java @@ -120,7 +120,7 @@ abstract class AbstractStratification { * * @return the callable status(es) for the whole object */ - public abstract Iterable callableStatuses(); + public abstract List callableStatuses(); /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index 32d866b0a..a3ac21ae0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -65,6 +65,8 @@ import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.variant.vcf.*; +import java.io.FileWriter; +import java.io.IOException; import java.util.*; /** @@ -122,13 +124,12 @@ public class DiagnoseTargets extends LocusWalker { @ArgumentCollection private ThresHolder thresholds = new ThresHolder(); - private Map intervalMap = null; // maps each interval => statistics + private Map intervalMap = null; // maps each interval => statistics private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome private Set samples = null; // all the samples being processed private static final Allele SYMBOLIC_ALLELE = Allele.create("
", false); // avoid creating the symbolic allele multiple times private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times - - private static final int INITIAL_HASH_SIZE = 500000; + private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals @Override public void initialize() { @@ -149,7 +150,7 @@ public class DiagnoseTargets extends LocusWalker { } @Override - public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public Long map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); // process and remove any intervals in the map that are don't overlap the current locus anymore @@ -187,7 +188,7 @@ public class DiagnoseTargets extends LocusWalker { * @param result number of loci processed by the walker */ @Override - public void onTraversalDone(Long result) { + public void onTraversalDone(final Long result) { for (GenomeLoc interval : intervalMap.keySet()) outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE); @@ -197,6 +198,14 @@ public class DiagnoseTargets extends LocusWalker { intervalListIterator.next(); interval = intervalListIterator.peek(); } + + if (thresholds.missingTargets != null) { + try { + thresholds.missingTargets.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } } /** @@ -210,7 +219,11 @@ public class DiagnoseTargets extends LocusWalker { final List toRemove = new LinkedList(); for (GenomeLoc key : intervalMap.keySet()) { if (key.isBefore(refLocus)) { - outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true)); + final IntervalStratification intervalStats = intervalMap.get(key); + outputStatsToVCF(intervalStats, Allele.create(refBase, true)); + if (hasMissingLoci(intervalStats)) { + outputMissingInterval(intervalStats); + } toRemove.add(key); } } @@ -224,7 +237,7 @@ public class DiagnoseTargets extends LocusWalker { * * @param refLocus the current reference locus */ - private void addNewOverlappingIntervals(GenomeLoc refLocus) { + private void addNewOverlappingIntervals(final GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); while (interval != null && !interval.isPast(refLocus)) { intervalMap.put(interval, createIntervalStatistic(interval)); @@ -239,10 +252,9 @@ public class DiagnoseTargets extends LocusWalker { * @param stats The statistics of the interval * @param refAllele the reference allele */ - private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) { + private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) { GenomeLoc interval = stats.getInterval(); - final List alleles = new ArrayList(); final Map attributes = new HashMap(); final ArrayList genotypes = new ArrayList(); @@ -274,6 +286,55 @@ public class DiagnoseTargets extends LocusWalker { vcfWriter.add(vcb.make()); } + private boolean hasMissingStatuses(AbstractStratification stats) { + return !stats.callableStatuses().isEmpty(); + } + + private boolean hasMissingLoci(final IntervalStratification stats) { + return thresholds.missingTargets != null && hasMissingStatuses(stats); + } + + private void outputMissingInterval(final IntervalStratification stats) { + final GenomeLoc interval = stats.getInterval(); + final boolean missing[] = new boolean[interval.size()]; + Arrays.fill(missing, true); + for (AbstractStratification sample : stats.getElements()) { + if (hasMissingStatuses(sample)) { + int pos = 0; + for (AbstractStratification locus : sample.getElements()) { + if (locus.callableStatuses().isEmpty()) { + missing[pos] = false; + } + pos++; + } + } + } + int start = -1; + boolean insideMissing = false; + for (int i = 0; i < missing.length; i++) { + if (missing[i] && !insideMissing) { + start = interval.getStart() + i; + insideMissing = true; + } else if (!missing[i] && insideMissing) { + final int stop = interval.getStart() + i - 1; + outputMissingInterval(interval.getContig(), start, stop); + insideMissing = false; + } + } + if (insideMissing) { + outputMissingInterval(interval.getContig(), start, interval.getStop()); + } + } + + private void outputMissingInterval(final String contig, final int start, final int stop){ + final FileWriter out = thresholds.missingTargets; + try { + out.write(String.format("%s:%d-%d\n", contig, start, stop)); + } catch (IOException e) { + e.printStackTrace(); + } + } + /** * Function that process a set of statuses into strings * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java index 86e9d0142..3b5a23d51 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java @@ -56,6 +56,7 @@ import java.util.*; final class IntervalStratification extends AbstractStratification { private final Map samples; private final GenomeLoc interval; + private List callableStatuses; public IntervalStratification(Set samples, GenomeLoc interval, ThresHolder thresholds) { super(thresholds); @@ -113,7 +114,13 @@ final class IntervalStratification extends AbstractStratification { * {@inheritDoc} */ @Override - public Iterable callableStatuses() { + public List callableStatuses() { + if (callableStatuses == null) + callableStatuses = calculateStatus(); + return callableStatuses; + } + + private List calculateStatus() { final List output = new LinkedList(); // check if any of the votes pass the threshold diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java index 49aa10cf6..0f84c7d22 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java @@ -117,7 +117,7 @@ final class SampleStratification extends AbstractStratification { * {@inheritDoc} */ @Override - public Iterable callableStatuses() { + public List callableStatuses() { final List output = new LinkedList(); // get the sample statuses of all the Loci Metrics diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java index b0c999460..8c5a75148 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java @@ -47,7 +47,9 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import java.io.FileWriter; import java.util.LinkedList; import java.util.List; @@ -114,6 +116,9 @@ final class ThresHolder { @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false) public double qualityStatusThreshold = 0.50; + @Output(fullName = "missing_intervals", shortName = "missing", doc ="Produces a file with the intervals that don't pass filters", required = false) + public FileWriter missingTargets = null; + public final List locusMetricList = new LinkedList(); public final List sampleMetricList = new LinkedList(); public final List intervalMetricList = new LinkedList();