diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java index dca83af44..ceccdcb2e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java @@ -63,6 +63,10 @@ abstract class AbstractStratification { private Map statusTally = null; protected ThresHolder thresholds; + public AbstractStratification(ThresHolder thresholds) { + this.thresholds = thresholds; + } + /** * Calculates the average "good" coverage of this sample. Good means "passes the base and * mapping quality requirements. @@ -116,11 +120,11 @@ abstract class AbstractStratification { * * @return the callable status(es) for the whole object */ - public abstract Iterable callableStatuses(); + public abstract List callableStatuses(); /** - * Tally up all the callable status of all the loci in this sample. + * Tally up all the callable status of all elements of the stratification. * * @return a map of callable status and counts */ @@ -136,10 +140,10 @@ abstract class AbstractStratification { return statusTally; } - public static List queryStatus(List statList, AbstractStratification stratification) { + public List queryStatus(List statList) { List output = new LinkedList(); for (Metric stat : statList) { - final CallableStatus status = stat.status(stratification); + final CallableStatus status = stat.status(this); if (status != null) { output.add(status); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index 32f87b973..4bd08294b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -65,6 +65,7 @@ import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.variant.vcf.*; +import java.io.PrintStream; import java.util.*; /** @@ -112,6 +113,9 @@ import java.util.*; public class DiagnoseTargets extends LocusWalker { private static final String AVG_INTERVAL_DP_KEY = "IDP"; + private static final String LOW_COVERAGE_LOCI = "LL"; + private static final String ZERO_COVERAGE_LOCI = "ZL"; + @Output(doc = "File to which interval statistics should be written") private VariantContextWriter vcfWriter = null; @@ -119,13 +123,12 @@ public class DiagnoseTargets extends LocusWalker { @ArgumentCollection private ThresHolder thresholds = new ThresHolder(); - private Map intervalMap = null; // maps each interval => statistics + private Map intervalMap = null; // maps each interval => statistics private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome private Set samples = null; // all the samples being processed private static final Allele SYMBOLIC_ALLELE = Allele.create("
", false); // avoid creating the symbolic allele multiple times private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times - - private static final int INITIAL_HASH_SIZE = 500000; + private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals @Override public void initialize() { @@ -134,7 +137,7 @@ public class DiagnoseTargets extends LocusWalker { if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty()) throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead."); - intervalMap = new HashMap(INITIAL_HASH_SIZE); + intervalMap = new LinkedHashMap(INITIAL_HASH_SIZE); intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); // get all of the unique sample names for the VCF Header @@ -146,13 +149,13 @@ public class DiagnoseTargets extends LocusWalker { } @Override - public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public Long map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); // process and remove any intervals in the map that are don't overlap the current locus anymore // and add all new intervals that may overlap this reference locus - outputFinishedIntervals(refLocus, ref.getBase()); addNewOverlappingIntervals(refLocus); + outputFinishedIntervals(refLocus, ref.getBase()); // at this point, all intervals in intervalMap overlap with this locus, so update all of them for (IntervalStratification intervalStratification : intervalMap.values()) @@ -184,7 +187,7 @@ public class DiagnoseTargets extends LocusWalker { * @param result number of loci processed by the walker */ @Override - public void onTraversalDone(Long result) { + public void onTraversalDone(final Long result) { for (GenomeLoc interval : intervalMap.keySet()) outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE); @@ -194,6 +197,10 @@ public class DiagnoseTargets extends LocusWalker { intervalListIterator.next(); interval = intervalListIterator.peek(); } + + if (thresholds.missingTargets != null) { + thresholds.missingTargets.close(); + } } /** @@ -203,24 +210,21 @@ public class DiagnoseTargets extends LocusWalker { * @param refBase the reference allele */ private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) { - GenomeLoc interval = intervalListIterator.peek(); - - // output empty statistics for uncovered intervals - while (interval != null && interval.isBefore(refLocus)) { - final IntervalStratification stats = intervalMap.get(interval); - outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE); - if (stats != null) intervalMap.remove(interval); - intervalListIterator.next(); - interval = intervalListIterator.peek(); - } - - // remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals) + // output any intervals that were finished + final List toRemove = new LinkedList(); for (GenomeLoc key : intervalMap.keySet()) { if (key.isBefore(refLocus)) { - outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true)); - intervalMap.remove(key); + final IntervalStratification intervalStats = intervalMap.get(key); + outputStatsToVCF(intervalStats, Allele.create(refBase, true)); + if (hasMissingLoci(intervalStats)) { + outputMissingInterval(intervalStats); + } + toRemove.add(key); } } + for (GenomeLoc key : toRemove) { + intervalMap.remove(key); + } } /** @@ -228,7 +232,7 @@ public class DiagnoseTargets extends LocusWalker { * * @param refLocus the current reference locus */ - private void addNewOverlappingIntervals(GenomeLoc refLocus) { + private void addNewOverlappingIntervals(final GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); while (interval != null && !interval.isPast(refLocus)) { intervalMap.put(interval, createIntervalStatistic(interval)); @@ -243,14 +247,24 @@ public class DiagnoseTargets extends LocusWalker { * @param stats The statistics of the interval * @param refAllele the reference allele */ - private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) { + private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) { GenomeLoc interval = stats.getInterval(); + final List alleles = new ArrayList(); + final Map attributes = new HashMap(); + final ArrayList genotypes = new ArrayList(); - List alleles = new ArrayList(); - Map attributes = new HashMap(); - ArrayList genotypes = new ArrayList(); + for (String sample : samples) { + final GenotypeBuilder gb = new GenotypeBuilder(sample); + SampleStratification sampleStat = stats.getSampleStatistics(sample); + gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size())); + gb.attribute(LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci()); + gb.attribute(ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci()); + gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); + + genotypes.add(gb.make()); + } alleles.add(refAllele); alleles.add(SYMBOLIC_ALLELE); VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); @@ -262,21 +276,56 @@ public class DiagnoseTargets extends LocusWalker { attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size())); vcb = vcb.attributes(attributes); - for (String sample : samples) { - final GenotypeBuilder gb = new GenotypeBuilder(sample); - - SampleStratification sampleStat = stats.getSampleStatistics(sample); - gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size())); - - gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); - - genotypes.add(gb.make()); - } vcb = vcb.genotypes(genotypes); vcfWriter.add(vcb.make()); } + private boolean hasMissingStatuses(AbstractStratification stats) { + return !stats.callableStatuses().isEmpty(); + } + + private boolean hasMissingLoci(final IntervalStratification stats) { + return thresholds.missingTargets != null && hasMissingStatuses(stats); + } + + private void outputMissingInterval(final IntervalStratification stats) { + final GenomeLoc interval = stats.getInterval(); + final boolean missing[] = new boolean[interval.size()]; + Arrays.fill(missing, true); + for (AbstractStratification sample : stats.getElements()) { + if (hasMissingStatuses(sample)) { + int pos = 0; + for (AbstractStratification locus : sample.getElements()) { + if (locus.callableStatuses().isEmpty()) { + missing[pos] = false; + } + pos++; + } + } + } + int start = -1; + boolean insideMissing = false; + for (int i = 0; i < missing.length; i++) { + if (missing[i] && !insideMissing) { + start = interval.getStart() + i; + insideMissing = true; + } else if (!missing[i] && insideMissing) { + final int stop = interval.getStart() + i - 1; + outputMissingInterval(interval.getContig(), start, stop); + insideMissing = false; + } + } + if (insideMissing) { + outputMissingInterval(interval.getContig(), start, interval.getStop()); + } + } + + private void outputMissingInterval(final String contig, final int start, final int stop) { + final PrintStream out = thresholds.missingTargets; + out.println(String.format("%s:%d-%d", contig, start, stop)); + } + /** * Function that process a set of statuses into strings * @@ -345,6 +394,8 @@ public class DiagnoseTargets extends LocusWalker { // FORMAT fields for each genotype headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size.")); + headerLines.add(new VCFFormatHeaderLine(LOW_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with low coverage (below the minimum coverage) but not zero.")); + headerLines.add(new VCFFormatHeaderLine(ZERO_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with zero coverage.")); // FILTER fields for (CallableStatus stat : CallableStatus.values()) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java index 6c20403d1..3b5a23d51 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java @@ -56,11 +56,11 @@ import java.util.*; final class IntervalStratification extends AbstractStratification { private final Map samples; private final GenomeLoc interval; - private final ThresHolder thresholds; + private List callableStatuses; public IntervalStratification(Set samples, GenomeLoc interval, ThresHolder thresholds) { + super(thresholds); this.interval = interval; - this.thresholds = thresholds; this.samples = new HashMap(samples.size()); for (String sample : samples) this.samples.put(sample, new SampleStratification(interval, thresholds)); @@ -114,7 +114,13 @@ final class IntervalStratification extends AbstractStratification { * {@inheritDoc} */ @Override - public Iterable callableStatuses() { + public List callableStatuses() { + if (callableStatuses == null) + callableStatuses = calculateStatus(); + return callableStatuses; + } + + private List calculateStatus() { final List output = new LinkedList(); // check if any of the votes pass the threshold @@ -125,7 +131,7 @@ final class IntervalStratification extends AbstractStratification { } } - output.addAll(queryStatus(thresholds.intervalMetricList, this)); + output.addAll(queryStatus(thresholds.intervalMetricList)); return output; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java index d6acaf850..5902fce31 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java @@ -46,22 +46,20 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; -import java.util.LinkedList; import java.util.List; final class LocusStratification extends AbstractStratification { private long coverage; private long rawCoverage; - private final List locusStatisticsList; public LocusStratification(ThresHolder thresholds) { this(0,0,thresholds); } protected LocusStratification(int coverage, int rawCoverage, ThresHolder thresholds) { + super(thresholds); this.coverage = coverage; this.rawCoverage = rawCoverage; - this.locusStatisticsList = thresholds.locusMetricList; } @Override @@ -79,14 +77,7 @@ final class LocusStratification extends AbstractStratification { * @return a set of all statuses that apply */ public List callableStatuses() { - List output = new LinkedList(); - for (Metric stats : locusStatisticsList) { - CallableStatus status = stats.status(this); - if (status != null) { - output.add(status); - } - } - return output; + return queryStatus(thresholds.locusMetricList); } @Override diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java index 1085e8cac..7984ba7e7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java @@ -58,6 +58,6 @@ final class PluginUtils { final Map totals = sampleStratification.getStatusTally(); final int size = sampleStratification.getIntervalSize(); final int statusCount = totals.containsKey(CALL) ? totals.get(CALL) : 0; - return ( (double) statusCount / size) >= threshold ? CALL: null; + return ( (double) statusCount / size) > threshold ? CALL: null; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java index b9ae1f3cf..0f84c7d22 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java @@ -61,15 +61,14 @@ import java.util.List; final class SampleStratification extends AbstractStratification { private final GenomeLoc interval; private final ArrayList loci; - private final ThresHolder thresholds; private int nReads = -1; private int nBadMates = -1; public SampleStratification(final GenomeLoc interval, final ThresHolder thresholds) { + super(thresholds); this.interval = interval; this.loci = new ArrayList(interval.size()); - this.thresholds = thresholds; nReads = 0; nBadMates = 0; @@ -118,10 +117,10 @@ final class SampleStratification extends AbstractStratification { * {@inheritDoc} */ @Override - public Iterable callableStatuses() { + public List callableStatuses() { final List output = new LinkedList(); - // get the tally of all the locus callable statuses + // get the sample statuses of all the Loci Metrics for (Metric locusStat : thresholds.locusMetricList) { final CallableStatus status = ((LocusMetric) locusStat).sampleStatus(this); if (status != null) { @@ -130,12 +129,7 @@ final class SampleStratification extends AbstractStratification { } // get the sample specific statitics statuses - for (Metric sampleStat : thresholds.sampleMetricList) { - final CallableStatus status = sampleStat.status(this); - if (status != null) { - output.add(status); - } - } + output.addAll(queryStatus(thresholds.sampleMetricList)); // special case, if there are no reads, then there is no sense reporting coverage gaps. if (output.contains(CallableStatus.NO_READS) && output.contains(CallableStatus.COVERAGE_GAPS)) @@ -159,4 +153,17 @@ final class SampleStratification extends AbstractStratification { read.setTemporaryAttribute("seen", true); } } + + public int getNLowCoveredLoci() { + return getCallableStatusCount(CallableStatus.LOW_COVERAGE); + } + + public int getNUncoveredLoci() { + return getCallableStatusCount(CallableStatus.COVERAGE_GAPS); + } + + private int getCallableStatusCount(CallableStatus status) { + final Integer x = getStatusTally().get(status); + return x == null ? 0 : x; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java index b0c999460..ebe2192b4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java @@ -47,7 +47,9 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import java.io.PrintStream; import java.util.LinkedList; import java.util.List; @@ -114,6 +116,9 @@ final class ThresHolder { @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false) public double qualityStatusThreshold = 0.50; + @Output(fullName = "missing_intervals", shortName = "missing", doc ="Produces a file with the intervals that don't pass filters", required = false) + public PrintStream missingTargets = null; + public final List locusMetricList = new LinkedList(); public final List sampleMetricList = new LinkedList(); public final List intervalMetricList = new LinkedList(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java index bac09f30d..52e385957 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java @@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "850304909477afa8c2a8f128d6eedde9"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "1771e95aed2b3b240dc353f84e19847d"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "bedd19bcf21d1a779f6706c0351c9d26"); + DTTest("testMultiSample ", "-I " + multiSample, "c7f1691dbe5f121c4a79be823d3057e5"); } }