Outputting missing intervals in DiagnoseTargets

Problem
------
Diagnose Targets identifies holes in the coverage of a targetted experiment, but it only reports them doesn't list the actual missing loci

Solution
------
This commit implements an optional intervals file output listing the exact loci that did not pass filters

Itemized changes
--------------
   * Cache callable statuses (to avoid recalculation)
   * Add functionality to output missing intervals
   * Implement new tool to qualify the missing intervals (QualifyMissingIntervals) by gc content, size, type of missing coverage and origin (coding sequence, intron, ...)
This commit is contained in:
Mauricio Carneiro 2013-04-26 23:29:25 -04:00
parent 1466396a31
commit 3dbb86b052
5 changed files with 85 additions and 12 deletions

View File

@ -120,7 +120,7 @@ abstract class AbstractStratification {
* *
* @return the callable status(es) for the whole object * @return the callable status(es) for the whole object
*/ */
public abstract Iterable<CallableStatus> callableStatuses(); public abstract List<CallableStatus> callableStatuses();
/** /**

View File

@ -65,6 +65,8 @@ import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.variant.vcf.*; import org.broadinstitute.variant.vcf.*;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*; import java.util.*;
/** /**
@ -122,13 +124,12 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
@ArgumentCollection @ArgumentCollection
private ThresHolder thresholds = new ThresHolder(); private ThresHolder thresholds = new ThresHolder();
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
private Set<String> samples = null; // all the samples being processed private Set<String> samples = null; // all the samples being processed
private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals
private static final int INITIAL_HASH_SIZE = 500000;
@Override @Override
public void initialize() { public void initialize() {
@ -149,7 +150,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
} }
@Override @Override
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Long map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
GenomeLoc refLocus = ref.getLocus(); GenomeLoc refLocus = ref.getLocus();
// process and remove any intervals in the map that are don't overlap the current locus anymore // process and remove any intervals in the map that are don't overlap the current locus anymore
@ -187,7 +188,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
* @param result number of loci processed by the walker * @param result number of loci processed by the walker
*/ */
@Override @Override
public void onTraversalDone(Long result) { public void onTraversalDone(final Long result) {
for (GenomeLoc interval : intervalMap.keySet()) for (GenomeLoc interval : intervalMap.keySet())
outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE); outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
@ -197,6 +198,14 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
intervalListIterator.next(); intervalListIterator.next();
interval = intervalListIterator.peek(); interval = intervalListIterator.peek();
} }
if (thresholds.missingTargets != null) {
try {
thresholds.missingTargets.close();
} catch (IOException e) {
e.printStackTrace();
}
}
} }
/** /**
@ -210,7 +219,11 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
final List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>(); final List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>();
for (GenomeLoc key : intervalMap.keySet()) { for (GenomeLoc key : intervalMap.keySet()) {
if (key.isBefore(refLocus)) { if (key.isBefore(refLocus)) {
outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true)); final IntervalStratification intervalStats = intervalMap.get(key);
outputStatsToVCF(intervalStats, Allele.create(refBase, true));
if (hasMissingLoci(intervalStats)) {
outputMissingInterval(intervalStats);
}
toRemove.add(key); toRemove.add(key);
} }
} }
@ -224,7 +237,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
* *
* @param refLocus the current reference locus * @param refLocus the current reference locus
*/ */
private void addNewOverlappingIntervals(GenomeLoc refLocus) { private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
GenomeLoc interval = intervalListIterator.peek(); GenomeLoc interval = intervalListIterator.peek();
while (interval != null && !interval.isPast(refLocus)) { while (interval != null && !interval.isPast(refLocus)) {
intervalMap.put(interval, createIntervalStatistic(interval)); intervalMap.put(interval, createIntervalStatistic(interval));
@ -239,10 +252,9 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
* @param stats The statistics of the interval * @param stats The statistics of the interval
* @param refAllele the reference allele * @param refAllele the reference allele
*/ */
private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) { private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
GenomeLoc interval = stats.getInterval(); GenomeLoc interval = stats.getInterval();
final List<Allele> alleles = new ArrayList<Allele>(); final List<Allele> alleles = new ArrayList<Allele>();
final Map<String, Object> attributes = new HashMap<String, Object>(); final Map<String, Object> attributes = new HashMap<String, Object>();
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(); final ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
@ -274,6 +286,55 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
vcfWriter.add(vcb.make()); vcfWriter.add(vcb.make());
} }
private boolean hasMissingStatuses(AbstractStratification stats) {
return !stats.callableStatuses().isEmpty();
}
private boolean hasMissingLoci(final IntervalStratification stats) {
return thresholds.missingTargets != null && hasMissingStatuses(stats);
}
private void outputMissingInterval(final IntervalStratification stats) {
final GenomeLoc interval = stats.getInterval();
final boolean missing[] = new boolean[interval.size()];
Arrays.fill(missing, true);
for (AbstractStratification sample : stats.getElements()) {
if (hasMissingStatuses(sample)) {
int pos = 0;
for (AbstractStratification locus : sample.getElements()) {
if (locus.callableStatuses().isEmpty()) {
missing[pos] = false;
}
pos++;
}
}
}
int start = -1;
boolean insideMissing = false;
for (int i = 0; i < missing.length; i++) {
if (missing[i] && !insideMissing) {
start = interval.getStart() + i;
insideMissing = true;
} else if (!missing[i] && insideMissing) {
final int stop = interval.getStart() + i - 1;
outputMissingInterval(interval.getContig(), start, stop);
insideMissing = false;
}
}
if (insideMissing) {
outputMissingInterval(interval.getContig(), start, interval.getStop());
}
}
private void outputMissingInterval(final String contig, final int start, final int stop){
final FileWriter out = thresholds.missingTargets;
try {
out.write(String.format("%s:%d-%d\n", contig, start, stop));
} catch (IOException e) {
e.printStackTrace();
}
}
/** /**
* Function that process a set of statuses into strings * Function that process a set of statuses into strings
* *

View File

@ -56,6 +56,7 @@ import java.util.*;
final class IntervalStratification extends AbstractStratification { final class IntervalStratification extends AbstractStratification {
private final Map<String, AbstractStratification> samples; private final Map<String, AbstractStratification> samples;
private final GenomeLoc interval; private final GenomeLoc interval;
private List<CallableStatus> callableStatuses;
public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) { public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) {
super(thresholds); super(thresholds);
@ -113,7 +114,13 @@ final class IntervalStratification extends AbstractStratification {
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public Iterable<CallableStatus> callableStatuses() { public List<CallableStatus> callableStatuses() {
if (callableStatuses == null)
callableStatuses = calculateStatus();
return callableStatuses;
}
private List<CallableStatus> calculateStatus() {
final List<CallableStatus> output = new LinkedList<CallableStatus>(); final List<CallableStatus> output = new LinkedList<CallableStatus>();
// check if any of the votes pass the threshold // check if any of the votes pass the threshold

View File

@ -117,7 +117,7 @@ final class SampleStratification extends AbstractStratification {
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public Iterable<CallableStatus> callableStatuses() { public List<CallableStatus> callableStatuses() {
final List<CallableStatus> output = new LinkedList<CallableStatus>(); final List<CallableStatus> output = new LinkedList<CallableStatus>();
// get the sample statuses of all the Loci Metrics // get the sample statuses of all the Loci Metrics

View File

@ -47,7 +47,9 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import java.io.FileWriter;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
@ -114,6 +116,9 @@ final class ThresHolder {
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false) @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false)
public double qualityStatusThreshold = 0.50; public double qualityStatusThreshold = 0.50;
@Output(fullName = "missing_intervals", shortName = "missing", doc ="Produces a file with the intervals that don't pass filters", required = false)
public FileWriter missingTargets = null;
public final List<Metric> locusMetricList = new LinkedList<Metric>(); public final List<Metric> locusMetricList = new LinkedList<Metric>();
public final List<Metric> sampleMetricList = new LinkedList<Metric>(); public final List<Metric> sampleMetricList = new LinkedList<Metric>();
public final List<Metric> intervalMetricList = new LinkedList<Metric>(); public final List<Metric> intervalMetricList = new LinkedList<Metric>();