Outputting missing intervals in DiagnoseTargets

Problem
------
Diagnose Targets identifies holes in the coverage of a targetted experiment, but it only reports them doesn't list the actual missing loci

Solution
------
This commit implements an optional intervals file output listing the exact loci that did not pass filters

Itemized changes
--------------
   * Cache callable statuses (to avoid recalculation)
   * Add functionality to output missing intervals
   * Implement new tool to qualify the missing intervals (QualifyMissingIntervals) by gc content, size, type of missing coverage and origin (coding sequence, intron, ...)
This commit is contained in:
Mauricio Carneiro 2013-04-26 23:29:25 -04:00
parent 1466396a31
commit 3dbb86b052
5 changed files with 85 additions and 12 deletions

View File

@ -120,7 +120,7 @@ abstract class AbstractStratification {
*
* @return the callable status(es) for the whole object
*/
public abstract Iterable<CallableStatus> callableStatuses();
public abstract List<CallableStatus> callableStatuses();
/**

View File

@ -65,6 +65,8 @@ import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.variant.vcf.*;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;
/**
@ -122,13 +124,12 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
@ArgumentCollection
private ThresHolder thresholds = new ThresHolder();
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
private Set<String> samples = null; // all the samples being processed
private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
private static final int INITIAL_HASH_SIZE = 500000;
private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals
@Override
public void initialize() {
@ -149,7 +150,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
}
@Override
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
public Long map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
GenomeLoc refLocus = ref.getLocus();
// process and remove any intervals in the map that are don't overlap the current locus anymore
@ -187,7 +188,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
* @param result number of loci processed by the walker
*/
@Override
public void onTraversalDone(Long result) {
public void onTraversalDone(final Long result) {
for (GenomeLoc interval : intervalMap.keySet())
outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
@ -197,6 +198,14 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
intervalListIterator.next();
interval = intervalListIterator.peek();
}
if (thresholds.missingTargets != null) {
try {
thresholds.missingTargets.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
@ -210,7 +219,11 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
final List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>();
for (GenomeLoc key : intervalMap.keySet()) {
if (key.isBefore(refLocus)) {
outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true));
final IntervalStratification intervalStats = intervalMap.get(key);
outputStatsToVCF(intervalStats, Allele.create(refBase, true));
if (hasMissingLoci(intervalStats)) {
outputMissingInterval(intervalStats);
}
toRemove.add(key);
}
}
@ -224,7 +237,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
*
* @param refLocus the current reference locus
*/
private void addNewOverlappingIntervals(GenomeLoc refLocus) {
private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
GenomeLoc interval = intervalListIterator.peek();
while (interval != null && !interval.isPast(refLocus)) {
intervalMap.put(interval, createIntervalStatistic(interval));
@ -239,10 +252,9 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
* @param stats The statistics of the interval
* @param refAllele the reference allele
*/
private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) {
private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
GenomeLoc interval = stats.getInterval();
final List<Allele> alleles = new ArrayList<Allele>();
final Map<String, Object> attributes = new HashMap<String, Object>();
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
@ -274,6 +286,55 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
vcfWriter.add(vcb.make());
}
private boolean hasMissingStatuses(AbstractStratification stats) {
return !stats.callableStatuses().isEmpty();
}
private boolean hasMissingLoci(final IntervalStratification stats) {
return thresholds.missingTargets != null && hasMissingStatuses(stats);
}
private void outputMissingInterval(final IntervalStratification stats) {
final GenomeLoc interval = stats.getInterval();
final boolean missing[] = new boolean[interval.size()];
Arrays.fill(missing, true);
for (AbstractStratification sample : stats.getElements()) {
if (hasMissingStatuses(sample)) {
int pos = 0;
for (AbstractStratification locus : sample.getElements()) {
if (locus.callableStatuses().isEmpty()) {
missing[pos] = false;
}
pos++;
}
}
}
int start = -1;
boolean insideMissing = false;
for (int i = 0; i < missing.length; i++) {
if (missing[i] && !insideMissing) {
start = interval.getStart() + i;
insideMissing = true;
} else if (!missing[i] && insideMissing) {
final int stop = interval.getStart() + i - 1;
outputMissingInterval(interval.getContig(), start, stop);
insideMissing = false;
}
}
if (insideMissing) {
outputMissingInterval(interval.getContig(), start, interval.getStop());
}
}
private void outputMissingInterval(final String contig, final int start, final int stop){
final FileWriter out = thresholds.missingTargets;
try {
out.write(String.format("%s:%d-%d\n", contig, start, stop));
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Function that process a set of statuses into strings
*

View File

@ -56,6 +56,7 @@ import java.util.*;
final class IntervalStratification extends AbstractStratification {
private final Map<String, AbstractStratification> samples;
private final GenomeLoc interval;
private List<CallableStatus> callableStatuses;
public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) {
super(thresholds);
@ -113,7 +114,13 @@ final class IntervalStratification extends AbstractStratification {
* {@inheritDoc}
*/
@Override
public Iterable<CallableStatus> callableStatuses() {
public List<CallableStatus> callableStatuses() {
if (callableStatuses == null)
callableStatuses = calculateStatus();
return callableStatuses;
}
private List<CallableStatus> calculateStatus() {
final List<CallableStatus> output = new LinkedList<CallableStatus>();
// check if any of the votes pass the threshold

View File

@ -117,7 +117,7 @@ final class SampleStratification extends AbstractStratification {
* {@inheritDoc}
*/
@Override
public Iterable<CallableStatus> callableStatuses() {
public List<CallableStatus> callableStatuses() {
final List<CallableStatus> output = new LinkedList<CallableStatus>();
// get the sample statuses of all the Loci Metrics

View File

@ -47,7 +47,9 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import java.io.FileWriter;
import java.util.LinkedList;
import java.util.List;
@ -114,6 +116,9 @@ final class ThresHolder {
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false)
public double qualityStatusThreshold = 0.50;
@Output(fullName = "missing_intervals", shortName = "missing", doc ="Produces a file with the intervals that don't pass filters", required = false)
public FileWriter missingTargets = null;
public final List<Metric> locusMetricList = new LinkedList<Metric>();
public final List<Metric> sampleMetricList = new LinkedList<Metric>();
public final List<Metric> intervalMetricList = new LinkedList<Metric>();