Outputting missing intervals in DiagnoseTargets
Problem ------ DiagnoseTargets identifies holes in the coverage of a targeted experiment, but it only reports that they exist; it doesn't list the actual missing loci. Solution ------ This commit implements an optional intervals file output listing the exact loci that did not pass filters. Itemized changes -------------- * Cache callable statuses (to avoid recalculation) * Add functionality to output missing intervals * Implement new tool to qualify the missing intervals (QualifyMissingIntervals) by GC content, size, type of missing coverage and origin (coding sequence, intron, ...)
This commit is contained in:
parent
1466396a31
commit
3dbb86b052
|
|
@ -120,7 +120,7 @@ abstract class AbstractStratification {
|
||||||
*
|
*
|
||||||
* @return the callable status(es) for the whole object
|
* @return the callable status(es) for the whole object
|
||||||
*/
|
*/
|
||||||
public abstract Iterable<CallableStatus> callableStatuses();
|
public abstract List<CallableStatus> callableStatuses();
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -65,6 +65,8 @@ import org.broadinstitute.variant.variantcontext.*;
|
||||||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
||||||
import org.broadinstitute.variant.vcf.*;
|
import org.broadinstitute.variant.vcf.*;
|
||||||
|
|
||||||
|
import java.io.FileWriter;
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -122,13 +124,12 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||||
@ArgumentCollection
|
@ArgumentCollection
|
||||||
private ThresHolder thresholds = new ThresHolder();
|
private ThresHolder thresholds = new ThresHolder();
|
||||||
|
|
||||||
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
|
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
|
||||||
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
|
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
|
||||||
private Set<String> samples = null; // all the samples being processed
|
private Set<String> samples = null; // all the samples being processed
|
||||||
private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
|
private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
|
||||||
private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
|
private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
|
||||||
|
private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals
|
||||||
private static final int INITIAL_HASH_SIZE = 500000;
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
@ -149,7 +150,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public Long map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||||
GenomeLoc refLocus = ref.getLocus();
|
GenomeLoc refLocus = ref.getLocus();
|
||||||
|
|
||||||
// process and remove any intervals in the map that are don't overlap the current locus anymore
|
// process and remove any intervals in the map that are don't overlap the current locus anymore
|
||||||
|
|
@ -187,7 +188,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||||
* @param result number of loci processed by the walker
|
* @param result number of loci processed by the walker
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void onTraversalDone(Long result) {
|
public void onTraversalDone(final Long result) {
|
||||||
for (GenomeLoc interval : intervalMap.keySet())
|
for (GenomeLoc interval : intervalMap.keySet())
|
||||||
outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
|
outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
|
||||||
|
|
||||||
|
|
@ -197,6 +198,14 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||||
intervalListIterator.next();
|
intervalListIterator.next();
|
||||||
interval = intervalListIterator.peek();
|
interval = intervalListIterator.peek();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (thresholds.missingTargets != null) {
|
||||||
|
try {
|
||||||
|
thresholds.missingTargets.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -210,7 +219,11 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||||
final List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>();
|
final List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>();
|
||||||
for (GenomeLoc key : intervalMap.keySet()) {
|
for (GenomeLoc key : intervalMap.keySet()) {
|
||||||
if (key.isBefore(refLocus)) {
|
if (key.isBefore(refLocus)) {
|
||||||
outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true));
|
final IntervalStratification intervalStats = intervalMap.get(key);
|
||||||
|
outputStatsToVCF(intervalStats, Allele.create(refBase, true));
|
||||||
|
if (hasMissingLoci(intervalStats)) {
|
||||||
|
outputMissingInterval(intervalStats);
|
||||||
|
}
|
||||||
toRemove.add(key);
|
toRemove.add(key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -224,7 +237,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||||
*
|
*
|
||||||
* @param refLocus the current reference locus
|
* @param refLocus the current reference locus
|
||||||
*/
|
*/
|
||||||
private void addNewOverlappingIntervals(GenomeLoc refLocus) {
|
private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
|
||||||
GenomeLoc interval = intervalListIterator.peek();
|
GenomeLoc interval = intervalListIterator.peek();
|
||||||
while (interval != null && !interval.isPast(refLocus)) {
|
while (interval != null && !interval.isPast(refLocus)) {
|
||||||
intervalMap.put(interval, createIntervalStatistic(interval));
|
intervalMap.put(interval, createIntervalStatistic(interval));
|
||||||
|
|
@ -239,10 +252,9 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||||
* @param stats The statistics of the interval
|
* @param stats The statistics of the interval
|
||||||
* @param refAllele the reference allele
|
* @param refAllele the reference allele
|
||||||
*/
|
*/
|
||||||
private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) {
|
private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
|
||||||
GenomeLoc interval = stats.getInterval();
|
GenomeLoc interval = stats.getInterval();
|
||||||
|
|
||||||
|
|
||||||
final List<Allele> alleles = new ArrayList<Allele>();
|
final List<Allele> alleles = new ArrayList<Allele>();
|
||||||
final Map<String, Object> attributes = new HashMap<String, Object>();
|
final Map<String, Object> attributes = new HashMap<String, Object>();
|
||||||
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
|
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
|
||||||
|
|
@ -274,6 +286,55 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||||
vcfWriter.add(vcb.make());
|
vcfWriter.add(vcb.make());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean hasMissingStatuses(AbstractStratification stats) {
|
||||||
|
return !stats.callableStatuses().isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean hasMissingLoci(final IntervalStratification stats) {
|
||||||
|
return thresholds.missingTargets != null && hasMissingStatuses(stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void outputMissingInterval(final IntervalStratification stats) {
|
||||||
|
final GenomeLoc interval = stats.getInterval();
|
||||||
|
final boolean missing[] = new boolean[interval.size()];
|
||||||
|
Arrays.fill(missing, true);
|
||||||
|
for (AbstractStratification sample : stats.getElements()) {
|
||||||
|
if (hasMissingStatuses(sample)) {
|
||||||
|
int pos = 0;
|
||||||
|
for (AbstractStratification locus : sample.getElements()) {
|
||||||
|
if (locus.callableStatuses().isEmpty()) {
|
||||||
|
missing[pos] = false;
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int start = -1;
|
||||||
|
boolean insideMissing = false;
|
||||||
|
for (int i = 0; i < missing.length; i++) {
|
||||||
|
if (missing[i] && !insideMissing) {
|
||||||
|
start = interval.getStart() + i;
|
||||||
|
insideMissing = true;
|
||||||
|
} else if (!missing[i] && insideMissing) {
|
||||||
|
final int stop = interval.getStart() + i - 1;
|
||||||
|
outputMissingInterval(interval.getContig(), start, stop);
|
||||||
|
insideMissing = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (insideMissing) {
|
||||||
|
outputMissingInterval(interval.getContig(), start, interval.getStop());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void outputMissingInterval(final String contig, final int start, final int stop){
|
||||||
|
final FileWriter out = thresholds.missingTargets;
|
||||||
|
try {
|
||||||
|
out.write(String.format("%s:%d-%d\n", contig, start, stop));
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function that process a set of statuses into strings
|
* Function that process a set of statuses into strings
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,7 @@ import java.util.*;
|
||||||
final class IntervalStratification extends AbstractStratification {
|
final class IntervalStratification extends AbstractStratification {
|
||||||
private final Map<String, AbstractStratification> samples;
|
private final Map<String, AbstractStratification> samples;
|
||||||
private final GenomeLoc interval;
|
private final GenomeLoc interval;
|
||||||
|
private List<CallableStatus> callableStatuses;
|
||||||
|
|
||||||
public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) {
|
public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) {
|
||||||
super(thresholds);
|
super(thresholds);
|
||||||
|
|
@ -113,7 +114,13 @@ final class IntervalStratification extends AbstractStratification {
|
||||||
* {@inheritDoc}
|
* {@inheritDoc}
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Iterable<CallableStatus> callableStatuses() {
|
public List<CallableStatus> callableStatuses() {
|
||||||
|
if (callableStatuses == null)
|
||||||
|
callableStatuses = calculateStatus();
|
||||||
|
return callableStatuses;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<CallableStatus> calculateStatus() {
|
||||||
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||||
|
|
||||||
// check if any of the votes pass the threshold
|
// check if any of the votes pass the threshold
|
||||||
|
|
|
||||||
|
|
@ -117,7 +117,7 @@ final class SampleStratification extends AbstractStratification {
|
||||||
* {@inheritDoc}
|
* {@inheritDoc}
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public Iterable<CallableStatus> callableStatuses() {
|
public List<CallableStatus> callableStatuses() {
|
||||||
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||||
|
|
||||||
// get the sample statuses of all the Loci Metrics
|
// get the sample statuses of all the Loci Metrics
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,9 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||||
|
|
||||||
import org.broadinstitute.sting.commandline.Argument;
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
|
||||||
|
import java.io.FileWriter;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
@ -114,6 +116,9 @@ final class ThresHolder {
|
||||||
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false)
|
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false)
|
||||||
public double qualityStatusThreshold = 0.50;
|
public double qualityStatusThreshold = 0.50;
|
||||||
|
|
||||||
|
@Output(fullName = "missing_intervals", shortName = "missing", doc ="Produces a file with the intervals that don't pass filters", required = false)
|
||||||
|
public FileWriter missingTargets = null;
|
||||||
|
|
||||||
public final List<Metric> locusMetricList = new LinkedList<Metric>();
|
public final List<Metric> locusMetricList = new LinkedList<Metric>();
|
||||||
public final List<Metric> sampleMetricList = new LinkedList<Metric>();
|
public final List<Metric> sampleMetricList = new LinkedList<Metric>();
|
||||||
public final List<Metric> intervalMetricList = new LinkedList<Metric>();
|
public final List<Metric> intervalMetricList = new LinkedList<Metric>();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue