Diagnose target is outputting intervals out of order
Problem ------- When the interval had no reads, it was being sent to the VCF before the intervals that just got processed, therefore violating the sort order of the VCF. Solution -------- Use a linked hash map, and make the insertion and removal all happen in one place regardless of having reads or not. Since the input is ordered, the output has to be ordered as well. Itemized changes -------------- * Clean up code duplication in LocusStratification and SampleStratification * Add number of uncovered sites and number of low covered sites to the VCF output. * Add new VCF format fields * Fix outputting multiple status when threshold is 0 (ratio must be GREATER THAN not equal to the threshold to get reported) [fixes #48780333] [fixes #48787311]
This commit is contained in:
parent
82e6a77385
commit
1466396a31
|
|
@ -63,6 +63,10 @@ abstract class AbstractStratification {
|
|||
private Map<CallableStatus, Integer> statusTally = null;
|
||||
protected ThresHolder thresholds;
|
||||
|
||||
public AbstractStratification(ThresHolder thresholds) {
|
||||
this.thresholds = thresholds;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the average "good" coverage of this sample. Good means "passes the base and
|
||||
* mapping quality requirements.
|
||||
|
|
@ -120,7 +124,7 @@ abstract class AbstractStratification {
|
|||
|
||||
|
||||
/**
|
||||
* Tally up all the callable status of all the loci in this sample.
|
||||
* Tally up all the callable status of all elements of the stratification.
|
||||
*
|
||||
* @return a map of callable status and counts
|
||||
*/
|
||||
|
|
@ -136,10 +140,10 @@ abstract class AbstractStratification {
|
|||
return statusTally;
|
||||
}
|
||||
|
||||
public static List<CallableStatus> queryStatus(List<Metric> statList, AbstractStratification stratification) {
|
||||
public List<CallableStatus> queryStatus(List<Metric> statList) {
|
||||
List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
for (Metric stat : statList) {
|
||||
final CallableStatus status = stat.status(stratification);
|
||||
final CallableStatus status = stat.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -112,6 +112,9 @@ import java.util.*;
|
|||
public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||
|
||||
private static final String AVG_INTERVAL_DP_KEY = "IDP";
|
||||
private static final String LOW_COVERAGE_LOCI = "LL";
|
||||
private static final String ZERO_COVERAGE_LOCI = "ZL";
|
||||
|
||||
|
||||
@Output(doc = "File to which interval statistics should be written")
|
||||
private VariantContextWriter vcfWriter = null;
|
||||
|
|
@ -134,7 +137,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
|
||||
throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");
|
||||
|
||||
intervalMap = new HashMap<GenomeLoc, IntervalStratification>(INITIAL_HASH_SIZE);
|
||||
intervalMap = new LinkedHashMap<GenomeLoc, IntervalStratification>(INITIAL_HASH_SIZE);
|
||||
intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
|
||||
|
||||
// get all of the unique sample names for the VCF Header
|
||||
|
|
@ -151,8 +154,8 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
|
||||
// process and remove any intervals in the map that are don't overlap the current locus anymore
|
||||
// and add all new intervals that may overlap this reference locus
|
||||
outputFinishedIntervals(refLocus, ref.getBase());
|
||||
addNewOverlappingIntervals(refLocus);
|
||||
outputFinishedIntervals(refLocus, ref.getBase());
|
||||
|
||||
// at this point, all intervals in intervalMap overlap with this locus, so update all of them
|
||||
for (IntervalStratification intervalStratification : intervalMap.values())
|
||||
|
|
@ -203,24 +206,17 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
* @param refBase the reference allele
|
||||
*/
|
||||
private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
|
||||
// output empty statistics for uncovered intervals
|
||||
while (interval != null && interval.isBefore(refLocus)) {
|
||||
final IntervalStratification stats = intervalMap.get(interval);
|
||||
outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE);
|
||||
if (stats != null) intervalMap.remove(interval);
|
||||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
// remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals)
|
||||
// output any intervals that were finished
|
||||
final List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>();
|
||||
for (GenomeLoc key : intervalMap.keySet()) {
|
||||
if (key.isBefore(refLocus)) {
|
||||
outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true));
|
||||
intervalMap.remove(key);
|
||||
toRemove.add(key);
|
||||
}
|
||||
}
|
||||
for (GenomeLoc key : toRemove) {
|
||||
intervalMap.remove(key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -247,10 +243,21 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
GenomeLoc interval = stats.getInterval();
|
||||
|
||||
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
final Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
|
||||
for (String sample : samples) {
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sample);
|
||||
|
||||
SampleStratification sampleStat = stats.getSampleStatistics(sample);
|
||||
gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
|
||||
gb.attribute(LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
|
||||
gb.attribute(ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
|
||||
gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
|
||||
|
||||
genotypes.add(gb.make());
|
||||
}
|
||||
alleles.add(refAllele);
|
||||
alleles.add(SYMBOLIC_ALLELE);
|
||||
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles);
|
||||
|
|
@ -262,16 +269,6 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
|
||||
|
||||
vcb = vcb.attributes(attributes);
|
||||
for (String sample : samples) {
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sample);
|
||||
|
||||
SampleStratification sampleStat = stats.getSampleStatistics(sample);
|
||||
gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
|
||||
|
||||
gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
|
||||
|
||||
genotypes.add(gb.make());
|
||||
}
|
||||
vcb = vcb.genotypes(genotypes);
|
||||
|
||||
vcfWriter.add(vcb.make());
|
||||
|
|
@ -345,6 +342,8 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
// FORMAT fields for each genotype
|
||||
headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
|
||||
headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size."));
|
||||
headerLines.add(new VCFFormatHeaderLine(LOW_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with low coverage (below the minimum coverage) but not zero."));
|
||||
headerLines.add(new VCFFormatHeaderLine(ZERO_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with zero coverage."));
|
||||
|
||||
// FILTER fields
|
||||
for (CallableStatus stat : CallableStatus.values())
|
||||
|
|
|
|||
|
|
@ -56,11 +56,10 @@ import java.util.*;
|
|||
final class IntervalStratification extends AbstractStratification {
|
||||
private final Map<String, AbstractStratification> samples;
|
||||
private final GenomeLoc interval;
|
||||
private final ThresHolder thresholds;
|
||||
|
||||
public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
this.interval = interval;
|
||||
this.thresholds = thresholds;
|
||||
this.samples = new HashMap<String, AbstractStratification>(samples.size());
|
||||
for (String sample : samples)
|
||||
this.samples.put(sample, new SampleStratification(interval, thresholds));
|
||||
|
|
@ -125,7 +124,7 @@ final class IntervalStratification extends AbstractStratification {
|
|||
}
|
||||
}
|
||||
|
||||
output.addAll(queryStatus(thresholds.intervalMetricList, this));
|
||||
output.addAll(queryStatus(thresholds.intervalMetricList));
|
||||
|
||||
return output;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,22 +46,20 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
final class LocusStratification extends AbstractStratification {
|
||||
private long coverage;
|
||||
private long rawCoverage;
|
||||
private final List<Metric> locusStatisticsList;
|
||||
|
||||
public LocusStratification(ThresHolder thresholds) {
|
||||
this(0,0,thresholds);
|
||||
}
|
||||
|
||||
protected LocusStratification(int coverage, int rawCoverage, ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
this.coverage = coverage;
|
||||
this.rawCoverage = rawCoverage;
|
||||
this.locusStatisticsList = thresholds.locusMetricList;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -79,14 +77,7 @@ final class LocusStratification extends AbstractStratification {
|
|||
* @return a set of all statuses that apply
|
||||
*/
|
||||
public List<CallableStatus> callableStatuses() {
|
||||
List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
for (Metric stats : locusStatisticsList) {
|
||||
CallableStatus status = stats.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
return output;
|
||||
return queryStatus(thresholds.locusMetricList);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -58,6 +58,6 @@ final class PluginUtils {
|
|||
final Map<CallableStatus, Integer> totals = sampleStratification.getStatusTally();
|
||||
final int size = sampleStratification.getIntervalSize();
|
||||
final int statusCount = totals.containsKey(CALL) ? totals.get(CALL) : 0;
|
||||
return ( (double) statusCount / size) >= threshold ? CALL: null;
|
||||
return ( (double) statusCount / size) > threshold ? CALL: null;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,15 +61,14 @@ import java.util.List;
|
|||
final class SampleStratification extends AbstractStratification {
|
||||
private final GenomeLoc interval;
|
||||
private final ArrayList<AbstractStratification> loci;
|
||||
private final ThresHolder thresholds;
|
||||
|
||||
private int nReads = -1;
|
||||
private int nBadMates = -1;
|
||||
|
||||
public SampleStratification(final GenomeLoc interval, final ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
this.interval = interval;
|
||||
this.loci = new ArrayList<AbstractStratification>(interval.size());
|
||||
this.thresholds = thresholds;
|
||||
nReads = 0;
|
||||
nBadMates = 0;
|
||||
|
||||
|
|
@ -121,7 +120,7 @@ final class SampleStratification extends AbstractStratification {
|
|||
public Iterable<CallableStatus> callableStatuses() {
|
||||
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
|
||||
// get the tally of all the locus callable statuses
|
||||
// get the sample statuses of all the Loci Metrics
|
||||
for (Metric locusStat : thresholds.locusMetricList) {
|
||||
final CallableStatus status = ((LocusMetric) locusStat).sampleStatus(this);
|
||||
if (status != null) {
|
||||
|
|
@ -130,12 +129,7 @@ final class SampleStratification extends AbstractStratification {
|
|||
}
|
||||
|
||||
// get the sample specific statitics statuses
|
||||
for (Metric sampleStat : thresholds.sampleMetricList) {
|
||||
final CallableStatus status = sampleStat.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
output.addAll(queryStatus(thresholds.sampleMetricList));
|
||||
|
||||
// special case, if there are no reads, then there is no sense reporting coverage gaps.
|
||||
if (output.contains(CallableStatus.NO_READS) && output.contains(CallableStatus.COVERAGE_GAPS))
|
||||
|
|
@ -159,4 +153,17 @@ final class SampleStratification extends AbstractStratification {
|
|||
read.setTemporaryAttribute("seen", true);
|
||||
}
|
||||
}
|
||||
|
||||
public int getNLowCoveredLoci() {
|
||||
return getCallableStatusCount(CallableStatus.LOW_COVERAGE);
|
||||
}
|
||||
|
||||
public int getNUncoveredLoci() {
|
||||
return getCallableStatusCount(CallableStatus.COVERAGE_GAPS);
|
||||
}
|
||||
|
||||
private int getCallableStatusCount(CallableStatus status) {
|
||||
final Integer x = getStatusTally().get(status);
|
||||
return x == null ? 0 : x;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue