Merge pull request #223 from broadinstitute/mc_dt_gaddy_outputs
Bug fixes and missing interval functionality for Diagnose Targets While the code seems fine, the complex parts of it are untested. This is probably fine for now, but private code can have a tendency to creep into the codebase once accepted. I would have preferred that unit test OR a big comment stating that the code is untested (and thus broken by Mark's rule). It is with these cavets that I accept the pull request.
This commit is contained in:
commit
9234a0efcd
|
|
@ -63,6 +63,10 @@ abstract class AbstractStratification {
|
|||
private Map<CallableStatus, Integer> statusTally = null;
|
||||
protected ThresHolder thresholds;
|
||||
|
||||
public AbstractStratification(ThresHolder thresholds) {
|
||||
this.thresholds = thresholds;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the average "good" coverage of this sample. Good means "passes the base and
|
||||
* mapping quality requirements.
|
||||
|
|
@ -116,11 +120,11 @@ abstract class AbstractStratification {
|
|||
*
|
||||
* @return the callable status(es) for the whole object
|
||||
*/
|
||||
public abstract Iterable<CallableStatus> callableStatuses();
|
||||
public abstract List<CallableStatus> callableStatuses();
|
||||
|
||||
|
||||
/**
|
||||
* Tally up all the callable status of all the loci in this sample.
|
||||
* Tally up all the callable status of all elements of the stratification.
|
||||
*
|
||||
* @return a map of callable status and counts
|
||||
*/
|
||||
|
|
@ -136,10 +140,10 @@ abstract class AbstractStratification {
|
|||
return statusTally;
|
||||
}
|
||||
|
||||
public static List<CallableStatus> queryStatus(List<Metric> statList, AbstractStratification stratification) {
|
||||
public List<CallableStatus> queryStatus(List<Metric> statList) {
|
||||
List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
for (Metric stat : statList) {
|
||||
final CallableStatus status = stat.status(stratification);
|
||||
final CallableStatus status = stat.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,6 +65,7 @@ import org.broadinstitute.variant.variantcontext.*;
|
|||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -112,6 +113,9 @@ import java.util.*;
|
|||
public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||
|
||||
private static final String AVG_INTERVAL_DP_KEY = "IDP";
|
||||
private static final String LOW_COVERAGE_LOCI = "LL";
|
||||
private static final String ZERO_COVERAGE_LOCI = "ZL";
|
||||
|
||||
|
||||
@Output(doc = "File to which interval statistics should be written")
|
||||
private VariantContextWriter vcfWriter = null;
|
||||
|
|
@ -119,13 +123,12 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
@ArgumentCollection
|
||||
private ThresHolder thresholds = new ThresHolder();
|
||||
|
||||
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
|
||||
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
|
||||
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
|
||||
private Set<String> samples = null; // all the samples being processed
|
||||
private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
|
||||
private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
|
||||
|
||||
private static final int INITIAL_HASH_SIZE = 500000;
|
||||
private static final int INITIAL_HASH_SIZE = 50; // enough room for potential overlapping intervals plus recently finished intervals
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
|
|
@ -134,7 +137,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
|
||||
throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");
|
||||
|
||||
intervalMap = new HashMap<GenomeLoc, IntervalStratification>(INITIAL_HASH_SIZE);
|
||||
intervalMap = new LinkedHashMap<GenomeLoc, IntervalStratification>(INITIAL_HASH_SIZE);
|
||||
intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
|
||||
|
||||
// get all of the unique sample names for the VCF Header
|
||||
|
|
@ -146,13 +149,13 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
public Long map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
GenomeLoc refLocus = ref.getLocus();
|
||||
|
||||
// process and remove any intervals in the map that are don't overlap the current locus anymore
|
||||
// and add all new intervals that may overlap this reference locus
|
||||
outputFinishedIntervals(refLocus, ref.getBase());
|
||||
addNewOverlappingIntervals(refLocus);
|
||||
outputFinishedIntervals(refLocus, ref.getBase());
|
||||
|
||||
// at this point, all intervals in intervalMap overlap with this locus, so update all of them
|
||||
for (IntervalStratification intervalStratification : intervalMap.values())
|
||||
|
|
@ -184,7 +187,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
* @param result number of loci processed by the walker
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Long result) {
|
||||
public void onTraversalDone(final Long result) {
|
||||
for (GenomeLoc interval : intervalMap.keySet())
|
||||
outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
|
||||
|
||||
|
|
@ -194,6 +197,10 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
if (thresholds.missingTargets != null) {
|
||||
thresholds.missingTargets.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -203,24 +210,21 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
* @param refBase the reference allele
|
||||
*/
|
||||
private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
|
||||
// output empty statistics for uncovered intervals
|
||||
while (interval != null && interval.isBefore(refLocus)) {
|
||||
final IntervalStratification stats = intervalMap.get(interval);
|
||||
outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE);
|
||||
if (stats != null) intervalMap.remove(interval);
|
||||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
// remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals)
|
||||
// output any intervals that were finished
|
||||
final List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>();
|
||||
for (GenomeLoc key : intervalMap.keySet()) {
|
||||
if (key.isBefore(refLocus)) {
|
||||
outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true));
|
||||
intervalMap.remove(key);
|
||||
final IntervalStratification intervalStats = intervalMap.get(key);
|
||||
outputStatsToVCF(intervalStats, Allele.create(refBase, true));
|
||||
if (hasMissingLoci(intervalStats)) {
|
||||
outputMissingInterval(intervalStats);
|
||||
}
|
||||
toRemove.add(key);
|
||||
}
|
||||
}
|
||||
for (GenomeLoc key : toRemove) {
|
||||
intervalMap.remove(key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -228,7 +232,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
*
|
||||
* @param refLocus the current reference locus
|
||||
*/
|
||||
private void addNewOverlappingIntervals(GenomeLoc refLocus) {
|
||||
private void addNewOverlappingIntervals(final GenomeLoc refLocus) {
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
while (interval != null && !interval.isPast(refLocus)) {
|
||||
intervalMap.put(interval, createIntervalStatistic(interval));
|
||||
|
|
@ -243,14 +247,24 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
* @param stats The statistics of the interval
|
||||
* @param refAllele the reference allele
|
||||
*/
|
||||
private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) {
|
||||
private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) {
|
||||
GenomeLoc interval = stats.getInterval();
|
||||
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
final Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
final ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
for (String sample : samples) {
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sample);
|
||||
|
||||
SampleStratification sampleStat = stats.getSampleStatistics(sample);
|
||||
gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
|
||||
gb.attribute(LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci());
|
||||
gb.attribute(ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci());
|
||||
gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
|
||||
|
||||
genotypes.add(gb.make());
|
||||
}
|
||||
alleles.add(refAllele);
|
||||
alleles.add(SYMBOLIC_ALLELE);
|
||||
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles);
|
||||
|
|
@ -262,21 +276,56 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
|
||||
|
||||
vcb = vcb.attributes(attributes);
|
||||
for (String sample : samples) {
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sample);
|
||||
|
||||
SampleStratification sampleStat = stats.getSampleStatistics(sample);
|
||||
gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
|
||||
|
||||
gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
|
||||
|
||||
genotypes.add(gb.make());
|
||||
}
|
||||
vcb = vcb.genotypes(genotypes);
|
||||
|
||||
vcfWriter.add(vcb.make());
|
||||
}
|
||||
|
||||
private boolean hasMissingStatuses(AbstractStratification stats) {
|
||||
return !stats.callableStatuses().isEmpty();
|
||||
}
|
||||
|
||||
private boolean hasMissingLoci(final IntervalStratification stats) {
|
||||
return thresholds.missingTargets != null && hasMissingStatuses(stats);
|
||||
}
|
||||
|
||||
private void outputMissingInterval(final IntervalStratification stats) {
|
||||
final GenomeLoc interval = stats.getInterval();
|
||||
final boolean missing[] = new boolean[interval.size()];
|
||||
Arrays.fill(missing, true);
|
||||
for (AbstractStratification sample : stats.getElements()) {
|
||||
if (hasMissingStatuses(sample)) {
|
||||
int pos = 0;
|
||||
for (AbstractStratification locus : sample.getElements()) {
|
||||
if (locus.callableStatuses().isEmpty()) {
|
||||
missing[pos] = false;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
}
|
||||
int start = -1;
|
||||
boolean insideMissing = false;
|
||||
for (int i = 0; i < missing.length; i++) {
|
||||
if (missing[i] && !insideMissing) {
|
||||
start = interval.getStart() + i;
|
||||
insideMissing = true;
|
||||
} else if (!missing[i] && insideMissing) {
|
||||
final int stop = interval.getStart() + i - 1;
|
||||
outputMissingInterval(interval.getContig(), start, stop);
|
||||
insideMissing = false;
|
||||
}
|
||||
}
|
||||
if (insideMissing) {
|
||||
outputMissingInterval(interval.getContig(), start, interval.getStop());
|
||||
}
|
||||
}
|
||||
|
||||
private void outputMissingInterval(final String contig, final int start, final int stop) {
|
||||
final PrintStream out = thresholds.missingTargets;
|
||||
out.println(String.format("%s:%d-%d", contig, start, stop));
|
||||
}
|
||||
|
||||
/**
|
||||
* Function that process a set of statuses into strings
|
||||
*
|
||||
|
|
@ -345,6 +394,8 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
// FORMAT fields for each genotype
|
||||
headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
|
||||
headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size."));
|
||||
headerLines.add(new VCFFormatHeaderLine(LOW_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with low coverage (below the minimum coverage) but not zero."));
|
||||
headerLines.add(new VCFFormatHeaderLine(ZERO_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with zero coverage."));
|
||||
|
||||
// FILTER fields
|
||||
for (CallableStatus stat : CallableStatus.values())
|
||||
|
|
|
|||
|
|
@ -56,11 +56,11 @@ import java.util.*;
|
|||
final class IntervalStratification extends AbstractStratification {
|
||||
private final Map<String, AbstractStratification> samples;
|
||||
private final GenomeLoc interval;
|
||||
private final ThresHolder thresholds;
|
||||
private List<CallableStatus> callableStatuses;
|
||||
|
||||
public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
this.interval = interval;
|
||||
this.thresholds = thresholds;
|
||||
this.samples = new HashMap<String, AbstractStratification>(samples.size());
|
||||
for (String sample : samples)
|
||||
this.samples.put(sample, new SampleStratification(interval, thresholds));
|
||||
|
|
@ -114,7 +114,13 @@ final class IntervalStratification extends AbstractStratification {
|
|||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Iterable<CallableStatus> callableStatuses() {
|
||||
public List<CallableStatus> callableStatuses() {
|
||||
if (callableStatuses == null)
|
||||
callableStatuses = calculateStatus();
|
||||
return callableStatuses;
|
||||
}
|
||||
|
||||
private List<CallableStatus> calculateStatus() {
|
||||
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
|
||||
// check if any of the votes pass the threshold
|
||||
|
|
@ -125,7 +131,7 @@ final class IntervalStratification extends AbstractStratification {
|
|||
}
|
||||
}
|
||||
|
||||
output.addAll(queryStatus(thresholds.intervalMetricList, this));
|
||||
output.addAll(queryStatus(thresholds.intervalMetricList));
|
||||
|
||||
return output;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,22 +46,20 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
final class LocusStratification extends AbstractStratification {
|
||||
private long coverage;
|
||||
private long rawCoverage;
|
||||
private final List<Metric> locusStatisticsList;
|
||||
|
||||
public LocusStratification(ThresHolder thresholds) {
|
||||
this(0,0,thresholds);
|
||||
}
|
||||
|
||||
protected LocusStratification(int coverage, int rawCoverage, ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
this.coverage = coverage;
|
||||
this.rawCoverage = rawCoverage;
|
||||
this.locusStatisticsList = thresholds.locusMetricList;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -79,14 +77,7 @@ final class LocusStratification extends AbstractStratification {
|
|||
* @return a set of all statuses that apply
|
||||
*/
|
||||
public List<CallableStatus> callableStatuses() {
|
||||
List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
for (Metric stats : locusStatisticsList) {
|
||||
CallableStatus status = stats.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
return output;
|
||||
return queryStatus(thresholds.locusMetricList);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -58,6 +58,6 @@ final class PluginUtils {
|
|||
final Map<CallableStatus, Integer> totals = sampleStratification.getStatusTally();
|
||||
final int size = sampleStratification.getIntervalSize();
|
||||
final int statusCount = totals.containsKey(CALL) ? totals.get(CALL) : 0;
|
||||
return ( (double) statusCount / size) >= threshold ? CALL: null;
|
||||
return ( (double) statusCount / size) > threshold ? CALL: null;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,15 +61,14 @@ import java.util.List;
|
|||
final class SampleStratification extends AbstractStratification {
|
||||
private final GenomeLoc interval;
|
||||
private final ArrayList<AbstractStratification> loci;
|
||||
private final ThresHolder thresholds;
|
||||
|
||||
private int nReads = -1;
|
||||
private int nBadMates = -1;
|
||||
|
||||
public SampleStratification(final GenomeLoc interval, final ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
this.interval = interval;
|
||||
this.loci = new ArrayList<AbstractStratification>(interval.size());
|
||||
this.thresholds = thresholds;
|
||||
nReads = 0;
|
||||
nBadMates = 0;
|
||||
|
||||
|
|
@ -118,10 +117,10 @@ final class SampleStratification extends AbstractStratification {
|
|||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Iterable<CallableStatus> callableStatuses() {
|
||||
public List<CallableStatus> callableStatuses() {
|
||||
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
|
||||
// get the tally of all the locus callable statuses
|
||||
// get the sample statuses of all the Loci Metrics
|
||||
for (Metric locusStat : thresholds.locusMetricList) {
|
||||
final CallableStatus status = ((LocusMetric) locusStat).sampleStatus(this);
|
||||
if (status != null) {
|
||||
|
|
@ -130,12 +129,7 @@ final class SampleStratification extends AbstractStratification {
|
|||
}
|
||||
|
||||
// get the sample specific statitics statuses
|
||||
for (Metric sampleStat : thresholds.sampleMetricList) {
|
||||
final CallableStatus status = sampleStat.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
output.addAll(queryStatus(thresholds.sampleMetricList));
|
||||
|
||||
// special case, if there are no reads, then there is no sense reporting coverage gaps.
|
||||
if (output.contains(CallableStatus.NO_READS) && output.contains(CallableStatus.COVERAGE_GAPS))
|
||||
|
|
@ -159,4 +153,17 @@ final class SampleStratification extends AbstractStratification {
|
|||
read.setTemporaryAttribute("seen", true);
|
||||
}
|
||||
}
|
||||
|
||||
public int getNLowCoveredLoci() {
|
||||
return getCallableStatusCount(CallableStatus.LOW_COVERAGE);
|
||||
}
|
||||
|
||||
public int getNUncoveredLoci() {
|
||||
return getCallableStatusCount(CallableStatus.COVERAGE_GAPS);
|
||||
}
|
||||
|
||||
private int getCallableStatusCount(CallableStatus status) {
|
||||
final Integer x = getStatusTally().get(status);
|
||||
return x == null ? 0 : x;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,7 +47,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -114,6 +116,9 @@ final class ThresHolder {
|
|||
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false)
|
||||
public double qualityStatusThreshold = 0.50;
|
||||
|
||||
@Output(fullName = "missing_intervals", shortName = "missing", doc ="Produces a file with the intervals that don't pass filters", required = false)
|
||||
public PrintStream missingTargets = null;
|
||||
|
||||
public final List<Metric> locusMetricList = new LinkedList<Metric>();
|
||||
public final List<Metric> sampleMetricList = new LinkedList<Metric>();
|
||||
public final List<Metric> intervalMetricList = new LinkedList<Metric>();
|
||||
|
|
|
|||
|
|
@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testSingleSample() {
|
||||
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "850304909477afa8c2a8f128d6eedde9");
|
||||
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "1771e95aed2b3b240dc353f84e19847d");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultiSample() {
|
||||
DTTest("testMultiSample ", "-I " + multiSample, "bedd19bcf21d1a779f6706c0351c9d26");
|
||||
DTTest("testMultiSample ", "-I " + multiSample, "c7f1691dbe5f121c4a79be823d3057e5");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue