diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index b6a40f167..d73b22664 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import net.sf.picard.util.PeekableIterator; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -32,8 +33,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocComparator; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -79,10 +78,7 @@ public class DiagnoseTargets extends LocusWalker implements Annotato private IntervalBinding intervalTrack = null; @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter vcfWriter = null; - - @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false) - private int expandInterval = 50; + private VCFWriter vcfWriter = null; @Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false) private int minimumBaseQuality = 20; @@ -96,13 +92,11 @@ public class DiagnoseTargets extends LocusWalker implements Annotato @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false) private int maximumCoverage = 700; - private TreeSet intervalList = null; // The list of intervals of interest (plus expanded intervals if user wants them) private HashMap intervalMap = null; // interval => statistics - private Iterator intervalListIterator; // An iterator to go over all the intervals provided as we traverse the genome - private GenomeLoc currentInterval = null; // The "current" interval loaded - private IntervalStatistics currentIntervalStatistics = null; // The "current" interval being filled with statistics - private Set samples = null; // All the samples being processed - private GenomeLocParser parser; // just an object to allow us to create genome locs (for the expanded intervals) + private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome + private Set samples = null; // all the samples being processed + + private final Allele SYMBOLIC_ALLELE = Allele.create("
", false); // avoid creating the symbolic allele multiple times @Override public void initialize() { @@ -111,72 +105,22 @@ public class DiagnoseTargets extends LocusWalker implements Annotato if (intervalTrack == null) throw new UserException("This tool currently only works if you provide an interval track"); - parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below - - List originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided - intervalList = new TreeSet(new GenomeLocComparator()); intervalMap = new HashMap(); - for (GenomeLoc interval : originalList) - intervalList.add(interval); - //addAndExpandIntervalToMap(interval); + intervalListIterator = new PeekableIterator(intervalTrack.getIntervals(getToolkit()).listIterator()); - intervalListIterator = intervalList.iterator(); - - // get all of the unique sample names - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - // initialize the header - Set headerInfo = getHeaderInfo(); - - vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); - } - - /** - * Gets the header lines for the VCF writer - * - * @return A set of VCF header lines - */ - private Set getHeaderInfo() { - Set headerLines = new HashSet(); - - // INFO fields for overall data - headerLines.add(new VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - headerLines.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools")); - headerLines.add(new VCFInfoHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); - headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); - - // FORMAT fields for each genotype - headerLines.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools")); - headerLines.add(new VCFFormatHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); - - // FILTER fields - - for (CallableStatus stat : CallableStatus.values()) { - headerLines.add(new VCFHeaderLine(stat.name(), stat.description)); - } - - return headerLines; + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header + vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header } @Override public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); - while (currentInterval == null || currentInterval.isBefore(refLocus)) { // do this for first time and while currentInterval is behind current locus - if (!intervalListIterator.hasNext()) - return 0L; - if (currentInterval != null) - processIntervalStats(currentInterval, Allele.create(ref.getBase(), true)); + removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore + addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus - currentInterval = intervalListIterator.next(); - addAndExpandIntervalToMap(currentInterval); - currentIntervalStatistics = intervalMap.get(currentInterval); - } - - if (currentInterval.isPast(refLocus)) // skip if we are behind the current interval - return 0L; - - currentIntervalStatistics.addLocus(context); // Add current locus to stats + for (IntervalStatistics intervalStatistics : intervalMap.values()) + intervalStatistics.addLocus(context); // Add current locus to stats return 1L; } @@ -198,10 +142,15 @@ public class DiagnoseTargets extends LocusWalker implements Annotato return sum + value; } + /** + * Process all remaining intervals + * + * @param result number of loci processed by the walker + */ @Override public void onTraversalDone(Long result) { - for (GenomeLoc interval : intervalMap.keySet()) - processIntervalStats(interval, Allele.create("
", true)); + for (GenomeLoc interval : intervalMap.keySet()) + processIntervalStats(intervalMap.get(interval), Allele.create("A")); } @Override @@ -219,82 +168,111 @@ public class DiagnoseTargets extends LocusWalker implements Annotato @Override public boolean alwaysAppendDbsnpId() {return false;} - private GenomeLoc createIntervalBefore(GenomeLoc interval) { - int start = Math.max(interval.getStart() - expandInterval, 0); - int stop = Math.max(interval.getStart() - 1, 0); - return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); - } + /** + * Removes all intervals that are behind the current reference locus from the intervalMap + * + * @param refLocus the current reference locus + * @param refBase the reference allele + */ + private void removePastIntervals(GenomeLoc refLocus, byte refBase) { + List toRemove = new LinkedList(); + for (GenomeLoc interval : intervalMap.keySet()) + if (interval.isBefore(refLocus)) { + processIntervalStats(intervalMap.get(interval), Allele.create(refBase, true)); + toRemove.add(interval); + } - private GenomeLoc createIntervalAfter(GenomeLoc interval) { - int contigLimit = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(interval.getContigIndex()).getSequenceLength(); - int start = Math.min(interval.getStop() + 1, contigLimit); - int stop = Math.min(interval.getStop() + expandInterval, contigLimit); - return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); + for (GenomeLoc interval : toRemove) + intervalMap.remove(interval); + + GenomeLoc interval = intervalListIterator.peek(); // clean up all intervals that we might have skipped because there was no data + while(interval != null && interval.isBefore(refLocus)) { + interval = intervalListIterator.next(); + processIntervalStats(createIntervalStatistic(interval), Allele.create(refBase, true)); + interval = intervalListIterator.peek(); + } } /** - * Takes an interval and commits it to memory. - * It will expand it if so told by the -exp command line argument + * Adds all intervals that overlap the current reference locus to the intervalMap * - * @param interval The new interval to process + * @param refLocus the current reference locus */ - private void addAndExpandIntervalToMap(GenomeLoc interval) { - if (expandInterval > 0) { - GenomeLoc before = createIntervalBefore(interval); - GenomeLoc after = createIntervalAfter(interval); - intervalList.add(before); - intervalList.add(after); - intervalMap.put(before, new IntervalStatistics(samples, before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); - intervalMap.put(after, new IntervalStatistics(samples, after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + private void addNewOverlappingIntervals(GenomeLoc refLocus) { + GenomeLoc interval = intervalListIterator.peek(); + while (interval != null && !interval.isPast(refLocus)) { + System.out.println("LOCUS : " + refLocus + " -- " + interval); + intervalMap.put(interval, createIntervalStatistic(interval)); + intervalListIterator.next(); // discard the interval (we've already added it to the map) + interval = intervalListIterator.peek(); } - if (!intervalList.contains(interval)) - intervalList.add(interval); - intervalMap.put(interval, new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); } /** * Takes the interval, finds it in the stash, prints it to the VCF, and removes it * - * @param interval The interval in memory that you want to write out and clear - * @param allele the allele + * @param stats The statistics of the interval + * @param refAllele the reference allele */ - private void processIntervalStats(GenomeLoc interval, Allele allele) { - IntervalStatistics stats = intervalMap.get(interval); - + private void processIntervalStats(IntervalStatistics stats, Allele refAllele) { + GenomeLoc interval = stats.getInterval(); + List alleles = new ArrayList(); Map attributes = new HashMap(); ArrayList genotypes = new ArrayList(); - alleles.add(allele); - VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); + alleles.add(refAllele); + alleles.add(SYMBOLIC_ALLELE); + VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF vcb.filters(statusesToStrings(stats.callableStatuses())); attributes.put(VCFConstants.END_KEY, interval.getStop()); - attributes.put(VCFConstants.DEPTH_KEY, stats.totalCoverage()); - attributes.put("AV", stats.averageCoverage()); + attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); vcb = vcb.attributes(attributes); for (String sample : samples) { Map infos = new HashMap(); - infos.put("DP", stats.getSample(sample).totalCoverage()); - infos.put("AV", stats.getSample(sample).averageCoverage()); + infos.put(VCFConstants.DEPTH_KEY, stats.getSample(sample).averageCoverage()); Set filters = new HashSet(); filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses())); - genotypes.add(new Genotype(sample, alleles, VariantContext.NO_LOG10_PERROR, filters, infos, false)); + genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false)); } vcb = vcb.genotypes(genotypes); vcfWriter.add(vcb.make()); - intervalMap.remove(interval); } + /** + * Gets the header lines for the VCF writer + * + * @return A set of VCF header lines + */ + private static Set getHeaderInfo() { + Set headerLines = new HashSet(); + + // INFO fields for overall data + headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); + + // FORMAT fields for each genotype + headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + + // FILTER fields + for (CallableStatus stat : CallableStatus.values()) + headerLines.add(new VCFHeaderLine(stat.name(), stat.description)); + + return headerLines; + } + + private static Set statusesToStrings(Set statuses) { Set output = new HashSet(statuses.size()); @@ -303,4 +281,8 @@ public class DiagnoseTargets extends LocusWalker implements Annotato return output; } + + private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { + return new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 75f56808f..f3246407b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.HashMap; @@ -52,18 +53,28 @@ public class IntervalStatistics { return samples.get(sample); } + public GenomeLoc getInterval() { + return interval; + } + public void addLocus(AlignmentContext context) { ReadBackedPileup pileup = context.getBasePileup(); - for (String sample : samples.keySet()) - getSample(sample).addLocus(context.getLocation(), pileup.getPileupForSample(sample)); + Map samplePileups = pileup.getPileupsForSamples(samples.keySet()); + + for (Map.Entry entry : samplePileups.entrySet()) { + String sample = entry.getKey(); + ReadBackedPileup samplePileup = entry.getValue(); + SampleStatistics sampleStatistics = samples.get(sample); + + if (sampleStatistics == null) + throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); + + sampleStatistics.addLocus(context.getLocation(), samplePileup); + } + } - public long totalCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return preComputedTotalCoverage; - } public double averageCoverage() { if (preComputedTotalCoverage < 0) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index ea6901bb3..e3107c195 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -677,11 +677,11 @@ public abstract class AbstractReadBackedPileup filteredElements = tracker.getElements(sampleNames); return filteredElements != null ? (RBP) createNewPileup(loc, filteredElements) : null; } else { - HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop + HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. + if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) filteredTracker.add(p); } else { @@ -693,6 +693,38 @@ public abstract class AbstractReadBackedPileup getPileupsForSamples(Collection sampleNames) { + Map result = new HashMap(); + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (String sample : sampleNames) { + PileupElementTracker filteredElements = tracker.getElements(sampleNames); + if (filteredElements != null) + result.put(sample, createNewPileup(loc, filteredElements)); + } + } else { + Map> trackerMap = new HashMap>(); + + for (String sample : sampleNames) { // initialize pileups for each sample + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + trackerMap.put(sample, filteredTracker); + } + for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup + GATKSAMRecord read = p.getRead(); + if (read.getReadGroup() != null) { + String sample = read.getReadGroup().getSample(); + UnifiedPileupElementTracker tracker = trackerMap.get(sample); + if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest + tracker.add(p); + } + } + for (Map.Entry> entry : trackerMap.entrySet()) // create the RBP for each sample + result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); + } + return result; + } + @Override public RBP getPileupForSample(String sampleName) { diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 110199f06..f15468840 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Map; /** * A data retrieval interface for accessing parts of the pileup. @@ -159,6 +160,16 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForSamples(Collection sampleNames); + /** + * Gets the particular subset of this pileup for each given sample name. + * + * Same as calling getPileupForSample for all samples, but in O(n) instead of O(n^2). + * + * @param sampleNames Name of the sample to use. + * @return A subset of this pileup containing only reads with the given sample. + */ + public Map getPileupsForSamples(Collection sampleNames); + /** * Gets the particular subset of this pileup with the given sample name.