DiagnoseTargets with working Q1,Median,Q3

- Merged Roger's metrics with Mauricio's optimizations
 - Added Stats for DiagnoseTargets
     - now has functions to find the median depth, and upper/lower quartile
     - the REF_N callable status is implemented
 - The walker now runs efficiently
 - DiagnoseTargets accepts overlapping intervals
 - DiagnoseTargets now checks for bad mates
 - The read mates are checked in a memory efficient manner
 - The statistics thresholds have been consolidated and moved outside of the statistics classes and into the walker.
 - Fixed some bugs
 - Removed rod binding

Added more Unit tests

 - Test callable statuses on the locus level
 - Test bad mates

 - Changed NO_COVERAGE -> COVERAGE_GAPS to avoid confusion

Signed-off-by: Mauricio Carneiro <carneiro@broadinstitute.org>
This commit is contained in:
Roger Zurawicki 2012-04-12 18:54:29 -04:00 committed by Mauricio Carneiro
parent 50031b63c5
commit b8b139841d
9 changed files with 651 additions and 169 deletions

View File

@ -31,38 +31,29 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
* @since 2/1/12 * @since 2/1/12
*/ */
public enum CallableStatus { public enum CallableStatus {
/**
* the reference base was an N, which is not considered callable the GATK
*/
// todo -- implement this status
REF_N("the reference base was an N, which is not considered callable the GATK"), REF_N("the reference base was an N, which is not considered callable the GATK"),
/**
* the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
*/
PASS("the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"), PASS("the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"),
/**
* absolutely no reads were seen at this locus, regardless of the filtering parameters COVERAGE_GAPS("absolutely no coverage was observed at a locus, regardless of the filtering parameters"),
*/
NO_COVERAGE("absolutely no reads were seen at this locus, regardless of the filtering parameters"),
/**
* there were less than min. depth bases at the locus, after applying filters
*/
LOW_COVERAGE("there were less than min. depth bases at the locus, after applying filters"), LOW_COVERAGE("there were less than min. depth bases at the locus, after applying filters"),
/**
* more than -maxDepth read at the locus, indicating some sort of mapping problem
*/
EXCESSIVE_COVERAGE("more than -maxDepth read at the locus, indicating some sort of mapping problem"), EXCESSIVE_COVERAGE("more than -maxDepth read at the locus, indicating some sort of mapping problem"),
/**
* more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
*/
POOR_QUALITY("more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads"), POOR_QUALITY("more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads"),
BAD_MATE(""), BAD_MATE("the reads are not properly mated, suggesting mapping errors"),
INCONSISTENT_COVERAGE(""); NO_READS("there are no reads contained in the interval"),
//
// Interval-level statuses
//
LOW_MEDIAN_DEPTH("interval has insufficient median depth across samples");
public String description; public final String description;
private CallableStatus(String description) { private CallableStatus(String description) {
this.description = description; this.description = description;

View File

@ -25,89 +25,126 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import net.sf.picard.util.PeekableIterator; import net.sf.picard.util.PeekableIterator;
import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import java.util.*; import java.util.*;
/** /**
* Short one line description of the walker. * Analyzes coverage distribution and validates read mates for a given interval and sample.
* <p/> * <p/>
* <p> * <p>
* [Long description of the walker] * Used to diagnose regions with bad coverage, mapping, or read mating. Analyzes each sample independently in addition
* to interval wide analysis.
* </p> * </p>
* <p/> * <p/>
* <p/> * <p/>
* <h2>Input</h2> * <h2>Input</h2>
* <p> * <p>
* [Description of the Input] * <ul>
* <li>A reference file</li>
* <li>one or more input BAMs</li>
* <li>One or more intervals</li>
* </ul>
* </p> * </p>
* <p/> * <p/>
* <h2>Output</h2> * <h2>Output</h2>
* <p> * <p>
* [Description of the Output] * A modified VCF detailing each interval by sample
* </p> * </p>
* <p/> * <p/>
* <h2>Examples</h2> * <h2>Examples</h2>
* <pre> * <pre>
* java * java
* -jar GenomeAnalysisTK.jar * -jar GenomeAnalysisTK.jar
* -T [walker name] * -T DiagnoseTargets \
* -R reference.fasta \
* -o output.vcf \
* -I sample1.bam \
* -I sample2.bam \
* -I sample3.bam \
* -L intervals.interval_list
* </pre> * </pre>
* *
* @author Mauricio Carneiro * @author Mauricio Carneiro, Roger Zurawicki
* @since 2/1/12 * @since 5/8/12
*/ */
@By(value = DataSource.READS) @By(value = DataSource.READS)
@PartitionBy(PartitionType.INTERVAL) @PartitionBy(PartitionType.INTERVAL)
public class DiagnoseTargets extends LocusWalker<Long, Long> implements AnnotatorCompatibleWalker { public class DiagnoseTargets extends LocusWalker<Long, Long> {
@Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
private IntervalBinding<Feature> intervalTrack = null;
@Output(doc = "File to which variants should be written", required = true) @Output(doc = "File to which variants should be written", required = true)
private VariantContextWriter vcfWriter = null; private VariantContextWriter vcfWriter = null;
@Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false) @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false)
private int minimumBaseQuality = 20; private int minimumBaseQuality = 20;
@Argument(fullName = "minimum_mapping_quality", shortName = "mmq", doc = "", required = false) @Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false)
private int minimumMappingQuality = 20; private int minimumMappingQuality = 20;
@Argument(fullName = "minimum_coverage", shortName = "mincov", doc = "", required = false) @Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false)
private int minimumCoverage = 5; private int minimumCoverage = 5;
@Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false) @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false)
private int maximumCoverage = 700; private int maximumCoverage = 700;
@Argument(fullName = "minimum_median_depth", shortName = "med", doc = "The minimum allowable median coverage, used for calling LOW_MEDIAN_DEPTH", required = false)
private int minMedianDepth = 20;
@Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false)
private int maxInsertSize = 50;
@Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed percentage of samples containing a call for the interval to adopt the call ", required = false)
private double votePercentage = 0.50;
@Argument(fullName = "low_median_depth_status_threshold", shortName = "stMED", doc = "The percentage of the loci needed for calling LOW_MEDIAN_DEPTH", required = false)
private double lowMedianDepthPercentage = 0.20;
@Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The percentage of the loci needed for calling BAD_MATE", required = false)
private double badMateStatusThreshold = 0.50;
@Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The percentage of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false)
private double coverageStatusThreshold = 0.20;
@Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The percentage of the loci needed for calling EXCESSIVE_COVERAGE", required = false)
private double excessiveCoverageThreshold = 0.20;
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The percentage of the loci needed for calling POOR_QUALITY", required = false)
private double qualityStatusThreshold = 0.50;
@Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false)
private boolean debug = false;
private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // interval => statistics private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // interval => statistics
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
private Set<String> samples = null; // all the samples being processed private Set<String> samples = null; // all the samples being processed
private final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times private final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
private ThresHolder thresholds = null;
@Override @Override
public void initialize() { public void initialize() {
super.initialize(); super.initialize();
if (intervalTrack == null) if (getToolkit().getIntervals() == null)
throw new UserException("This tool currently only works if you provide an interval track"); throw new UserException("This tool currently only works if you provide one or more interval");
thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold);
intervalMap = new HashMap<GenomeLoc, IntervalStatistics>(); intervalMap = new HashMap<GenomeLoc, IntervalStatistics>();
intervalListIterator = new PeekableIterator<GenomeLoc>(intervalTrack.getIntervals(getToolkit()).listIterator()); intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header
vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header
@ -121,7 +158,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> implements Annotato
addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus
for (IntervalStatistics intervalStatistics : intervalMap.values()) for (IntervalStatistics intervalStatistics : intervalMap.values())
intervalStatistics.addLocus(context); // Add current locus to stats intervalStatistics.addLocus(context, ref, thresholds); // Add current locus to stats
return 1L; return 1L;
} }
@ -151,45 +188,40 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> implements Annotato
@Override @Override
public void onTraversalDone(Long result) { public void onTraversalDone(Long result) {
for (GenomeLoc interval : intervalMap.keySet()) for (GenomeLoc interval : intervalMap.keySet())
processIntervalStats(intervalMap.get(interval), Allele.create("A")); outputStatsToVCF(intervalMap.get(interval), Allele.create("A", true));
} }
@Override private GenomeLoc getIntervalMapSpan() {
public RodBinding<VariantContext> getSnpEffRodBinding() {return null;} GenomeLoc loc = null;
for (GenomeLoc interval : intervalMap.keySet()) {
if (loc == null) {
loc = interval;
} else
loc = interval.union(loc);
}
@Override return loc;
public RodBinding<VariantContext> getDbsnpRodBinding() {return null;} }
@Override
public List<RodBinding<VariantContext>> getCompRodBindings() {return null;}
@Override
public List<RodBinding<VariantContext>> getResourceRodBindings() {return null;}
@Override
public boolean alwaysAppendDbsnpId() {return false;}
/** /**
* Removes all intervals that are behind the current reference locus from the intervalMap * Removes all intervals that are behind the current reference locus from the intervalMap
* *
* @param refLocus the current reference locus * @param refLocus the current reference locus
* @param refBase the reference allele * @param refBase the reference allele
*/ */
private void removePastIntervals(GenomeLoc refLocus, byte refBase) { private void removePastIntervals(GenomeLoc refLocus, byte refBase) {
List<GenomeLoc> toRemove = new LinkedList<GenomeLoc>(); // if all intervals are safe
for (GenomeLoc interval : intervalMap.keySet()) if (getIntervalMapSpan() != null && getIntervalMapSpan().isBefore(refLocus)) {
if (interval.isBefore(refLocus)) { for (GenomeLoc interval : intervalMap.keySet()) {
processIntervalStats(intervalMap.get(interval), Allele.create(refBase, true)); outputStatsToVCF(intervalMap.get(interval), Allele.create(refBase, true));
toRemove.add(interval); intervalMap.remove(interval);
} }
}
for (GenomeLoc interval : toRemove)
intervalMap.remove(interval);
GenomeLoc interval = intervalListIterator.peek(); // clean up all intervals that we might have skipped because there was no data GenomeLoc interval = intervalListIterator.peek(); // clean up all intervals that we might have skipped because there was no data
while(interval != null && interval.isBefore(refLocus)) { while (interval != null && interval.isBefore(refLocus)) {
interval = intervalListIterator.next(); interval = intervalListIterator.next();
processIntervalStats(createIntervalStatistic(interval), Allele.create(refBase, true)); outputStatsToVCF(createIntervalStatistic(interval), Allele.create(refBase, true));
interval = intervalListIterator.peek(); interval = intervalListIterator.peek();
} }
} }
@ -202,7 +234,6 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> implements Annotato
private void addNewOverlappingIntervals(GenomeLoc refLocus) { private void addNewOverlappingIntervals(GenomeLoc refLocus) {
GenomeLoc interval = intervalListIterator.peek(); GenomeLoc interval = intervalListIterator.peek();
while (interval != null && !interval.isPast(refLocus)) { while (interval != null && !interval.isPast(refLocus)) {
System.out.println("LOCUS : " + refLocus + " -- " + interval);
intervalMap.put(interval, createIntervalStatistic(interval)); intervalMap.put(interval, createIntervalStatistic(interval));
intervalListIterator.next(); // discard the interval (we've already added it to the map) intervalListIterator.next(); // discard the interval (we've already added it to the map)
interval = intervalListIterator.peek(); interval = intervalListIterator.peek();
@ -210,14 +241,14 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> implements Annotato
} }
/** /**
* Takes the interval, finds it in the stash, prints it to the VCF, and removes it * Takes the interval, finds it in the stash, prints it to the VCF
* *
* @param stats The statistics of the interval * @param stats The statistics of the interval
* @param refAllele the reference allele * @param refAllele the reference allele
*/ */
private void processIntervalStats(IntervalStatistics stats, Allele refAllele) { private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) {
GenomeLoc interval = stats.getInterval(); GenomeLoc interval = stats.getInterval();
List<Allele> alleles = new ArrayList<Allele>(); List<Allele> alleles = new ArrayList<Allele>();
Map<String, Object> attributes = new HashMap<String, Object>(); Map<String, Object> attributes = new HashMap<String, Object>();
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(); ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
@ -227,7 +258,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> implements Annotato
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles); VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles);
vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF
vcb.filters(statusesToStrings(stats.callableStatuses())); vcb.filters(statusesToStrings(stats.callableStatuses(thresholds)));
attributes.put(VCFConstants.END_KEY, interval.getStop()); attributes.put(VCFConstants.END_KEY, interval.getStop());
attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage());
@ -236,16 +267,24 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> implements Annotato
for (String sample : samples) { for (String sample : samples) {
Map<String, Object> infos = new HashMap<String, Object>(); Map<String, Object> infos = new HashMap<String, Object>();
infos.put(VCFConstants.DEPTH_KEY, stats.getSample(sample).averageCoverage()); SampleStatistics sampleStat = stats.getSample(sample);
infos.put(VCFConstants.DEPTH_KEY, sampleStat.averageCoverage());
infos.put("Q1", sampleStat.getQuantileDepth(0.25));
infos.put("MED", sampleStat.getQuantileDepth(0.50));
infos.put("Q3", sampleStat.getQuantileDepth(0.75));
Set<String> filters = new HashSet<String>(); Set<String> filters = new HashSet<String>();
filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses())); filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds)));
genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false)); genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false));
} }
vcb = vcb.genotypes(genotypes); vcb = vcb.genotypes(genotypes);
if (debug) {
System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage());
}
vcfWriter.add(vcb.make()); vcfWriter.add(vcb.make());
} }
@ -264,7 +303,12 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> implements Annotato
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
// FORMAT fields for each genotype // FORMAT fields for each genotype
// todo -- find the appropriate VCF constants
headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution."));
headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution."));
headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution."));
// FILTER fields // FILTER fields
for (CallableStatus stat : CallableStatus.values()) for (CallableStatus stat : CallableStatus.values())
@ -273,8 +317,13 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> implements Annotato
return headerLines; return headerLines;
} }
/**
private static Set<String> statusesToStrings(Set<CallableStatus> statuses) { * Function that process a set of statuses into strings
*
* @param statuses the set of statuses to be converted
* @return a matching set of strings
*/
private Set<String> statusesToStrings(Set<CallableStatus> statuses) {
Set<String> output = new HashSet<String>(statuses.size()); Set<String> output = new HashSet<String>(statuses.size());
for (CallableStatus status : statuses) for (CallableStatus status : statuses)
@ -284,6 +333,6 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> implements Annotato
} }
private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { private IntervalStatistics createIntervalStatistic(GenomeLoc interval) {
return new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality); return new IntervalStatistics(samples, interval /*, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality*/);
} }
} }

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -34,19 +35,24 @@ import java.util.HashSet;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
public class IntervalStatistics { class IntervalStatistics {
private final Map<String, SampleStatistics> samples; private final Map<String, SampleStatistics> samples;
private final GenomeLoc interval; private final GenomeLoc interval;
private boolean hasNref = false;
private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
/*
public IntervalStatistics(Set<String> samples, GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { private double minMedianDepth = 20.0;
private double badMedianDepthPercentage = 0.20;
private double votePercentage = 0.50;
*/
public IntervalStatistics(Set<String> samples, GenomeLoc interval/*, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality*/) {
this.interval = interval; this.interval = interval;
this.samples = new HashMap<String, SampleStatistics>(samples.size()); this.samples = new HashMap<String, SampleStatistics>(samples.size());
for (String sample : samples) for (String sample : samples)
this.samples.put(sample, new SampleStatistics(interval, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality)); this.samples.put(sample, new SampleStatistics(interval /*, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality*/));
} }
public SampleStatistics getSample(String sample) { public SampleStatistics getSample(String sample) {
@ -57,9 +63,19 @@ public class IntervalStatistics {
return interval; return interval;
} }
public void addLocus(AlignmentContext context) { /**
* The function to populate data into the Statistics from the walker.
* This takes the input and manages passing the data to the SampleStatistics and Locus Statistics
*
* @param context The alignment context given from the walker
* @param ref the reference context given from the walker
* @param thresholds the class contains the statistical threshold for making calls
*/
public void addLocus(AlignmentContext context, ReferenceContext ref, ThresHolder thresholds) {
ReadBackedPileup pileup = context.getBasePileup(); ReadBackedPileup pileup = context.getBasePileup();
//System.out.println(ref.getLocus().toString());
Map<String, ReadBackedPileup> samplePileups = pileup.getPileupsForSamples(samples.keySet()); Map<String, ReadBackedPileup> samplePileups = pileup.getPileupsForSamples(samples.keySet());
for (Map.Entry<String, ReadBackedPileup> entry : samplePileups.entrySet()) { for (Map.Entry<String, ReadBackedPileup> entry : samplePileups.entrySet()) {
@ -67,15 +83,16 @@ public class IntervalStatistics {
ReadBackedPileup samplePileup = entry.getValue(); ReadBackedPileup samplePileup = entry.getValue();
SampleStatistics sampleStatistics = samples.get(sample); SampleStatistics sampleStatistics = samples.get(sample);
if (sampleStatistics == null) if (sampleStatistics == null)
throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample));
sampleStatistics.addLocus(context.getLocation(), samplePileup); sampleStatistics.addLocus(context.getLocation(), samplePileup, thresholds);
} }
if (!hasNref && ref.getBase() == 'N')
hasNref = true;
} }
public double averageCoverage() { public double averageCoverage() {
if (preComputedTotalCoverage < 0) if (preComputedTotalCoverage < 0)
calculateTotalCoverage(); calculateTotalCoverage();
@ -90,15 +107,43 @@ public class IntervalStatistics {
/** /**
* Return the Callable statuses for the interval as a whole * Return the Callable statuses for the interval as a whole
* todo -- add a voting system for sample flags and add interval specific statuses * todo -- add missingness filter
* *
* @param thresholds the class contains the statistical threshold for making calls
* @return the callable status(es) for the whole interval * @return the callable status(es) for the whole interval
*/ */
public Set<CallableStatus> callableStatuses() { public Set<CallableStatus> callableStatuses(ThresHolder thresholds) {
Set<CallableStatus> output = new HashSet<CallableStatus>(); Set<CallableStatus> output = new HashSet<CallableStatus>();
// Initialize the Map
Map<CallableStatus, Integer> votes = new HashMap<CallableStatus, Integer>();
for (CallableStatus status : CallableStatus.values())
votes.put(status, 0);
// tally up the votes
for (SampleStatistics sample : samples.values()) for (SampleStatistics sample : samples.values())
output.addAll(sample.getCallableStatuses()); for (CallableStatus status : sample.getCallableStatuses(thresholds))
votes.put(status, votes.get(status) + 1);
// output tall values above the threshold
for (CallableStatus status : votes.keySet()) {
if (votes.get(status) > (samples.size() * thresholds.getVotePercentageThreshold()) && !(status.equals(CallableStatus.PASS)))
output.add(status);
}
if (hasNref)
output.add(CallableStatus.REF_N);
// get median DP of each sample
int nLowMedianDepth = 0;
for (SampleStatistics sample : samples.values()) {
if (sample.getQuantileDepth(0.5) < thresholds.getMinimumMedianDepth())
nLowMedianDepth++;
}
if (nLowMedianDepth > (samples.size() * thresholds.getLowMedianDepthThreshold()))
output.add(CallableStatus.LOW_MEDIAN_DEPTH);
return output; return output;
} }

View File

@ -27,9 +27,9 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
public class LocusStatistics { class LocusStatistics {
final int coverage; private final int coverage;
final int rawCoverage; private final int rawCoverage;
public LocusStatistics() { public LocusStatistics() {
this.coverage = 0; this.coverage = 0;
@ -52,21 +52,20 @@ public class LocusStatistics {
/** /**
* Generates all applicable statuses from the coverages in this locus * Generates all applicable statuses from the coverages in this locus
* *
* @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality * @param thresholds the class contains the statistical threshold for making calls
* @param maximumCoverageThreshold the maximum threshold for determining excessive coverage
* @return a set of all statuses that apply * @return a set of all statuses that apply
*/ */
public Set<CallableStatus> callableStatuses(int minimumCoverageThreshold, int maximumCoverageThreshold) { public Set<CallableStatus> callableStatuses(ThresHolder thresholds) {
Set<CallableStatus> output = new HashSet<CallableStatus>(); Set<CallableStatus> output = new HashSet<CallableStatus>();
// if too much coverage // if too much coverage
if (getCoverage() > maximumCoverageThreshold) if (getCoverage() > thresholds.getMaximumCoverage())
output.add(CallableStatus.EXCESSIVE_COVERAGE); output.add(CallableStatus.EXCESSIVE_COVERAGE);
// if not enough coverage // if not enough coverage
if (getCoverage() < minimumCoverageThreshold) { if (getCoverage() < thresholds.getMinimumCoverage()) {
// was there a lot of low Qual coverage? // was there a lot of low Qual coverage?
if (getRawCoverage() >= minimumCoverageThreshold) if (getRawCoverage() >= thresholds.getMinimumCoverage())
output.add(CallableStatus.POOR_QUALITY); output.add(CallableStatus.POOR_QUALITY);
// no? // no?
else { else {
@ -74,7 +73,7 @@ public class LocusStatistics {
if (getRawCoverage() > 0) if (getRawCoverage() > 0)
output.add(CallableStatus.LOW_COVERAGE); output.add(CallableStatus.LOW_COVERAGE);
else else
output.add(CallableStatus.NO_COVERAGE); output.add(CallableStatus.COVERAGE_GAPS);
} }
} }

View File

@ -27,41 +27,36 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*; import java.util.*;
/** /**
* Short one line description of the walker. * The statistics calculator for a specific sample given the interval
*
* @author Mauricio Carneiro
* @since 2/1/12
*/ */
class SampleStatistics { class SampleStatistics {
private final GenomeLoc interval; private final GenomeLoc interval;
private final ArrayList<LocusStatistics> loci; private final ArrayList<LocusStatistics> loci;
private final int minimumCoverageThreshold; private int[] preSortedDepths = null;
private final int maximumCoverageThreshold; private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
private final int minimumMappingQuality;
private final int minimumBaseQuality;
private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) private int nReads = -1;
private int nBadMates = -1;
private SampleStatistics(GenomeLoc interval, ArrayList<LocusStatistics> loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { private SampleStatistics(GenomeLoc interval, ArrayList<LocusStatistics> loci) {
this.interval = interval; this.interval = interval;
this.loci = loci; this.loci = loci;
this.minimumCoverageThreshold = minimumCoverageThreshold; nReads = 0;
this.maximumCoverageThreshold = maximumCoverageThreshold; nBadMates = 0;
this.minimumMappingQuality = minimumMappingQuality;
this.minimumBaseQuality = minimumBaseQuality;
} }
public SampleStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { public SampleStatistics(GenomeLoc interval) {
this(interval, new ArrayList<LocusStatistics>(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); this(interval, new ArrayList<LocusStatistics>(interval.size()));
// Initialize every loci (this way we don't have to worry about non-existent loci in the object // Initialize every loci (this way we don't have to worry about non-existent loci in the object
for (int i = 0; i < interval.size(); i++) for (int i = 0; i < interval.size(); i++)
this.loci.add(i, new LocusStatistics()); this.loci.add(new LocusStatistics());
} }
@ -78,51 +73,56 @@ class SampleStatistics {
} }
/** /**
* Calculates the callable statuses of the entire interval * Calculates the callable statuses of the entire sample
* *
* @return the callable statuses of the entire interval * @param thresholds the class contains the statistical threshold for making calls
* @return the callable statuses of the entire sample
*/ */
public Set<CallableStatus> getCallableStatuses() { public Set<CallableStatus> getCallableStatuses(ThresHolder thresholds) {
Set<CallableStatus> output = new HashSet<CallableStatus>();
Map<CallableStatus, Integer> totals = new HashMap<CallableStatus, Integer>(CallableStatus.values().length); // We check if reads are present to prevent div / 0 exceptions
if (nReads == 0) {
output.add(CallableStatus.NO_READS);
return output;
}
Map<CallableStatus, Double> totals = new HashMap<CallableStatus, Double>(CallableStatus.values().length);
// initialize map // initialize map
for (CallableStatus status : CallableStatus.values()) for (CallableStatus status : CallableStatus.values())
totals.put(status, 0); totals.put(status, 0.0);
// sum up all the callable statuses for each locus // sum up all the callable statuses for each locus
for (int i = 0; i < interval.size(); i++) { for (int i = 0; i < interval.size(); i++) {
for (CallableStatus status : callableStatus(i)) { for (CallableStatus status : callableStatus(i, thresholds)) {
int count = totals.get(status); double count = totals.get(status);
totals.put(status, count + 1); totals.put(status, count + 1);
} }
} }
Set<CallableStatus> output = new HashSet<CallableStatus>();
// double to avoid type casting
double intervalSize = interval.size(); double intervalSize = interval.size();
double coverageStatusThreshold = 0.20; if ((nBadMates / nReads) > thresholds.getBadMateStatusThreshold())
if ((totals.get(CallableStatus.NO_COVERAGE) / intervalSize) > coverageStatusThreshold) output.add(CallableStatus.BAD_MATE);
output.add(CallableStatus.NO_COVERAGE);
if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > coverageStatusThreshold) if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) > thresholds.getCoverageStatusThreshold())
output.add(CallableStatus.COVERAGE_GAPS);
if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > thresholds.getCoverageStatusThreshold())
output.add(CallableStatus.LOW_COVERAGE); output.add(CallableStatus.LOW_COVERAGE);
double excessiveCoverageThreshold = 0.20; if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > thresholds.getExcessiveCoverageThreshold())
if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > excessiveCoverageThreshold)
output.add(CallableStatus.EXCESSIVE_COVERAGE); output.add(CallableStatus.EXCESSIVE_COVERAGE);
double qualityStatusThreshold = 0.50; if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > thresholds.getQualityStatusThreshold())
if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > qualityStatusThreshold)
output.add(CallableStatus.POOR_QUALITY); output.add(CallableStatus.POOR_QUALITY);
if (totals.get(CallableStatus.REF_N) > 0) if (totals.get(CallableStatus.REF_N) > 0)
output.add(CallableStatus.REF_N); output.add(CallableStatus.REF_N);
if (output.isEmpty()) { if (output.isEmpty()) {
output.add(CallableStatus.PASS); output.add(CallableStatus.PASS);
} }
@ -132,12 +132,13 @@ class SampleStatistics {
/** /**
* Adds a locus to the interval wide stats * Adds a locus to the interval wide stats
* *
* @param locus The locus given as a GenomeLoc * @param locus The locus given as a GenomeLoc
* @param pileup The pileup of that locus * @param pileup The pileup of that locus, this exclusively contains the sample
* @param thresholds the class contains the statistical threshold for making calls
*/ */
public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) { public void addLocus(GenomeLoc locus, ReadBackedPileup pileup, ThresHolder thresholds) {
if (!interval.containsP(locus)) if (!interval.containsP(locus))
throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus)); throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval));
// a null pileup means there is nothing to add // a null pileup means there is nothing to add
if (pileup != null) { if (pileup != null) {
@ -145,31 +146,141 @@ class SampleStatistics {
int locusIndex = locus.getStart() - interval.getStart(); int locusIndex = locus.getStart() - interval.getStart();
int rawCoverage = pileup.depthOfCoverage(); int rawCoverage = pileup.depthOfCoverage();
int coverage = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage(); int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.getMinimumBaseQuality(), thresholds.getMinimumMappingQuality()).depthOfCoverage();
LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage); LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage);
loci.add(locusIndex, locusData); loci.set(locusIndex, locusData);
for (GATKSAMRecord read : pileup.getReads())
processRead(read, thresholds);
}
}
private void processRead(GATKSAMRecord read, ThresHolder thresholds) {
// Was this read already processed?
if (read.getTemporaryAttribute("checkedBadMate") == null) {
nReads++;
if (hasValidMate(read, thresholds))
nBadMates++;
read.setTemporaryAttribute("checkedBadMate", true);
} }
} }
/** /**
* returns the callable status of this locus without taking the reference base into account. * returns the callable status of a given locus without taking the reference base into account.
* *
* @param locusIndex location in the genome to inquire (only one locus) * @param locusIndex location in the genome to inquire (only one locus)
* @param thresholds the class contains the statistical threshold for making calls
* @return the callable status of a locus * @return the callable status of a locus
*/ */
private Set<CallableStatus> callableStatus(int locusIndex) { private Set<CallableStatus> callableStatus(int locusIndex, ThresHolder thresholds) {
LocusStatistics locus = loci.get(locusIndex); LocusStatistics locus = loci.get(locusIndex);
return locus.callableStatuses(minimumCoverageThreshold, maximumCoverageThreshold); return locus.callableStatuses(thresholds);
} }
private void calculateTotalCoverage() { private void calculateTotalCoverage() {
preComputedTotalCoverage = 0; preComputedTotalCoverage = 0;
for (LocusStatistics locus : loci) for (LocusStatistics locus : loci)
preComputedTotalCoverage += locus.getCoverage(); preComputedTotalCoverage += locus.getCoverage();
} }
/**
 * Returns the requested depth quantile over all loci in this interval.
 *
 * The sorted depth array is built lazily on first use and cached, so repeated
 * quantile queries (median, Q1, Q3) only pay the sort cost once.
 *
 * @param percentage the quantile to look up (0.25, 0.5 or 0.75)
 * @return the quantile of the filtered per-locus depths, or -1 for unsupported percentages
 */
public double getQuantileDepth(double percentage) {
    if (preSortedDepths == null) {
        getDepthsAsSortedArray();  // build and cache the sorted depths
    }
    return getQuartile(preSortedDepths, percentage);
}
/**
 * Computes Tukey-hinge style quartiles of a pre-sorted array.
 *
 * Only the three canonical cut points are supported: 0.25 (lower quartile),
 * 0.5 (median) and 0.75 (upper quartile). Any other percentage yields the
 * sentinel -1, except for single-element arrays, which always return their
 * only value regardless of the percentage requested.
 *
 * @param data       depth values, sorted ascending
 * @param percentage the requested quantile (0.25, 0.5 or 0.75)
 * @return the quartile value, or -1 for an unsupported percentage
 */
static double getQuartile(int[] data, double percentage) {
    final int size = data.length;

    // A one-element array has only one possible answer, whatever the cut point.
    if (size == 1)
        return data[0];

    if (percentage == 0.5)
        return getMedian(data);

    // Index of the median position (fractional for even-sized arrays).
    final double midPosition = (size - 1.0) / 2;

    if (percentage == 0.25) {
        // Lower half: everything up to (and, for odd sizes, including) the median element.
        return getMedian(Arrays.copyOfRange(data, 0, (int) midPosition + 1));
    }

    if (percentage == 0.75) {
        // Upper half: include the median element only when it sits on a whole index.
        final int from = (midPosition % 1 == 0) ? (int) midPosition : (int) midPosition + 1;
        return getMedian(Arrays.copyOfRange(data, from, size));
    }

    // Unsupported cut point.
    return -1;
}
/**
 * Returns the median of a sorted array: the middle element for odd lengths,
 * the mean of the two central elements for even lengths.
 *
 * Assumes data is sorted ascending.
 */
private static double getMedian(int[] data) {
    final int n = data.length;
    if (n == 1)
        return data[0];

    // Fractional index of the middle; whole for odd n, x.5 for even n.
    final double mid = (n - 1.0) / 2;
    final int lo = (int) Math.floor(mid);
    final int hi = (int) Math.ceil(mid);

    // For odd n, lo == hi and this is just the middle element.
    return ((double) data[lo] + data[hi]) / 2;
}
/**
 * Builds and caches the sorted array of per-locus filtered depths,
 * used by getQuantileDepth() to answer quartile queries.
 */
private void getDepthsAsSortedArray() {
    final int n = loci.size();
    preSortedDepths = new int[n];
    int i = 0;
    for (LocusStatistics locus : loci)
        preSortedDepths[i++] = locus.getCoverage();
    Arrays.sort(preSortedDepths);
}
/**
 * Determines whether a read has a usable (valid) mate.
 *
 * A mate is considered valid only when all of the following hold:
 *  - the read is paired
 *  - both the read and its mate are mapped
 *  - read and mate are on opposite strands
 *  - the pair is not inverted (the forward-strand read must start strictly
 *    before the reverse-strand read)
 *  - the distance between the two alignment starts does not exceed the
 *    maximum insert size
 *
 * todo - check that mates are on the same contig
 * todo - handle forced mates
 *
 * @param read       the read to check
 * @param thresholds the class that contains the statistical thresholds for making calls
 * @return true when the read's mate passes all checks above
 */
boolean hasValidMate(GATKSAMRecord read, ThresHolder thresholds) {
    // has NO pair
    if (!read.getReadPairedFlag())
        return false;

    // read or mate unmapped
    if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag())
        return false;

    // both mates on the same strand
    if (read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag())
        return false;

    // inverted orientation: the reverse read must not start before the forward one
    final boolean startsBeforeMate = read.getAlignmentStart() < read.getMateAlignmentStart();
    if (read.getReadNegativeStrandFlag() == startsBeforeMate)
        return false;

    // mates must not be too far apart
    return Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) <= thresholds.getMaximumInsertSize();
}
} }

View File

@ -0,0 +1,119 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
/**
 * Immutable bag of the statistical thresholds used by the DiagnoseTargets
 * walker when deciding callable statuses. Consolidates what used to live
 * scattered inside the statistics classes.
 */
class ThresHolder {

    /** Default thresholds used when the caller does not supply its own. */
    public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5);

    // Per-base filters applied to pileups.
    private final int minimumBaseQuality;
    private final int minimumMappingQuality;

    // Per-locus coverage bounds.
    private final int minimumCoverage;
    private final int maximumCoverage;

    // Interval-level depth and insert-size limits.
    private final int minimumMedianDepth;
    private final int maximumInsertSize;

    // Fractional thresholds for promoting a locus-level status to the interval level.
    private final double votePercentageThreshold;
    private final double lowMedianDepthThreshold;
    private final double badMateStatusThreshold;
    private final double coverageStatusThreshold;
    private final double excessiveCoverageThreshold;
    private final double qualityStatusThreshold;

    public ThresHolder(int minimumBaseQuality,
                       int minimumMappingQuality,
                       int minimumCoverage,
                       int maximumCoverage,
                       int minimumMedianDepth,
                       int maximumInsertSize,
                       double votePercentageThreshold,
                       double lowMedianDepthThreshold,
                       double badMateStatusThreshold,
                       double coverageStatusThreshold,
                       double excessiveCoverageThreshold,
                       double qualityStatusThreshold) {
        this.minimumBaseQuality = minimumBaseQuality;
        this.minimumMappingQuality = minimumMappingQuality;
        this.minimumCoverage = minimumCoverage;
        this.maximumCoverage = maximumCoverage;
        this.minimumMedianDepth = minimumMedianDepth;
        this.maximumInsertSize = maximumInsertSize;
        this.votePercentageThreshold = votePercentageThreshold;
        this.lowMedianDepthThreshold = lowMedianDepthThreshold;
        this.badMateStatusThreshold = badMateStatusThreshold;
        this.coverageStatusThreshold = coverageStatusThreshold;
        this.excessiveCoverageThreshold = excessiveCoverageThreshold;
        this.qualityStatusThreshold = qualityStatusThreshold;
    }

    public int getMinimumBaseQuality() { return minimumBaseQuality; }

    public int getMinimumMappingQuality() { return minimumMappingQuality; }

    public int getMinimumCoverage() { return minimumCoverage; }

    public int getMaximumCoverage() { return maximumCoverage; }

    public int getMinimumMedianDepth() { return minimumMedianDepth; }

    public int getMaximumInsertSize() { return maximumInsertSize; }

    public double getVotePercentageThreshold() { return votePercentageThreshold; }

    public double getLowMedianDepthThreshold() { return lowMedianDepthThreshold; }

    public double getBadMateStatusThreshold() { return badMateStatusThreshold; }

    public double getCoverageStatusThreshold() { return coverageStatusThreshold; }

    public double getExcessiveCoverageThreshold() { return excessiveCoverageThreshold; }

    public double getQualityStatusThreshold() { return qualityStatusThreshold; }
}

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2010, The Broad Institute * Copyright (c) 2012, The Broad Institute
* *
* Permission is hereby granted, free of charge, to any person * Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation * obtaining a copy of this software and associated documentation
@ -696,26 +696,32 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
@Override @Override
public Map<String, ReadBackedPileup> getPileupsForSamples(Collection<String> sampleNames) { public Map<String, ReadBackedPileup> getPileupsForSamples(Collection<String> sampleNames) {
Map<String, ReadBackedPileup> result = new HashMap<String, ReadBackedPileup>(); Map<String, ReadBackedPileup> result = new HashMap<String, ReadBackedPileup>();
Map<String, UnifiedPileupElementTracker<PE>> trackerMap = new HashMap<String, UnifiedPileupElementTracker<PE>>(); if (pileupElementTracker instanceof PerSamplePileupElementTracker) {
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>) pileupElementTracker;
for (String sample : sampleNames) { // initialize pileups for each sample for (String sample : sampleNames) {
UnifiedPileupElementTracker<PE> filteredTracker = new UnifiedPileupElementTracker<PE>(); PileupElementTracker<PE> filteredElements = tracker.getElements(sample);
trackerMap.put(sample, filteredTracker); if (filteredElements != null)
} result.put(sample, createNewPileup(loc, filteredElements));
for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup
GATKSAMRecord read = p.getRead();
if (read.getReadGroup() != null) {
String sample = read.getReadGroup().getSample();
UnifiedPileupElementTracker<PE> tracker = trackerMap.get(sample);
if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest
tracker.add(p);
} }
} else {
Map<String, UnifiedPileupElementTracker<PE>> trackerMap = new HashMap<String, UnifiedPileupElementTracker<PE>>();
for (String sample : sampleNames) { // initialize pileups for each sample
UnifiedPileupElementTracker<PE> filteredTracker = new UnifiedPileupElementTracker<PE>();
trackerMap.put(sample, filteredTracker);
}
for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup
GATKSAMRecord read = p.getRead();
if (read.getReadGroup() != null) {
String sample = read.getReadGroup().getSample();
UnifiedPileupElementTracker<PE> tracker = trackerMap.get(sample);
if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest
tracker.add(p);
}
}
for (Map.Entry<String, UnifiedPileupElementTracker<PE>> entry : trackerMap.entrySet()) // create the RBP for each sample
result.put(entry.getKey(), createNewPileup(loc, entry.getValue()));
} }
for (Map.Entry<String, UnifiedPileupElementTracker<PE>> entry : trackerMap.entrySet()) // create the RBP for each sample
result.put(entry.getKey(), createNewPileup(loc, entry.getValue()));
return result; return result;
} }

View File

@ -0,0 +1,63 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Set;
/**
 * Unit tests for the per-locus callable-status logic in LocusStatistics.
 */
public class LocusStatisticsUnitTest /*extends BaseTest*/ {

    @Test(dataProvider = "StatusTestValues")
    public void testCallableStatuses(int coverage, int rawCoverage, CallableStatus expectedStatus) {
        // The min coverage threshold is 10, the max is 100
        ThresHolder thresholds = new ThresHolder(20, 20, 10, 100, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5);

        Set<CallableStatus> observed = new LocusStatistics(coverage, rawCoverage).callableStatuses(thresholds);

        // Either no status was expected and none was produced, or exactly the expected one.
        if (expectedStatus == null)
            Assert.assertTrue(observed.isEmpty());
        else
            Assert.assertTrue(observed.contains(expectedStatus) && observed.size() == 1);
    }

    /** Cases: {filtered coverage, raw coverage, single expected status or null for PASS-like}. */
    @DataProvider(name = "StatusTestValues")
    public Object[][] getStatusTestValues() {
        return new Object[][]{
                {100, 100, null},
                {100, 101, null},
                {101, 101, CallableStatus.EXCESSIVE_COVERAGE},
                {10, 101, null},
                {9, 101, CallableStatus.POOR_QUALITY},
                {9, 10, CallableStatus.POOR_QUALITY},
                {9, 9, CallableStatus.LOW_COVERAGE},
                {0, 0, CallableStatus.COVERAGE_GAPS},
                {0, 9, CallableStatus.LOW_COVERAGE},
                {0, 101, CallableStatus.POOR_QUALITY},
                {10, Integer.MAX_VALUE, null},
                {Integer.MAX_VALUE, Integer.MAX_VALUE, CallableStatus.EXCESSIVE_COVERAGE},
        };
    }
}

View File

@ -0,0 +1,99 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
public class SampleStatisticsUnitTest/* extends BaseTest */ {
/**
 * Quartile test cases as {sorted data, percentage, expected value}.
 * Percentages other than 0.25 / 0.5 / 0.75 are expected to yield the -1
 * sentinel, except for single-element arrays, which return their only value.
 */
@DataProvider(name = "QuartileValues")
public Object[][] getQuantileValues() {
    final int[] single = {5};
    final int[] pair = {1, 2};
    final int[] five = {10, 20, 30, 40, 50};
    final int[] ten = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    return new Object[][]{
            {single, 0.5, 5},
            {single, 0, 5},
            {single, 1, 5},
            {pair, 0.5, 1.5},
            {pair, 0.25, 1},
            {pair, 0.75, 2},
            {five, 0.5, 30},
            {five, 0.25, 20},
            {five, 0.75, 40},
            {five, 0, -1},
            {ten, 0.5, 5.5},
            {ten, 0.25, 3},
            {ten, 0.75, 8}
    };
}
@Test(dataProvider = "QuartileValues")
public void testGetQuartile(int[] sortedData, double percentage, double expected) {
    // getQuartile assumes its input is already sorted ascending
    Assert.assertEquals(SampleStatistics.getQuartile(sortedData, percentage), expected);
}
/**
 * Read/mate fixtures paired with whether hasValidMate should accept them.
 * All reads are created against a single 1000bp artificial contig.
 */
@DataProvider(name = "ReadsAndMates")
public Object[][] getReadAndMates() {
    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);

    final GATKSAMRecord unpaired = ArtificialSAMUtils.createArtificialRead(header, "test", 0, 100, 50);
    final GATKSAMRecord properPair = ArtificialSAMUtils.createPair(header, "test", 30, 100, 150, true, false).get(0);
    final GATKSAMRecord tooFarApart = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, false).get(0);
    final GATKSAMRecord invertedPair = ArtificialSAMUtils.createPair(header, "test", 30, 151, 150, true, false).get(0);
    final GATKSAMRecord sameStrandPair = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, true).get(0);
    final GATKSAMRecord mateUnmapped = ArtificialSAMUtils.createPair(header, "test", 30, 100, 140, true, false).get(1);
    mateUnmapped.setMateUnmappedFlag(true);

    // finish test
    return new Object[][]{
            {unpaired, false},
            {properPair, true},
            {tooFarApart, false},
            {invertedPair, false},
            {sameStrandPair, false},
            {mateUnmapped, false}
    };
}
@Test(dataProvider = "ReadsAndMates")
public void testHasValidMate(GATKSAMRecord read, boolean expected) {
    // 50 is our maximum insert size (see ThresHolder.DEFAULTS)
    Assert.assertEquals(new SampleStatistics(GenomeLoc.UNMAPPED).hasValidMate(read, ThresHolder.DEFAULTS), expected);
}
}