From b8b139841d4bf9b6fcdeb85952aa85e09c8bcae3 Mon Sep 17 00:00:00 2001
From: Roger Zurawicki
Date: Thu, 12 Apr 2012 18:54:29 -0400
Subject: [PATCH] DiagnoseTargets with working Q1,Median,Q3
- Merged Roger's metrics with Mauricio's optimizations
- Added Stats for DiagnoseTargets
- now has functions to find the median depth, and upper/lower quartile
- the REF_N callable status is implemented
- The walker now runs efficiently
- Diagnose Targets accepts overlapping intervals
- Diagnose Targets now checks for bad mates
- The read mates are checked in a memory efficient manner
- The statistics thresholds have been consolidated and moved outside of the statistics classes and into the walker.
- Fixed some bugs
- Removed rod binding
Added more Unit tests
- Test callable statuses on the locus level
- Test bad mates
- Changed NO_COVERAGE -> COVERAGE_GAPS to avoid confusion
Signed-off-by: Mauricio Carneiro
---
.../diagnostics/targets/CallableStatus.java | 37 ++--
.../diagnostics/targets/DiagnoseTargets.java | 167 +++++++++-----
.../targets/IntervalStatistics.java | 69 ++++--
.../diagnostics/targets/LocusStatistics.java | 19 +-
.../diagnostics/targets/SampleStatistics.java | 203 ++++++++++++++----
.../diagnostics/targets/ThresHolder.java | 119 ++++++++++
.../pileup/AbstractReadBackedPileup.java | 44 ++--
.../targets/LocusStatisticsUnitTest.java | 63 ++++++
.../targets/SampleStatisticsUnitTest.java | 99 +++++++++
9 files changed, 651 insertions(+), 169 deletions(-)
create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java
create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java
create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
index 5d1685557..2aa4c977d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
@@ -31,38 +31,29 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
* @since 2/1/12
*/
public enum CallableStatus {
- /**
- * the reference base was an N, which is not considered callable the GATK
- */
- // todo -- implement this status
+
REF_N("the reference base was an N, which is not considered callable the GATK"),
- /**
- * the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
- */
+
PASS("the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"),
- /**
- * absolutely no reads were seen at this locus, regardless of the filtering parameters
- */
- NO_COVERAGE("absolutely no reads were seen at this locus, regardless of the filtering parameters"),
- /**
- * there were less than min. depth bases at the locus, after applying filters
- */
+
+ COVERAGE_GAPS("absolutely no coverage was observed at a locus, regardless of the filtering parameters"),
+
LOW_COVERAGE("there were less than min. depth bases at the locus, after applying filters"),
- /**
- * more than -maxDepth read at the locus, indicating some sort of mapping problem
- */
+
EXCESSIVE_COVERAGE("more than -maxDepth read at the locus, indicating some sort of mapping problem"),
- /**
- * more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
- */
+
POOR_QUALITY("more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads"),
- BAD_MATE(""),
+ BAD_MATE("the reads are not properly mated, suggesting mapping errors"),
- INCONSISTENT_COVERAGE("");
+ NO_READS("there are no reads contained in the interval"),
+ //
+ // Interval-level statuses
+ //
+ LOW_MEDIAN_DEPTH("interval has insufficient median depth across samples");
- public String description;
+ public final String description;
private CallableStatus(String description) {
this.description = description;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
index 7e2e6722e..b8a90892f 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
@@ -25,89 +25,126 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import net.sf.picard.util.PeekableIterator;
-import org.broad.tribble.Feature;
-import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
-import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
-import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
+import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import java.util.*;
/**
- * Short one line description of the walker.
+ * Analyzes coverage distribution and validates read mates for a given interval and sample.
*
*
- * [Long description of the walker]
+ * Used to diagnose regions with bad coverage, mapping, or read mating. Analyzes each sample independently in addition
+ * to interval wide analysis.
*
*
*
* Input
*
- * [Description of the Input]
+ *
+ * - A reference file
+ * - one or more input BAMs
+ * - One or more intervals
+ *
*
*
* Output
*
- * [Description of the Output]
+ * A modified VCF detailing each interval by sample
*
*
* Examples
*
* java
* -jar GenomeAnalysisTK.jar
- * -T [walker name]
+ * -T DiagnoseTargets \
+ * -R reference.fasta \
+ * -o output.vcf \
+ * -I sample1.bam \
+ * -I sample2.bam \
+ * -I sample3.bam \
+ * -L intervals.interval_list
*
*
- * @author Mauricio Carneiro
- * @since 2/1/12
+ * @author Mauricio Carneiro, Roger Zurawicki
+ * @since 5/8/12
*/
@By(value = DataSource.READS)
@PartitionBy(PartitionType.INTERVAL)
-public class DiagnoseTargets extends LocusWalker implements AnnotatorCompatibleWalker {
- @Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
- private IntervalBinding intervalTrack = null;
+public class DiagnoseTargets extends LocusWalker {
@Output(doc = "File to which variants should be written", required = true)
private VariantContextWriter vcfWriter = null;
- @Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false)
+ @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false)
private int minimumBaseQuality = 20;
- @Argument(fullName = "minimum_mapping_quality", shortName = "mmq", doc = "", required = false)
+ @Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false)
private int minimumMappingQuality = 20;
- @Argument(fullName = "minimum_coverage", shortName = "mincov", doc = "", required = false)
+ @Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false)
private int minimumCoverage = 5;
- @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false)
+ @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false)
private int maximumCoverage = 700;
+ @Argument(fullName = "minimum_median_depth", shortName = "med", doc = "The minimum allowable median coverage, used for calling LOW_MEDIAN_DEPTH", required = false)
+ private int minMedianDepth = 20;
+
+ @Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false)
+ private int maxInsertSize = 50;
+
+ @Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed percentage of samples containing a call for the interval to adopt the call ", required = false)
+ private double votePercentage = 0.50;
+
+ @Argument(fullName = "low_median_depth_status_threshold", shortName = "stMED", doc = "The percentage of the loci needed for calling LOW_MEDIAN_DEPTH", required = false)
+ private double lowMedianDepthPercentage = 0.20;
+
+ @Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The percentage of the loci needed for calling BAD_MATE", required = false)
+ private double badMateStatusThreshold = 0.50;
+
+ @Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The percentage of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false)
+ private double coverageStatusThreshold = 0.20;
+
+ @Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The percentage of the loci needed for calling EXCESSIVE_COVERAGE", required = false)
+ private double excessiveCoverageThreshold = 0.20;
+
+ @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The percentage of the loci needed for calling POOR_QUALITY", required = false)
+ private double qualityStatusThreshold = 0.50;
+
+ @Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false)
+ private boolean debug = false;
+
private HashMap intervalMap = null; // interval => statistics
private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
private Set samples = null; // all the samples being processed
-
private final Allele SYMBOLIC_ALLELE = Allele.create("", false); // avoid creating the symbolic allele multiple times
+ private ThresHolder thresholds = null;
@Override
public void initialize() {
super.initialize();
- if (intervalTrack == null)
- throw new UserException("This tool currently only works if you provide an interval track");
+ if (getToolkit().getIntervals() == null)
+ throw new UserException("This tool currently only works if you provide one or more interval");
+
+ thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold);
intervalMap = new HashMap();
- intervalListIterator = new PeekableIterator(intervalTrack.getIntervals(getToolkit()).listIterator());
+ intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator());
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header
vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header
@@ -121,7 +158,7 @@ public class DiagnoseTargets extends LocusWalker implements Annotato
addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus
for (IntervalStatistics intervalStatistics : intervalMap.values())
- intervalStatistics.addLocus(context); // Add current locus to stats
+ intervalStatistics.addLocus(context, ref, thresholds); // Add current locus to stats
return 1L;
}
@@ -151,45 +188,40 @@ public class DiagnoseTargets extends LocusWalker implements Annotato
@Override
public void onTraversalDone(Long result) {
for (GenomeLoc interval : intervalMap.keySet())
- processIntervalStats(intervalMap.get(interval), Allele.create("A"));
+ outputStatsToVCF(intervalMap.get(interval), Allele.create("A", true));
}
- @Override
- public RodBinding getSnpEffRodBinding() {return null;}
+ private GenomeLoc getIntervalMapSpan() {
+ GenomeLoc loc = null;
+ for (GenomeLoc interval : intervalMap.keySet()) {
+ if (loc == null) {
+ loc = interval;
+ } else
+ loc = interval.union(loc);
+ }
- @Override
- public RodBinding getDbsnpRodBinding() {return null;}
-
- @Override
- public List> getCompRodBindings() {return null;}
-
- @Override
- public List> getResourceRodBindings() {return null;}
-
- @Override
- public boolean alwaysAppendDbsnpId() {return false;}
+ return loc;
+ }
/**
* Removes all intervals that are behind the current reference locus from the intervalMap
*
* @param refLocus the current reference locus
- * @param refBase the reference allele
+ * @param refBase the reference allele
*/
private void removePastIntervals(GenomeLoc refLocus, byte refBase) {
- List toRemove = new LinkedList();
- for (GenomeLoc interval : intervalMap.keySet())
- if (interval.isBefore(refLocus)) {
- processIntervalStats(intervalMap.get(interval), Allele.create(refBase, true));
- toRemove.add(interval);
+ // if all intervals are safe
+ if (getIntervalMapSpan() != null && getIntervalMapSpan().isBefore(refLocus)) {
+ for (GenomeLoc interval : intervalMap.keySet()) {
+ outputStatsToVCF(intervalMap.get(interval), Allele.create(refBase, true));
+ intervalMap.remove(interval);
}
-
- for (GenomeLoc interval : toRemove)
- intervalMap.remove(interval);
+ }
GenomeLoc interval = intervalListIterator.peek(); // clean up all intervals that we might have skipped because there was no data
- while(interval != null && interval.isBefore(refLocus)) {
+ while (interval != null && interval.isBefore(refLocus)) {
interval = intervalListIterator.next();
- processIntervalStats(createIntervalStatistic(interval), Allele.create(refBase, true));
+ outputStatsToVCF(createIntervalStatistic(interval), Allele.create(refBase, true));
interval = intervalListIterator.peek();
}
}
@@ -202,7 +234,6 @@ public class DiagnoseTargets extends LocusWalker implements Annotato
private void addNewOverlappingIntervals(GenomeLoc refLocus) {
GenomeLoc interval = intervalListIterator.peek();
while (interval != null && !interval.isPast(refLocus)) {
- System.out.println("LOCUS : " + refLocus + " -- " + interval);
intervalMap.put(interval, createIntervalStatistic(interval));
intervalListIterator.next(); // discard the interval (we've already added it to the map)
interval = intervalListIterator.peek();
@@ -210,14 +241,14 @@ public class DiagnoseTargets extends LocusWalker implements Annotato
}
/**
- * Takes the interval, finds it in the stash, prints it to the VCF, and removes it
+ * Takes the interval, finds it in the stash, prints it to the VCF
*
- * @param stats The statistics of the interval
+ * @param stats The statistics of the interval
* @param refAllele the reference allele
*/
- private void processIntervalStats(IntervalStatistics stats, Allele refAllele) {
+ private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) {
GenomeLoc interval = stats.getInterval();
-
+
List alleles = new ArrayList();
Map attributes = new HashMap();
ArrayList genotypes = new ArrayList();
@@ -227,7 +258,7 @@ public class DiagnoseTargets extends LocusWalker implements Annotato
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles);
vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF
- vcb.filters(statusesToStrings(stats.callableStatuses()));
+ vcb.filters(statusesToStrings(stats.callableStatuses(thresholds)));
attributes.put(VCFConstants.END_KEY, interval.getStop());
attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage());
@@ -236,16 +267,24 @@ public class DiagnoseTargets extends LocusWalker implements Annotato
for (String sample : samples) {
Map infos = new HashMap();
- infos.put(VCFConstants.DEPTH_KEY, stats.getSample(sample).averageCoverage());
+ SampleStatistics sampleStat = stats.getSample(sample);
+ infos.put(VCFConstants.DEPTH_KEY, sampleStat.averageCoverage());
+ infos.put("Q1", sampleStat.getQuantileDepth(0.25));
+ infos.put("MED", sampleStat.getQuantileDepth(0.50));
+ infos.put("Q3", sampleStat.getQuantileDepth(0.75));
Set filters = new HashSet();
- filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses()));
+ filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds)));
genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false));
}
vcb = vcb.genotypes(genotypes);
+ if (debug) {
+ System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage());
+ }
+
vcfWriter.add(vcb.make());
}
@@ -264,7 +303,12 @@ public class DiagnoseTargets extends LocusWalker implements Annotato
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
// FORMAT fields for each genotype
+ // todo -- find the appropriate VCF constants
headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
+ headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution."));
+ headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution."));
+ headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution."));
+
// FILTER fields
for (CallableStatus stat : CallableStatus.values())
@@ -273,8 +317,13 @@ public class DiagnoseTargets extends LocusWalker implements Annotato
return headerLines;
}
-
- private static Set statusesToStrings(Set statuses) {
+ /**
+ * Function that process a set of statuses into strings
+ *
+ * @param statuses the set of statuses to be converted
+ * @return a matching set of strings
+ */
+ private Set statusesToStrings(Set statuses) {
Set output = new HashSet(statuses.size());
for (CallableStatus status : statuses)
@@ -284,6 +333,6 @@ public class DiagnoseTargets extends LocusWalker implements Annotato
}
private IntervalStatistics createIntervalStatistic(GenomeLoc interval) {
- return new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality);
+ return new IntervalStatistics(samples, interval /*, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality*/);
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java
index f3246407b..5731c41c5 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java
@@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@@ -34,19 +35,24 @@ import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-public class IntervalStatistics {
+class IntervalStatistics {
private final Map samples;
private final GenomeLoc interval;
+ private boolean hasNref = false;
private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
-
- public IntervalStatistics(Set samples, GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
+ /*
+ private double minMedianDepth = 20.0;
+ private double badMedianDepthPercentage = 0.20;
+ private double votePercentage = 0.50;
+ */
+ public IntervalStatistics(Set samples, GenomeLoc interval/*, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality*/) {
this.interval = interval;
this.samples = new HashMap(samples.size());
for (String sample : samples)
- this.samples.put(sample, new SampleStatistics(interval, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality));
+ this.samples.put(sample, new SampleStatistics(interval /*, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality*/));
}
public SampleStatistics getSample(String sample) {
@@ -57,9 +63,19 @@ public class IntervalStatistics {
return interval;
}
- public void addLocus(AlignmentContext context) {
+ /**
+ * The function to populate data into the Statistics from the walker.
+ * This takes the input and manages passing the data to the SampleStatistics and Locus Statistics
+ *
+ * @param context The alignment context given from the walker
+ * @param ref the reference context given from the walker
+ * @param thresholds the class contains the statistical threshold for making calls
+ */
+ public void addLocus(AlignmentContext context, ReferenceContext ref, ThresHolder thresholds) {
ReadBackedPileup pileup = context.getBasePileup();
+ //System.out.println(ref.getLocus().toString());
+
Map samplePileups = pileup.getPileupsForSamples(samples.keySet());
for (Map.Entry entry : samplePileups.entrySet()) {
@@ -67,15 +83,16 @@ public class IntervalStatistics {
ReadBackedPileup samplePileup = entry.getValue();
SampleStatistics sampleStatistics = samples.get(sample);
- if (sampleStatistics == null)
+ if (sampleStatistics == null)
throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample));
-
- sampleStatistics.addLocus(context.getLocation(), samplePileup);
+
+ sampleStatistics.addLocus(context.getLocation(), samplePileup, thresholds);
}
+ if (!hasNref && ref.getBase() == 'N')
+ hasNref = true;
}
-
public double averageCoverage() {
if (preComputedTotalCoverage < 0)
calculateTotalCoverage();
@@ -90,15 +107,43 @@ public class IntervalStatistics {
/**
* Return the Callable statuses for the interval as a whole
- * todo -- add a voting system for sample flags and add interval specific statuses
+ * todo -- add missingness filter
*
+ * @param thresholds the class contains the statistical threshold for making calls
* @return the callable status(es) for the whole interval
*/
- public Set callableStatuses() {
+ public Set callableStatuses(ThresHolder thresholds) {
Set output = new HashSet();
+ // Initialize the Map
+ Map votes = new HashMap();
+ for (CallableStatus status : CallableStatus.values())
+ votes.put(status, 0);
+
+ // tally up the votes
for (SampleStatistics sample : samples.values())
- output.addAll(sample.getCallableStatuses());
+ for (CallableStatus status : sample.getCallableStatuses(thresholds))
+ votes.put(status, votes.get(status) + 1);
+
+ // output tall values above the threshold
+ for (CallableStatus status : votes.keySet()) {
+ if (votes.get(status) > (samples.size() * thresholds.getVotePercentageThreshold()) && !(status.equals(CallableStatus.PASS)))
+ output.add(status);
+ }
+
+
+ if (hasNref)
+ output.add(CallableStatus.REF_N);
+
+ // get median DP of each sample
+ int nLowMedianDepth = 0;
+ for (SampleStatistics sample : samples.values()) {
+ if (sample.getQuantileDepth(0.5) < thresholds.getMinimumMedianDepth())
+ nLowMedianDepth++;
+ }
+
+ if (nLowMedianDepth > (samples.size() * thresholds.getLowMedianDepthThreshold()))
+ output.add(CallableStatus.LOW_MEDIAN_DEPTH);
return output;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java
index 237ca1b1c..90563505d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java
@@ -27,9 +27,9 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import java.util.HashSet;
import java.util.Set;
-public class LocusStatistics {
- final int coverage;
- final int rawCoverage;
+class LocusStatistics {
+ private final int coverage;
+ private final int rawCoverage;
public LocusStatistics() {
this.coverage = 0;
@@ -52,21 +52,20 @@ public class LocusStatistics {
/**
* Generates all applicable statuses from the coverages in this locus
*
- * @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality
- * @param maximumCoverageThreshold the maximum threshold for determining excessive coverage
+ * @param thresholds the class contains the statistical threshold for making calls
* @return a set of all statuses that apply
*/
- public Set callableStatuses(int minimumCoverageThreshold, int maximumCoverageThreshold) {
+ public Set callableStatuses(ThresHolder thresholds) {
Set output = new HashSet();
// if too much coverage
- if (getCoverage() > maximumCoverageThreshold)
+ if (getCoverage() > thresholds.getMaximumCoverage())
output.add(CallableStatus.EXCESSIVE_COVERAGE);
// if not enough coverage
- if (getCoverage() < minimumCoverageThreshold) {
+ if (getCoverage() < thresholds.getMinimumCoverage()) {
// was there a lot of low Qual coverage?
- if (getRawCoverage() >= minimumCoverageThreshold)
+ if (getRawCoverage() >= thresholds.getMinimumCoverage())
output.add(CallableStatus.POOR_QUALITY);
// no?
else {
@@ -74,7 +73,7 @@ public class LocusStatistics {
if (getRawCoverage() > 0)
output.add(CallableStatus.LOW_COVERAGE);
else
- output.add(CallableStatus.NO_COVERAGE);
+ output.add(CallableStatus.COVERAGE_GAPS);
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java
index 9e4993853..958a44d4a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java
@@ -27,41 +27,36 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
/**
- * Short one line description of the walker.
- *
- * @author Mauricio Carneiro
- * @since 2/1/12
+ * The statistics calculator for a specific sample given the interval
*/
class SampleStatistics {
private final GenomeLoc interval;
private final ArrayList loci;
- private final int minimumCoverageThreshold;
- private final int maximumCoverageThreshold;
- private final int minimumMappingQuality;
- private final int minimumBaseQuality;
+ private int[] preSortedDepths = null;
+ private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
- private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
+ private int nReads = -1;
+ private int nBadMates = -1;
- private SampleStatistics(GenomeLoc interval, ArrayList loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
+ private SampleStatistics(GenomeLoc interval, ArrayList loci) {
this.interval = interval;
this.loci = loci;
- this.minimumCoverageThreshold = minimumCoverageThreshold;
- this.maximumCoverageThreshold = maximumCoverageThreshold;
- this.minimumMappingQuality = minimumMappingQuality;
- this.minimumBaseQuality = minimumBaseQuality;
+ nReads = 0;
+ nBadMates = 0;
}
- public SampleStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
- this(interval, new ArrayList(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality);
+ public SampleStatistics(GenomeLoc interval) {
+ this(interval, new ArrayList(interval.size()));
// Initialize every loci (this way we don't have to worry about non-existent loci in the object
for (int i = 0; i < interval.size(); i++)
- this.loci.add(i, new LocusStatistics());
+ this.loci.add(new LocusStatistics());
}
@@ -78,51 +73,56 @@ class SampleStatistics {
}
/**
- * Calculates the callable statuses of the entire interval
+ * Calculates the callable statuses of the entire sample
*
- * @return the callable statuses of the entire interval
+ * @param thresholds the class contains the statistical threshold for making calls
+ * @return the callable statuses of the entire sample
*/
- public Set getCallableStatuses() {
+ public Set getCallableStatuses(ThresHolder thresholds) {
+ Set output = new HashSet();
- Map totals = new HashMap(CallableStatus.values().length);
+ // We check if reads are present ot prevent div / 0 exceptions
+ if (nReads == 0) {
+ output.add(CallableStatus.NO_READS);
+ return output;
+ }
+
+ Map totals = new HashMap(CallableStatus.values().length);
// initialize map
for (CallableStatus status : CallableStatus.values())
- totals.put(status, 0);
+ totals.put(status, 0.0);
// sum up all the callable statuses for each locus
for (int i = 0; i < interval.size(); i++) {
- for (CallableStatus status : callableStatus(i)) {
- int count = totals.get(status);
+ for (CallableStatus status : callableStatus(i, thresholds)) {
+ double count = totals.get(status);
totals.put(status, count + 1);
}
}
-
- Set output = new HashSet();
-
- // double to avoid type casting
double intervalSize = interval.size();
- double coverageStatusThreshold = 0.20;
- if ((totals.get(CallableStatus.NO_COVERAGE) / intervalSize) > coverageStatusThreshold)
- output.add(CallableStatus.NO_COVERAGE);
+ if ((nBadMates / nReads) > thresholds.getBadMateStatusThreshold())
+ output.add(CallableStatus.BAD_MATE);
- if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > coverageStatusThreshold)
+ if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) > thresholds.getCoverageStatusThreshold())
+ output.add(CallableStatus.COVERAGE_GAPS);
+
+ if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > thresholds.getCoverageStatusThreshold())
output.add(CallableStatus.LOW_COVERAGE);
- double excessiveCoverageThreshold = 0.20;
- if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > excessiveCoverageThreshold)
+ if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > thresholds.getExcessiveCoverageThreshold())
output.add(CallableStatus.EXCESSIVE_COVERAGE);
- double qualityStatusThreshold = 0.50;
- if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > qualityStatusThreshold)
+ if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > thresholds.getQualityStatusThreshold())
output.add(CallableStatus.POOR_QUALITY);
if (totals.get(CallableStatus.REF_N) > 0)
output.add(CallableStatus.REF_N);
+
if (output.isEmpty()) {
output.add(CallableStatus.PASS);
}
@@ -132,12 +132,13 @@ class SampleStatistics {
/**
* Adds a locus to the interval wide stats
*
- * @param locus The locus given as a GenomeLoc
- * @param pileup The pileup of that locus
+ * @param locus The locus given as a GenomeLoc
+ * @param pileup The pileup of that locus, this exclusively contains the sample
+ * @param thresholds the class contains the statistical threshold for making calls
*/
- public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) {
+ public void addLocus(GenomeLoc locus, ReadBackedPileup pileup, ThresHolder thresholds) {
if (!interval.containsP(locus))
- throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus));
+ throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval));
// a null pileup means there nothing ot add
if (pileup != null) {
@@ -145,31 +146,141 @@ class SampleStatistics {
int locusIndex = locus.getStart() - interval.getStart();
int rawCoverage = pileup.depthOfCoverage();
- int coverage = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
+ int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.getMinimumBaseQuality(), thresholds.getMinimumMappingQuality()).depthOfCoverage();
LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage);
- loci.add(locusIndex, locusData);
+ loci.set(locusIndex, locusData);
+
+ for (GATKSAMRecord read : pileup.getReads())
+ processRead(read, thresholds);
+ }
+ }
+
+ private void processRead(GATKSAMRecord read, ThresHolder thresholds) {
+ // Was this read already processed?
+ if (read.getTemporaryAttribute("checkedBadMate") == null) {
+ nReads++;
+ if (hasValidMate(read, thresholds))
+ nBadMates++;
+ read.setTemporaryAttribute("checkedBadMate", true);
}
}
/**
- * returns the callable status of this locus without taking the reference base into account.
+ * returns the callable status of a given locus without taking the reference base into account.
*
* @param locusIndex location in the genome to inquire (only one locus)
+ * @param thresholds the class contains the statistical threshold for making calls
* @return the callable status of a locus
*/
- private Set callableStatus(int locusIndex) {
+ private Set callableStatus(int locusIndex, ThresHolder thresholds) {
LocusStatistics locus = loci.get(locusIndex);
- return locus.callableStatuses(minimumCoverageThreshold, maximumCoverageThreshold);
+ return locus.callableStatuses(thresholds);
}
-
private void calculateTotalCoverage() {
preComputedTotalCoverage = 0;
for (LocusStatistics locus : loci)
preComputedTotalCoverage += locus.getCoverage();
}
+ public double getQuantileDepth(double percentage) {
+ if (preSortedDepths == null)
+ getDepthsAsSortedArray();
+
+ return getQuartile(preSortedDepths, percentage);
+ }
+
+ static double getQuartile(int[] data, double percentage) {
+ int size = data.length;
+ if (size == 1)
+ return (double) data[0];
+
+ if (percentage == 0.5) {
+ return getMedian(data);
+ }
+
+ double position = (size - 1.0) / 2;
+ if (percentage == 0.25) {
+ // if the position is a whole number
+ return getMedian(Arrays.copyOfRange(data, 0, (int) position + 1));
+
+ }
+ if (percentage == 0.75) {
+ if (position % 1 == 0) {
+ return getMedian(Arrays.copyOfRange(data, (int) position, size));
+ } else {
+ return getMedian(Arrays.copyOfRange(data, (int) position + 1, size));
+ }
+ }
+ return -1;
+ }
+
+ // Assumes data is sorted
+ private static double getMedian(int[] data) {
+ double size = (double) data.length;
+ if (size == 1)
+ return (double) data[0];
+
+ double position = (size - 1.0) / 2;
+
+ if (position % 1 == 0)
+ return (double) data[(int) position];
+
+ else {
+ double high = (double) data[(int) Math.ceil(position)];
+ double low = (double) data[(int) Math.floor(position)];
+
+ return (high + low) / 2;
+
+ }
+
+ }
+
+ private void getDepthsAsSortedArray() {
+ preSortedDepths = new int[loci.size()];
+
+ for (int i = 0; i < loci.size(); i++)
+ preSortedDepths[i] = loci.get(i).getCoverage();
+
+ Arrays.sort(preSortedDepths);
+ }
+
+ boolean hasValidMate(GATKSAMRecord read, ThresHolder thresholds) {
+ /** Check the following
+ * Does it have a pair?
+ * reasonable insert size?
+ * inverted?
+ * same orientation?
+ * todo - same contig?
+ * is pair mapped?
+ * todo - is forced mate?
+ *
+ */
+
+ // has NO pair
+ if (!read.getReadPairedFlag())
+ return false;
+
+ // unmapped
+ if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag())
+ return false;
+
+ // same orientation
+ if (read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag())
+ return false;
+
+ // inverted
+ if (read.getReadNegativeStrandFlag() ==
+ read.getAlignmentStart() < read.getMateAlignmentStart())
+ return false;
+
+ // mates are too far apart
+ if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > thresholds.getMaximumInsertSize())
+ return false;
+
+ return true;
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java
new file mode 100644
index 000000000..17098567f
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
+
+class ThresHolder {
+ public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5);
+
+ private final int minimumBaseQuality;
+ private final int minimumMappingQuality;
+
+ private final int minimumCoverage;
+ private final int maximumCoverage;
+ private final int minimumMedianDepth;
+
+ private final int maximumInsertSize;
+
+ private final double votePercentageThreshold;
+ private final double lowMedianDepthThreshold;
+ private final double badMateStatusThreshold;
+ private final double coverageStatusThreshold;
+ private final double excessiveCoverageThreshold;
+ private final double qualityStatusThreshold;
+
+ public ThresHolder(int minimumBaseQuality,
+ int minimumMappingQuality,
+ int minimumCoverage,
+ int maximumCoverage,
+ int minimumMedianDepth,
+ int maximumInsertSize,
+ double votePercentageThreshold,
+ double lowMedianDepthThreshold,
+ double badMateStatusThreshold,
+ double coverageStatusThreshold,
+ double excessiveCoverageThreshold,
+ double qualityStatusThreshold) {
+ this.minimumBaseQuality = minimumBaseQuality;
+ this.minimumMappingQuality = minimumMappingQuality;
+ this.minimumCoverage = minimumCoverage;
+ this.maximumCoverage = maximumCoverage;
+ this.minimumMedianDepth = minimumMedianDepth;
+ this.maximumInsertSize = maximumInsertSize;
+ this.votePercentageThreshold = votePercentageThreshold;
+ this.lowMedianDepthThreshold = lowMedianDepthThreshold;
+ this.badMateStatusThreshold = badMateStatusThreshold;
+ this.coverageStatusThreshold = coverageStatusThreshold;
+ this.excessiveCoverageThreshold = excessiveCoverageThreshold;
+ this.qualityStatusThreshold = qualityStatusThreshold;
+ }
+
+ public int getMinimumBaseQuality() {
+ return minimumBaseQuality;
+ }
+
+ public int getMinimumMappingQuality() {
+ return minimumMappingQuality;
+ }
+
+ public int getMinimumCoverage() {
+ return minimumCoverage;
+ }
+
+ public int getMaximumCoverage() {
+ return maximumCoverage;
+ }
+
+ public int getMinimumMedianDepth() {
+ return minimumMedianDepth;
+ }
+
+ public int getMaximumInsertSize() {
+ return maximumInsertSize;
+ }
+
+ public double getVotePercentageThreshold() {
+ return votePercentageThreshold;
+ }
+
+ public double getLowMedianDepthThreshold() {
+ return lowMedianDepthThreshold;
+ }
+
+ public double getBadMateStatusThreshold() {
+ return badMateStatusThreshold;
+ }
+
+ public double getCoverageStatusThreshold() {
+ return coverageStatusThreshold;
+ }
+
+ public double getExcessiveCoverageThreshold() {
+ return excessiveCoverageThreshold;
+ }
+
+ public double getQualityStatusThreshold() {
+ return qualityStatusThreshold;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java
index bcd220dca..b15e303b3 100644
--- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java
+++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010, The Broad Institute
+ * Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@@ -696,26 +696,32 @@ public abstract class AbstractReadBackedPileup getPileupsForSamples(Collection sampleNames) {
Map result = new HashMap();
- Map> trackerMap = new HashMap>();
-
- for (String sample : sampleNames) { // initialize pileups for each sample
- UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker();
- trackerMap.put(sample, filteredTracker);
- }
-
- for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup
- GATKSAMRecord read = p.getRead();
- if (read.getReadGroup() != null) {
- String sample = read.getReadGroup().getSample();
- UnifiedPileupElementTracker tracker = trackerMap.get(sample);
- if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest
- tracker.add(p);
+ if (pileupElementTracker instanceof PerSamplePileupElementTracker) {
+ PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker;
+ for (String sample : sampleNames) {
+ PileupElementTracker filteredElements = tracker.getElements(sample);
+ if (filteredElements != null)
+ result.put(sample, createNewPileup(loc, filteredElements));
}
+ } else {
+ Map> trackerMap = new HashMap>();
+
+ for (String sample : sampleNames) { // initialize pileups for each sample
+ UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker();
+ trackerMap.put(sample, filteredTracker);
+ }
+ for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup
+ GATKSAMRecord read = p.getRead();
+ if (read.getReadGroup() != null) {
+ String sample = read.getReadGroup().getSample();
+ UnifiedPileupElementTracker tracker = trackerMap.get(sample);
+ if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest
+ tracker.add(p);
+ }
+ }
+ for (Map.Entry> entry : trackerMap.entrySet()) // create the RBP for each sample
+ result.put(entry.getKey(), createNewPileup(loc, entry.getValue()));
}
-
- for (Map.Entry> entry : trackerMap.entrySet()) // create the RBP for each sample
- result.put(entry.getKey(), createNewPileup(loc, entry.getValue()));
-
return result;
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java
new file mode 100644
index 000000000..c70adcc1a
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
+
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.Set;
+
+public class LocusStatisticsUnitTest /*extends BaseTest*/ {
+
+ @Test(dataProvider = "StatusTestValues")
+ public void testCallableStatuses(int coverage, int rawCoverage, CallableStatus status) {
+ // The min Coverage threshold is 10, the max is 100
+ ThresHolder thresholds = new ThresHolder(20, 20, 10, 100, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5);
+ Set statuses = new LocusStatistics(coverage, rawCoverage).callableStatuses(thresholds);
+ // Check to make sure the status provides matches the actual
+ Assert.assertTrue((status == null) ? statuses.isEmpty() : (statuses.contains(status) && statuses.size() == 1));
+
+ }
+
+ @DataProvider(name = "StatusTestValues")
+ public Object[][] getStatusTestValues() {
+ return new Object[][]{
+ new Object[]{100, 100, null},
+ new Object[]{100, 101, null},
+ new Object[]{101, 101, CallableStatus.EXCESSIVE_COVERAGE},
+ new Object[]{10, 101, null},
+ new Object[]{9, 101, CallableStatus.POOR_QUALITY},
+ new Object[]{9, 10, CallableStatus.POOR_QUALITY},
+ new Object[]{9, 9, CallableStatus.LOW_COVERAGE},
+ new Object[]{0, 0, CallableStatus.COVERAGE_GAPS},
+ new Object[]{0, 9, CallableStatus.LOW_COVERAGE},
+ new Object[]{0, 101, CallableStatus.POOR_QUALITY},
+ new Object[]{10, Integer.MAX_VALUE, null},
+ new Object[]{Integer.MAX_VALUE, Integer.MAX_VALUE, CallableStatus.EXCESSIVE_COVERAGE},
+ };
+ }
+
+}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java
new file mode 100644
index 000000000..4db5b6156
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
+
+import net.sf.samtools.SAMFileHeader;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+public class SampleStatisticsUnitTest/* extends BaseTest */ {
+
+ @DataProvider(name = "QuartileValues")
+ public Object[][] getQuantileValues() {
+
+ int[] a1 = {5};
+ int[] a2 = {1, 2};
+ int[] a5 = {10, 20, 30, 40, 50};
+ int[] a10 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+
+
+ return new Object[][]{
+ new Object[]{a1, 0.5, 5},
+ new Object[]{a1, 0, 5},
+ new Object[]{a1, 1, 5},
+ new Object[]{a2, 0.5, 1.5},
+ new Object[]{a2, 0.25, 1},
+ new Object[]{a2, 0.75, 2},
+ new Object[]{a5, 0.5, 30},
+ new Object[]{a5, 0.25, 20},
+ new Object[]{a5, 0.75, 40},
+ new Object[]{a5, 0, -1},
+ new Object[]{a10, 0.5, 5.5},
+ new Object[]{a10, 0.25, 3},
+ new Object[]{a10, 0.75, 8}
+ };
+ }
+
+ @Test(dataProvider = "QuartileValues")
+ public void testGetQuartile(int[] dataList, double percentage, double expected) {
+ Assert.assertEquals(SampleStatistics.getQuartile(dataList, percentage), expected);
+
+ }
+
+ @DataProvider(name = "ReadsAndMates")
+ public Object[][] getReadAndMates() {
+ SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
+
+ GATKSAMRecord noPair = ArtificialSAMUtils.createArtificialRead(header, "test", 0, 100, 50);
+ GATKSAMRecord good = ArtificialSAMUtils.createPair(header, "test", 30, 100, 150, true, false).get(0);
+ GATKSAMRecord bigInsertSize = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, false).get(0);
+ GATKSAMRecord inverted = ArtificialSAMUtils.createPair(header, "test", 30, 151, 150, true, false).get(0);
+ GATKSAMRecord sameOrientation = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, true).get(0);
+
+ GATKSAMRecord pairNotMapped = ArtificialSAMUtils.createPair(header, "test", 30, 100, 140, true, false).get(1);
+ pairNotMapped.setMateUnmappedFlag(true);
+
+ // finish test
+ return new Object[][]{
+ new Object[]{noPair, false},
+ new Object[]{good, true},
+ new Object[]{bigInsertSize, false},
+ new Object[]{inverted, false},
+ new Object[]{sameOrientation, false},
+ new Object[]{pairNotMapped, false}
+ };
+ }
+
+ @Test(dataProvider = "ReadsAndMates")
+ public void testHasValidMate(GATKSAMRecord read, boolean expected) {
+ //50 is out maximum insert size
+ Assert.assertEquals(new SampleStatistics(GenomeLoc.UNMAPPED).hasValidMate(read, ThresHolder.DEFAULTS), expected);
+ }
+
+}