From b8b139841d4bf9b6fcdeb85952aa85e09c8bcae3 Mon Sep 17 00:00:00 2001 From: Roger Zurawicki Date: Thu, 12 Apr 2012 18:54:29 -0400 Subject: [PATCH] DiagnoseTargets with working Q1,Median,Q3 - Merged Roger's metrics with Mauricio's optimizations - Added Stats for DiagnoseTargets - now has functions to find the median depth, and upper/lower quartile - the REF_N callable status is implemented - The walker now runs efficiently - Diagnose Targets accepts overlapping intervals - Diagnose Targets now checks for bad mates - The read mates are checked in a memory efficient manner - The statistics thresholds have been consolidated and moved outside of the statistics classes and into the walker. - Fixed some bugs - Removed rod binding Added more Unit tests - Test callable statuses on the locus level - Test bad mates - Changed NO_COVERAGE -> COVERAGE_GAPS to avoid confusion Signed-off-by: Mauricio Carneiro --- .../diagnostics/targets/CallableStatus.java | 37 ++-- .../diagnostics/targets/DiagnoseTargets.java | 167 +++++++++----- .../targets/IntervalStatistics.java | 69 ++++-- .../diagnostics/targets/LocusStatistics.java | 19 +- .../diagnostics/targets/SampleStatistics.java | 203 ++++++++++++++---- .../diagnostics/targets/ThresHolder.java | 119 ++++++++++ .../pileup/AbstractReadBackedPileup.java | 44 ++-- .../targets/LocusStatisticsUnitTest.java | 63 ++++++ .../targets/SampleStatisticsUnitTest.java | 99 +++++++++ 9 files changed, 651 insertions(+), 169 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java index 5d1685557..2aa4c977d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java @@ -31,38 +31,29 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; * @since 2/1/12 */ public enum CallableStatus { - /** - * the reference base was an N, which is not considered callable the GATK - */ - // todo -- implement this status + REF_N("the reference base was an N, which is not considered callable the GATK"), - /** - * the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE - */ + PASS("the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"), - /** - * absolutely no reads were seen at this locus, regardless of the filtering parameters - */ - NO_COVERAGE("absolutely no reads were seen at this locus, regardless of the filtering parameters"), - /** - * there were less than min. depth bases at the locus, after applying filters - */ + + COVERAGE_GAPS("absolutely no coverage was observed at a locus, regardless of the filtering parameters"), + LOW_COVERAGE("there were less than min. depth bases at the locus, after applying filters"), - /** - * more than -maxDepth read at the locus, indicating some sort of mapping problem - */ + EXCESSIVE_COVERAGE("more than -maxDepth read at the locus, indicating some sort of mapping problem"), - /** - * more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads - */ + POOR_QUALITY("more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads"), - BAD_MATE(""), + BAD_MATE("the reads are not properly mated, suggesting mapping errors"), - INCONSISTENT_COVERAGE(""); + NO_READS("there are no reads contained in the interval"), + // + // Interval-level statuses + // + LOW_MEDIAN_DEPTH("interval has insufficient median depth across samples"); - public String description; + public final String description; private CallableStatus(String description) { this.description = description; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index 7e2e6722e..b8a90892f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -25,89 +25,126 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import net.sf.picard.util.PeekableIterator; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import java.util.*; /** - * Short one line description of the walker. + * Analyzes coverage distribution and validates read mates for a given interval and sample. *

*

- * [Long description of the walker] + * Used to diagnose regions with bad coverage, mapping, or read mating. Analyzes each sample independently in addition + * to interval wide analysis. *

*

*

*

Input

*

- * [Description of the Input] + *

*

*

*

Output

*

- * [Description of the Output] + * A modified VCF detailing each interval by sample *

*

*

Examples

*
  *    java
  *      -jar GenomeAnalysisTK.jar
- *      -T [walker name]
+ *              -T DiagnoseTargets \
+ *              -R reference.fasta \
+ *              -o output.vcf \
+ *              -I sample1.bam \
+ *              -I sample2.bam \
+ *              -I sample3.bam \
+ *              -L intervals.interval_list
  *  
* - * @author Mauricio Carneiro - * @since 2/1/12 + * @author Mauricio Carneiro, Roger Zurawicki + * @since 5/8/12 */ @By(value = DataSource.READS) @PartitionBy(PartitionType.INTERVAL) -public class DiagnoseTargets extends LocusWalker implements AnnotatorCompatibleWalker { - @Input(fullName = "interval_track", shortName = "int", doc = "", required = true) - private IntervalBinding intervalTrack = null; +public class DiagnoseTargets extends LocusWalker { @Output(doc = "File to which variants should be written", required = true) private VariantContextWriter vcfWriter = null; - @Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false) + @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false) private int minimumBaseQuality = 20; - @Argument(fullName = "minimum_mapping_quality", shortName = "mmq", doc = "", required = false) + @Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false) private int minimumMappingQuality = 20; - @Argument(fullName = "minimum_coverage", shortName = "mincov", doc = "", required = false) + @Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false) private int minimumCoverage = 5; - @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false) + @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false) private int maximumCoverage = 700; + @Argument(fullName = "minimum_median_depth", shortName = "med", doc = "The minimum allowable median coverage, used for calling LOW_MEDIAN_DEPTH", required = false) + private int minMedianDepth = 20; + + @Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false) + private int maxInsertSize = 50; + + @Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed percentage of samples containing a call for the interval to adopt the call ", required = false) + private double votePercentage = 0.50; + + @Argument(fullName = "low_median_depth_status_threshold", shortName = "stMED", doc = "The percentage of the loci needed for calling LOW_MEDIAN_DEPTH", required = false) + private double lowMedianDepthPercentage = 0.20; + + @Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The percentage of the loci needed for calling BAD_MATE", required = false) + private double badMateStatusThreshold = 0.50; + + @Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The percentage of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false) + private double coverageStatusThreshold = 0.20; + + @Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The percentage of the loci needed for calling EXCESSIVE_COVERAGE", required = false) + private double excessiveCoverageThreshold = 0.20; + + @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The percentage of the loci needed for calling POOR_QUALITY", required = false) + private double qualityStatusThreshold = 0.50; + + @Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false) + private boolean debug = false; + private HashMap intervalMap = null; // interval => statistics private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome private Set samples = null; // all the samples being processed - private final Allele SYMBOLIC_ALLELE = Allele.create("
", false); // avoid creating the symbolic allele multiple times + private ThresHolder thresholds = null; @Override public void initialize() { super.initialize(); - if (intervalTrack == null) - throw new UserException("This tool currently only works if you provide an interval track"); + if (getToolkit().getIntervals() == null) + throw new UserException("This tool currently only works if you provide one or more interval"); + + thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold); intervalMap = new HashMap(); - intervalListIterator = new PeekableIterator(intervalTrack.getIntervals(getToolkit()).listIterator()); + intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header @@ -121,7 +158,7 @@ public class DiagnoseTargets extends LocusWalker implements Annotato addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus for (IntervalStatistics intervalStatistics : intervalMap.values()) - intervalStatistics.addLocus(context); // Add current locus to stats + intervalStatistics.addLocus(context, ref, thresholds); // Add current locus to stats return 1L; } @@ -151,45 +188,40 @@ public class DiagnoseTargets extends LocusWalker implements Annotato @Override public void onTraversalDone(Long result) { for (GenomeLoc interval : intervalMap.keySet()) - processIntervalStats(intervalMap.get(interval), Allele.create("A")); + outputStatsToVCF(intervalMap.get(interval), Allele.create("A", true)); } - @Override - public RodBinding getSnpEffRodBinding() {return null;} + private GenomeLoc getIntervalMapSpan() { + GenomeLoc loc = null; + for (GenomeLoc interval : intervalMap.keySet()) { + if (loc == null) { + loc = interval; + } else + loc = interval.union(loc); + } - @Override - public RodBinding getDbsnpRodBinding() {return null;} - - @Override - public List> getCompRodBindings() {return null;} - - @Override - public List> getResourceRodBindings() {return null;} - - @Override - public boolean alwaysAppendDbsnpId() {return false;} + return loc; + } /** * Removes all intervals that are behind the current reference locus from the intervalMap * * @param refLocus the current reference locus - * @param refBase the reference allele + * @param refBase the reference allele */ private void removePastIntervals(GenomeLoc refLocus, byte refBase) { - List toRemove = new LinkedList(); - for (GenomeLoc interval : intervalMap.keySet()) - if (interval.isBefore(refLocus)) { - processIntervalStats(intervalMap.get(interval), Allele.create(refBase, true)); - toRemove.add(interval); + // if all intervals are safe + if (getIntervalMapSpan() != null && getIntervalMapSpan().isBefore(refLocus)) { + for (GenomeLoc interval : intervalMap.keySet()) { + outputStatsToVCF(intervalMap.get(interval), Allele.create(refBase, true)); + intervalMap.remove(interval); } - - for (GenomeLoc interval : toRemove) - intervalMap.remove(interval); + } GenomeLoc interval = intervalListIterator.peek(); // clean up all intervals that we might have skipped because there was no data - while(interval != null && interval.isBefore(refLocus)) { + while (interval != null && interval.isBefore(refLocus)) { interval = intervalListIterator.next(); - processIntervalStats(createIntervalStatistic(interval), Allele.create(refBase, true)); + outputStatsToVCF(createIntervalStatistic(interval), Allele.create(refBase, true)); interval = intervalListIterator.peek(); } } @@ -202,7 +234,6 @@ public class DiagnoseTargets extends LocusWalker implements Annotato private void addNewOverlappingIntervals(GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); while (interval != null && !interval.isPast(refLocus)) { - System.out.println("LOCUS : " + refLocus + " -- " + interval); intervalMap.put(interval, createIntervalStatistic(interval)); intervalListIterator.next(); // discard the interval (we've already added it to the map) interval = intervalListIterator.peek(); @@ -210,14 +241,14 @@ public class DiagnoseTargets extends LocusWalker implements Annotato } /** - * Takes the interval, finds it in the stash, prints it to the VCF, and removes it + * Takes the interval, finds it in the stash, prints it to the VCF * - * @param stats The statistics of the interval + * @param stats The statistics of the interval * @param refAllele the reference allele */ - private void processIntervalStats(IntervalStatistics stats, Allele refAllele) { + private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) { GenomeLoc interval = stats.getInterval(); - + List alleles = new ArrayList(); Map attributes = new HashMap(); ArrayList genotypes = new ArrayList(); @@ -227,7 +258,7 @@ public class DiagnoseTargets extends LocusWalker implements Annotato VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF - vcb.filters(statusesToStrings(stats.callableStatuses())); + vcb.filters(statusesToStrings(stats.callableStatuses(thresholds))); attributes.put(VCFConstants.END_KEY, interval.getStop()); attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); @@ -236,16 +267,24 @@ public class DiagnoseTargets extends LocusWalker implements Annotato for (String sample : samples) { Map infos = new HashMap(); - infos.put(VCFConstants.DEPTH_KEY, stats.getSample(sample).averageCoverage()); + SampleStatistics sampleStat = stats.getSample(sample); + infos.put(VCFConstants.DEPTH_KEY, sampleStat.averageCoverage()); + infos.put("Q1", sampleStat.getQuantileDepth(0.25)); + infos.put("MED", sampleStat.getQuantileDepth(0.50)); + infos.put("Q3", sampleStat.getQuantileDepth(0.75)); Set filters = new HashSet(); - filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses())); + filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds))); genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false)); } vcb = vcb.genotypes(genotypes); + if (debug) { + System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage()); + } + vcfWriter.add(vcb.make()); } @@ -264,7 +303,12 @@ public class DiagnoseTargets extends LocusWalker implements Annotato headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // FORMAT fields for each genotype + // todo -- find the appropriate VCF constants headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution.")); + headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution.")); + headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution.")); + // FILTER fields for (CallableStatus stat : CallableStatus.values()) @@ -273,8 +317,13 @@ public class DiagnoseTargets extends LocusWalker implements Annotato return headerLines; } - - private static Set statusesToStrings(Set statuses) { + /** + * Function that process a set of statuses into strings + * + * @param statuses the set of statuses to be converted + * @return a matching set of strings + */ + private Set statusesToStrings(Set statuses) { Set output = new HashSet(statuses.size()); for (CallableStatus status : statuses) @@ -284,6 +333,6 @@ public class DiagnoseTargets extends LocusWalker implements Annotato } private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { - return new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality); + return new IntervalStatistics(samples, interval /*, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality*/); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index f3246407b..5731c41c5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -34,19 +35,24 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; -public class IntervalStatistics { +class IntervalStatistics { private final Map samples; private final GenomeLoc interval; + private boolean hasNref = false; private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - - public IntervalStatistics(Set samples, GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + /* + private double minMedianDepth = 20.0; + private double badMedianDepthPercentage = 0.20; + private double votePercentage = 0.50; + */ + public IntervalStatistics(Set samples, GenomeLoc interval/*, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality*/) { this.interval = interval; this.samples = new HashMap(samples.size()); for (String sample : samples) - this.samples.put(sample, new SampleStatistics(interval, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality)); + this.samples.put(sample, new SampleStatistics(interval /*, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality*/)); } public SampleStatistics getSample(String sample) { @@ -57,9 +63,19 @@ public class IntervalStatistics { return interval; } - public void addLocus(AlignmentContext context) { + /** + * The function to populate data into the Statistics from the walker. + * This takes the input and manages passing the data to the SampleStatistics and Locus Statistics + * + * @param context The alignment context given from the walker + * @param ref the reference context given from the walker + * @param thresholds the class contains the statistical threshold for making calls + */ + public void addLocus(AlignmentContext context, ReferenceContext ref, ThresHolder thresholds) { ReadBackedPileup pileup = context.getBasePileup(); + //System.out.println(ref.getLocus().toString()); + Map samplePileups = pileup.getPileupsForSamples(samples.keySet()); for (Map.Entry entry : samplePileups.entrySet()) { @@ -67,15 +83,16 @@ public class IntervalStatistics { ReadBackedPileup samplePileup = entry.getValue(); SampleStatistics sampleStatistics = samples.get(sample); - if (sampleStatistics == null) + if (sampleStatistics == null) throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); - - sampleStatistics.addLocus(context.getLocation(), samplePileup); + + sampleStatistics.addLocus(context.getLocation(), samplePileup, thresholds); } + if (!hasNref && ref.getBase() == 'N') + hasNref = true; } - public double averageCoverage() { if (preComputedTotalCoverage < 0) calculateTotalCoverage(); @@ -90,15 +107,43 @@ public class IntervalStatistics { /** * Return the Callable statuses for the interval as a whole - * todo -- add a voting system for sample flags and add interval specific statuses + * todo -- add missingness filter * + * @param thresholds the class contains the statistical threshold for making calls * @return the callable status(es) for the whole interval */ - public Set callableStatuses() { + public Set callableStatuses(ThresHolder thresholds) { Set output = new HashSet(); + // Initialize the Map + Map votes = new HashMap(); + for (CallableStatus status : CallableStatus.values()) + votes.put(status, 0); + + // tally up the votes for (SampleStatistics sample : samples.values()) - output.addAll(sample.getCallableStatuses()); + for (CallableStatus status : sample.getCallableStatuses(thresholds)) + votes.put(status, votes.get(status) + 1); + + // output tall values above the threshold + for (CallableStatus status : votes.keySet()) { + if (votes.get(status) > (samples.size() * thresholds.getVotePercentageThreshold()) && !(status.equals(CallableStatus.PASS))) + output.add(status); + } + + + if (hasNref) + output.add(CallableStatus.REF_N); + + // get median DP of each sample + int nLowMedianDepth = 0; + for (SampleStatistics sample : samples.values()) { + if (sample.getQuantileDepth(0.5) < thresholds.getMinimumMedianDepth()) + nLowMedianDepth++; + } + + if (nLowMedianDepth > (samples.size() * thresholds.getLowMedianDepthThreshold())) + output.add(CallableStatus.LOW_MEDIAN_DEPTH); return output; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java index 237ca1b1c..90563505d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java @@ -27,9 +27,9 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import java.util.HashSet; import java.util.Set; -public class LocusStatistics { - final int coverage; - final int rawCoverage; +class LocusStatistics { + private final int coverage; + private final int rawCoverage; public LocusStatistics() { this.coverage = 0; @@ -52,21 +52,20 @@ public class LocusStatistics { /** * Generates all applicable statuses from the coverages in this locus * - * @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality - * @param maximumCoverageThreshold the maximum threshold for determining excessive coverage + * @param thresholds the class contains the statistical threshold for making calls * @return a set of all statuses that apply */ - public Set callableStatuses(int minimumCoverageThreshold, int maximumCoverageThreshold) { + public Set callableStatuses(ThresHolder thresholds) { Set output = new HashSet(); // if too much coverage - if (getCoverage() > maximumCoverageThreshold) + if (getCoverage() > thresholds.getMaximumCoverage()) output.add(CallableStatus.EXCESSIVE_COVERAGE); // if not enough coverage - if (getCoverage() < minimumCoverageThreshold) { + if (getCoverage() < thresholds.getMinimumCoverage()) { // was there a lot of low Qual coverage? - if (getRawCoverage() >= minimumCoverageThreshold) + if (getRawCoverage() >= thresholds.getMinimumCoverage()) output.add(CallableStatus.POOR_QUALITY); // no? else { @@ -74,7 +73,7 @@ public class LocusStatistics { if (getRawCoverage() > 0) output.add(CallableStatus.LOW_COVERAGE); else - output.add(CallableStatus.NO_COVERAGE); + output.add(CallableStatus.COVERAGE_GAPS); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java index 9e4993853..958a44d4a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -27,41 +27,36 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; /** - * Short one line description of the walker. - * - * @author Mauricio Carneiro - * @since 2/1/12 + * The statistics calculator for a specific sample given the interval */ class SampleStatistics { private final GenomeLoc interval; private final ArrayList loci; - private final int minimumCoverageThreshold; - private final int maximumCoverageThreshold; - private final int minimumMappingQuality; - private final int minimumBaseQuality; + private int[] preSortedDepths = null; + private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) + private int nReads = -1; + private int nBadMates = -1; - private SampleStatistics(GenomeLoc interval, ArrayList loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + private SampleStatistics(GenomeLoc interval, ArrayList loci) { this.interval = interval; this.loci = loci; - this.minimumCoverageThreshold = minimumCoverageThreshold; - this.maximumCoverageThreshold = maximumCoverageThreshold; - this.minimumMappingQuality = minimumMappingQuality; - this.minimumBaseQuality = minimumBaseQuality; + nReads = 0; + nBadMates = 0; } - public SampleStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { - this(interval, new ArrayList(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); + public SampleStatistics(GenomeLoc interval) { + this(interval, new ArrayList(interval.size())); // Initialize every loci (this way we don't have to worry about non-existent loci in the object for (int i = 0; i < interval.size(); i++) - this.loci.add(i, new LocusStatistics()); + this.loci.add(new LocusStatistics()); } @@ -78,51 +73,56 @@ class SampleStatistics { } /** - * Calculates the callable statuses of the entire interval + * Calculates the callable statuses of the entire sample * - * @return the callable statuses of the entire interval + * @param thresholds the class contains the statistical threshold for making calls + * @return the callable statuses of the entire sample */ - public Set getCallableStatuses() { + public Set getCallableStatuses(ThresHolder thresholds) { + Set output = new HashSet(); - Map totals = new HashMap(CallableStatus.values().length); + // We check if reads are present ot prevent div / 0 exceptions + if (nReads == 0) { + output.add(CallableStatus.NO_READS); + return output; + } + + Map totals = new HashMap(CallableStatus.values().length); // initialize map for (CallableStatus status : CallableStatus.values()) - totals.put(status, 0); + totals.put(status, 0.0); // sum up all the callable statuses for each locus for (int i = 0; i < interval.size(); i++) { - for (CallableStatus status : callableStatus(i)) { - int count = totals.get(status); + for (CallableStatus status : callableStatus(i, thresholds)) { + double count = totals.get(status); totals.put(status, count + 1); } } - - Set output = new HashSet(); - - // double to avoid type casting double intervalSize = interval.size(); - double coverageStatusThreshold = 0.20; - if ((totals.get(CallableStatus.NO_COVERAGE) / intervalSize) > coverageStatusThreshold) - output.add(CallableStatus.NO_COVERAGE); + if ((nBadMates / nReads) > thresholds.getBadMateStatusThreshold()) + output.add(CallableStatus.BAD_MATE); - if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > coverageStatusThreshold) + if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) > thresholds.getCoverageStatusThreshold()) + output.add(CallableStatus.COVERAGE_GAPS); + + if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > thresholds.getCoverageStatusThreshold()) output.add(CallableStatus.LOW_COVERAGE); - double excessiveCoverageThreshold = 0.20; - if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > excessiveCoverageThreshold) + if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > thresholds.getExcessiveCoverageThreshold()) output.add(CallableStatus.EXCESSIVE_COVERAGE); - double qualityStatusThreshold = 0.50; - if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > qualityStatusThreshold) + if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > thresholds.getQualityStatusThreshold()) output.add(CallableStatus.POOR_QUALITY); if (totals.get(CallableStatus.REF_N) > 0) output.add(CallableStatus.REF_N); + if (output.isEmpty()) { output.add(CallableStatus.PASS); } @@ -132,12 +132,13 @@ class SampleStatistics { /** * Adds a locus to the interval wide stats * - * @param locus The locus given as a GenomeLoc - * @param pileup The pileup of that locus + * @param locus The locus given as a GenomeLoc + * @param pileup The pileup of that locus, this exclusively contains the sample + * @param thresholds the class contains the statistical threshold for making calls */ - public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) { + public void addLocus(GenomeLoc locus, ReadBackedPileup pileup, ThresHolder thresholds) { if (!interval.containsP(locus)) - throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus)); + throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval)); // a null pileup means there nothing ot add if (pileup != null) { @@ -145,31 +146,141 @@ class SampleStatistics { int locusIndex = locus.getStart() - interval.getStart(); int rawCoverage = pileup.depthOfCoverage(); - int coverage = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage(); + int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.getMinimumBaseQuality(), thresholds.getMinimumMappingQuality()).depthOfCoverage(); LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage); - loci.add(locusIndex, locusData); + loci.set(locusIndex, locusData); + + for (GATKSAMRecord read : pileup.getReads()) + processRead(read, thresholds); + } + } + + private void processRead(GATKSAMRecord read, ThresHolder thresholds) { + // Was this read already processed? + if (read.getTemporaryAttribute("checkedBadMate") == null) { + nReads++; + if (hasValidMate(read, thresholds)) + nBadMates++; + read.setTemporaryAttribute("checkedBadMate", true); } } /** - * returns the callable status of this locus without taking the reference base into account. + * returns the callable status of a given locus without taking the reference base into account. * * @param locusIndex location in the genome to inquire (only one locus) + * @param thresholds the class contains the statistical threshold for making calls * @return the callable status of a locus */ - private Set callableStatus(int locusIndex) { + private Set callableStatus(int locusIndex, ThresHolder thresholds) { LocusStatistics locus = loci.get(locusIndex); - return locus.callableStatuses(minimumCoverageThreshold, maximumCoverageThreshold); + return locus.callableStatuses(thresholds); } - private void calculateTotalCoverage() { preComputedTotalCoverage = 0; for (LocusStatistics locus : loci) preComputedTotalCoverage += locus.getCoverage(); } + public double getQuantileDepth(double percentage) { + if (preSortedDepths == null) + getDepthsAsSortedArray(); + + return getQuartile(preSortedDepths, percentage); + } + + static double getQuartile(int[] data, double percentage) { + int size = data.length; + if (size == 1) + return (double) data[0]; + + if (percentage == 0.5) { + return getMedian(data); + } + + double position = (size - 1.0) / 2; + if (percentage == 0.25) { + // if the position is a whole number + return getMedian(Arrays.copyOfRange(data, 0, (int) position + 1)); + + } + if (percentage == 0.75) { + if (position % 1 == 0) { + return getMedian(Arrays.copyOfRange(data, (int) position, size)); + } else { + return getMedian(Arrays.copyOfRange(data, (int) position + 1, size)); + } + } + return -1; + } + + // Assumes data is sorted + private static double getMedian(int[] data) { + double size = (double) data.length; + if (size == 1) + return (double) data[0]; + + double position = (size - 1.0) / 2; + + if (position % 1 == 0) + return (double) data[(int) position]; + + else { + double high = (double) data[(int) Math.ceil(position)]; + double low = (double) data[(int) Math.floor(position)]; + + return (high + low) / 2; + + } + + } + + private void getDepthsAsSortedArray() { + preSortedDepths = new int[loci.size()]; + + for (int i = 0; i < loci.size(); i++) + preSortedDepths[i] = loci.get(i).getCoverage(); + + Arrays.sort(preSortedDepths); + } + + boolean hasValidMate(GATKSAMRecord read, ThresHolder thresholds) { + /** Check the following + * Does it have a pair? + * reasonable insert size? + * inverted? + * same orientation? + * todo - same contig? + * is pair mapped? + * todo - is forced mate? + * + */ + + // has NO pair + if (!read.getReadPairedFlag()) + return false; + + // unmapped + if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag()) + return false; + + // same orientation + if (read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag()) + return false; + + // inverted + if (read.getReadNegativeStrandFlag() == + read.getAlignmentStart() < read.getMateAlignmentStart()) + return false; + + // mates are too far apart + if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > thresholds.getMaximumInsertSize()) + return false; + + return true; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java new file mode 100644 index 000000000..17098567f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +class ThresHolder { + public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5); + + private final int minimumBaseQuality; + private final int minimumMappingQuality; + + private final int minimumCoverage; + private final int maximumCoverage; + private final int minimumMedianDepth; + + private final int maximumInsertSize; + + private final double votePercentageThreshold; + private final double lowMedianDepthThreshold; + private final double badMateStatusThreshold; + private final double coverageStatusThreshold; + private final double excessiveCoverageThreshold; + private final double qualityStatusThreshold; + + public ThresHolder(int minimumBaseQuality, + int minimumMappingQuality, + int minimumCoverage, + int maximumCoverage, + int minimumMedianDepth, + int maximumInsertSize, + double votePercentageThreshold, + double lowMedianDepthThreshold, + double badMateStatusThreshold, + double coverageStatusThreshold, + double excessiveCoverageThreshold, + double qualityStatusThreshold) { + this.minimumBaseQuality = minimumBaseQuality; + this.minimumMappingQuality = minimumMappingQuality; + this.minimumCoverage = minimumCoverage; + this.maximumCoverage = maximumCoverage; + this.minimumMedianDepth = minimumMedianDepth; + this.maximumInsertSize = maximumInsertSize; + this.votePercentageThreshold = votePercentageThreshold; + this.lowMedianDepthThreshold = lowMedianDepthThreshold; + this.badMateStatusThreshold = badMateStatusThreshold; + this.coverageStatusThreshold = coverageStatusThreshold; + this.excessiveCoverageThreshold = excessiveCoverageThreshold; + this.qualityStatusThreshold = qualityStatusThreshold; + } + + public int getMinimumBaseQuality() { + return minimumBaseQuality; + } + + public int getMinimumMappingQuality() { + return minimumMappingQuality; + } + + public int getMinimumCoverage() { + return minimumCoverage; + } + + public int getMaximumCoverage() { + return maximumCoverage; + } + + public int getMinimumMedianDepth() { + return minimumMedianDepth; + } + + public int getMaximumInsertSize() { + return maximumInsertSize; + } + + public double getVotePercentageThreshold() { + return votePercentageThreshold; + } + + public double getLowMedianDepthThreshold() { + return lowMedianDepthThreshold; + } + + public double getBadMateStatusThreshold() { + return badMateStatusThreshold; + } + + public double getCoverageStatusThreshold() { + return coverageStatusThreshold; + } + + public double getExcessiveCoverageThreshold() { + return excessiveCoverageThreshold; + } + + public double getQualityStatusThreshold() { + return qualityStatusThreshold; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index bcd220dca..b15e303b3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -696,26 +696,32 @@ public abstract class AbstractReadBackedPileup getPileupsForSamples(Collection sampleNames) { Map result = new HashMap(); - Map> trackerMap = new HashMap>(); - - for (String sample : sampleNames) { // initialize pileups for each sample - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - trackerMap.put(sample, filteredTracker); - } - - for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup - GATKSAMRecord read = p.getRead(); - if (read.getReadGroup() != null) { - String sample = read.getReadGroup().getSample(); - UnifiedPileupElementTracker tracker = trackerMap.get(sample); - if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest - tracker.add(p); + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (String sample : sampleNames) { + PileupElementTracker filteredElements = tracker.getElements(sample); + if (filteredElements != null) + result.put(sample, createNewPileup(loc, filteredElements)); } + } else { + Map> trackerMap = new HashMap>(); + + for (String sample : sampleNames) { // initialize pileups for each sample + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + trackerMap.put(sample, filteredTracker); + } + for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup + GATKSAMRecord read = p.getRead(); + if (read.getReadGroup() != null) { + String sample = read.getReadGroup().getSample(); + UnifiedPileupElementTracker tracker = trackerMap.get(sample); + if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest + tracker.add(p); + } + } + for (Map.Entry> entry : trackerMap.entrySet()) // create the RBP for each sample + result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); } - - for (Map.Entry> entry : trackerMap.entrySet()) // create the RBP for each sample - result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); - return result; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java new file mode 100644 index 000000000..c70adcc1a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Set; + +public class LocusStatisticsUnitTest /*extends BaseTest*/ { + + @Test(dataProvider = "StatusTestValues") + public void testCallableStatuses(int coverage, int rawCoverage, CallableStatus status) { + // The min Coverage threshold is 10, the max is 100 + ThresHolder thresholds = new ThresHolder(20, 20, 10, 100, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5); + Set statuses = new LocusStatistics(coverage, rawCoverage).callableStatuses(thresholds); + // Check to make sure the status provides matches the actual + Assert.assertTrue((status == null) ? statuses.isEmpty() : (statuses.contains(status) && statuses.size() == 1)); + + } + + @DataProvider(name = "StatusTestValues") + public Object[][] getStatusTestValues() { + return new Object[][]{ + new Object[]{100, 100, null}, + new Object[]{100, 101, null}, + new Object[]{101, 101, CallableStatus.EXCESSIVE_COVERAGE}, + new Object[]{10, 101, null}, + new Object[]{9, 101, CallableStatus.POOR_QUALITY}, + new Object[]{9, 10, CallableStatus.POOR_QUALITY}, + new Object[]{9, 9, CallableStatus.LOW_COVERAGE}, + new Object[]{0, 0, CallableStatus.COVERAGE_GAPS}, + new Object[]{0, 9, CallableStatus.LOW_COVERAGE}, + new Object[]{0, 101, CallableStatus.POOR_QUALITY}, + new Object[]{10, Integer.MAX_VALUE, null}, + new Object[]{Integer.MAX_VALUE, Integer.MAX_VALUE, CallableStatus.EXCESSIVE_COVERAGE}, + }; + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java new file mode 100644 index 000000000..4db5b6156 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class SampleStatisticsUnitTest/* extends BaseTest */ { + + @DataProvider(name = "QuartileValues") + public Object[][] getQuantileValues() { + + int[] a1 = {5}; + int[] a2 = {1, 2}; + int[] a5 = {10, 20, 30, 40, 50}; + int[] a10 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + + return new Object[][]{ + new Object[]{a1, 0.5, 5}, + new Object[]{a1, 0, 5}, + new Object[]{a1, 1, 5}, + new Object[]{a2, 0.5, 1.5}, + new Object[]{a2, 0.25, 1}, + new Object[]{a2, 0.75, 2}, + new Object[]{a5, 0.5, 30}, + new Object[]{a5, 0.25, 20}, + new Object[]{a5, 0.75, 40}, + new Object[]{a5, 0, -1}, + new Object[]{a10, 0.5, 5.5}, + new Object[]{a10, 0.25, 3}, + new Object[]{a10, 0.75, 8} + }; + } + + @Test(dataProvider = "QuartileValues") + public void testGetQuartile(int[] dataList, double percentage, double expected) { + Assert.assertEquals(SampleStatistics.getQuartile(dataList, percentage), expected); + + } + + @DataProvider(name = "ReadsAndMates") + public Object[][] getReadAndMates() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + GATKSAMRecord noPair = ArtificialSAMUtils.createArtificialRead(header, "test", 0, 100, 50); + GATKSAMRecord good = ArtificialSAMUtils.createPair(header, "test", 30, 100, 150, true, false).get(0); + GATKSAMRecord bigInsertSize = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, false).get(0); + GATKSAMRecord inverted = ArtificialSAMUtils.createPair(header, "test", 30, 151, 150, true, false).get(0); + GATKSAMRecord sameOrientation = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, true).get(0); + + GATKSAMRecord pairNotMapped = ArtificialSAMUtils.createPair(header, "test", 30, 100, 140, true, false).get(1); + pairNotMapped.setMateUnmappedFlag(true); + + // finish test + return new Object[][]{ + new Object[]{noPair, false}, + new Object[]{good, true}, + new Object[]{bigInsertSize, false}, + new Object[]{inverted, false}, + new Object[]{sameOrientation, false}, + new Object[]{pairNotMapped, false} + }; + } + + @Test(dataProvider = "ReadsAndMates") + public void testHasValidMate(GATKSAMRecord read, boolean expected) { + //50 is out maximum insert size + Assert.assertEquals(new SampleStatistics(GenomeLoc.UNMAPPED).hasValidMate(read, ThresHolder.DEFAULTS), expected); + } + +}