diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index f6cf07aae..7f8c35c96 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -51,11 +51,10 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import java.io.File; -import java.io.FileNotFoundException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; -import java.util.concurrent.*; +import java.util.concurrent.Callable; /** * User: aaron @@ -466,7 +465,6 @@ public class SAMDataSource { /** * Fill the given buffering shard with reads. * @param shard Shard to fill. - * @return true if at the end of the stream. False otherwise. */ public void fillShard(Shard shard) { if(!shard.buffersReads()) @@ -561,15 +559,19 @@ public class SAMDataSource { if(shard.getFileSpans().get(id) == null) throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); - if(threadAllocation.getNumIOThreads() > 0) { - BlockInputStream inputStream = readers.getInputStream(id); - inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); - BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); - codec.setInputStream(inputStream); - iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); - } - else { - iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + try { + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream inputStream = readers.getInputStream(id); + inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) 
shard.getFileSpans().get(id))); + BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); + codec.setInputStream(inputStream); + iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); + } + else { + iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + } + } catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes + throw new UserException.MalformedBAM(id.samFile, e.getMessage()); } iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); @@ -924,10 +926,7 @@ public class SAMDataSource { blockInputStream = new BlockInputStream(dispatcher,readerID,false); reader = new SAMFileReader(readerID.samFile,indexFile,false); } catch ( RuntimeIOException e ) { - if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException ) - throw new UserException.CouldNotReadInputFile(readerID.samFile, e); - else - throw e; + throw new UserException.CouldNotReadInputFile(readerID.samFile, e); } catch ( SAMFormatException e ) { throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 1dfc6fea0..2a8940de0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -1,23 +1,25 @@ /* - * Copyright (c) 2009 The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and 
to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * * OTHER DEALINGS IN THE SOFTWARE. + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
*/ package org.broadinstitute.sting.gatk.walkers.coverage; @@ -42,40 +44,40 @@ import java.io.PrintStream; /** * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome - * + *
** A very common question about a NGS set of reads is what areas of the genome are considered callable. The system * considers the coverage at each locus and emits either a per base state or a summary interval BED file that * partitions the genomic intervals into the following callable states: *
- * A BAM file containing exactly one sample. + * A BAM file containing exactly one sample. *
- * + * **
* -T CallableLociWalker \ @@ -83,31 +85,31 @@ import java.io.PrintStream; * -summary my.summary \ * -o my.bed *- * + * * would produce a BED file (my.bed) that looks like: - * + * *
- * 20 10000000 10000864 CALLABLE + * 20 10000000 10000864 PASS * 20 10000865 10000985 POOR_MAPPING_QUALITY - * 20 10000986 10001138 CALLABLE + * 20 10000986 10001138 PASS * 20 10001139 10001254 POOR_MAPPING_QUALITY - * 20 10001255 10012255 CALLABLE + * 20 10001255 10012255 PASS * 20 10012256 10012259 POOR_MAPPING_QUALITY - * 20 10012260 10012263 CALLABLE + * 20 10012260 10012263 PASS * 20 10012264 10012328 POOR_MAPPING_QUALITY - * 20 10012329 10012550 CALLABLE + * 20 10012329 10012550 PASS * 20 10012551 10012551 LOW_COVERAGE - * 20 10012552 10012554 CALLABLE + * 20 10012552 10012554 PASS * 20 10012555 10012557 LOW_COVERAGE - * 20 10012558 10012558 CALLABLE + * 20 10012558 10012558 PASS * et cetera... ** as well as a summary table that looks like: - * + * *
* state nBases * REF_N 0 - * CALLABLE 996046 + * PASS 996046 * NO_COVERAGE 121 * LOW_COVERAGE 928 * EXCESSIVE_COVERAGE 0 @@ -139,21 +141,21 @@ public class CallableLociWalker extends LocusWalkerminMappingQuality are treated as usable for variation detection, contributing to the CALLABLE + * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the PASS * state. */ @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false) byte minMappingQuality = 10; /** - * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state + * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the PASS state */ @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false) byte minBaseQuality = 20; /** * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this - * value and is less than maxDepth the site is considered CALLABLE. + * value and is less than maxDepth the site is considered PASS. 
*/ @Advanced @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false) @@ -191,7 +193,7 @@ public class CallableLociWalker extends LocusWalker = minMappingQuality && ( e.getQual() >= minBaseQuality || e.isDeletion() ) ) { + if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) { QCDepth++; } } //System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth); - if ( rawDepth == 0 ) { + if (rawDepth == 0) { state = CalledState.NO_COVERAGE; - } else if ( rawDepth >= minDepthLowMAPQ && MathUtils.ratio( lowMAPQDepth, rawDepth ) >= maxLowMAPQFraction ) { + } else if (rawDepth >= minDepthLowMAPQ && MathUtils.ratio(lowMAPQDepth, rawDepth) >= maxLowMAPQFraction) { state = CalledState.POOR_MAPPING_QUALITY; - } else if ( QCDepth < minDepth ) { + } else if (QCDepth < minDepth) { state = CalledState.LOW_COVERAGE; - } else if ( rawDepth >= maxDepth && maxDepth != -1 ) { + } else if (rawDepth >= maxDepth && maxDepth != -1) { state = CalledState.EXCESSIVE_COVERAGE; } else { state = CalledState.CALLABLE; } } - return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state); + return new CallableBaseState(getToolkit().getGenomeLocParser(), context.getLocation(), state); } @Override @@ -328,15 +345,15 @@ public class CallableLociWalker extends LocusWalker * * [Long description of the walker] *
- * - * + * + * *Input
** [Description of the Input] *
- * + * *Output
** [Description of the Output] *
- * + * *Examples
** java @@ -51,12 +73,13 @@ import java.util.TreeSet; * @since 2/1/12 */ @By(value = DataSource.READS) -public class DiagnoseTargets extends LocusWalker{ +@PartitionBy(PartitionType.INTERVAL) +public class DiagnoseTargets extends LocusWalker implements AnnotatorCompatibleWalker { @Input(fullName = "interval_track", shortName = "int", doc = "", required = true) private IntervalBinding intervalTrack = null; - @Output - private PrintStream out = System.out; + @Output(doc = "File to which variants should be written", required = true) + protected VCFWriter vcfWriter = null; @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false) private int expandInterval = 50; @@ -73,13 +96,13 @@ public class DiagnoseTargets extends LocusWalker { @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false) private int maximumCoverage = 700; - private TreeSet intervalList = null; // The list of intervals of interest (plus expanded intervals if user wants them) - private HashMap intervalMap = null; // interval => statistics - private Iterator intervalListIterator; // An iterator to go over all the intervals provided as we traverse the genome - private GenomeLoc currentInterval = null; // The "current" interval loaded and being filled with statistics - private IntervalStatistics currentIntervalStatistics = null; // The "current" interval loaded and being filled with statistics - - private GenomeLocParser parser; // just an object to allow us to create genome locs (for the expanded intervals) + private TreeSet intervalList = null; // The list of intervals of interest (plus expanded intervals if user wants them) + private HashMap intervalMap = null; // interval => statistics + private Iterator intervalListIterator; // An iterator to go over all the intervals provided as we traverse the genome + private GenomeLoc currentInterval = null; // The "current" interval loaded + private IntervalStatistics currentIntervalStatistics = null; 
// The "current" interval being filled with statistics + private Set samples = null; // All the samples being processed + private GenomeLocParser parser; // just an object to allow us to create genome locs (for the expanded intervals) @Override public void initialize() { @@ -88,38 +111,72 @@ public class DiagnoseTargets extends LocusWalker { if (intervalTrack == null) throw new UserException("This tool currently only works if you provide an interval track"); - parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below + parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below - List originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided + List originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided intervalList = new TreeSet (new GenomeLocComparator()); - intervalMap = new HashMap (originalList.size() * 2); + intervalMap = new HashMap (); for (GenomeLoc interval : originalList) - addAndExpandIntervalToLists(interval); + intervalList.add(interval); + //addAndExpandIntervalToMap(interval); intervalListIterator = intervalList.iterator(); + + // get all of the unique sample names + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + + // initialize the header + Set headerInfo = getHeaderInfo(); + + vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); + } + + /** + * Gets the header lines for the VCF writer + * + * @return A set of VCF header lines + */ + private Set getHeaderInfo() { + Set headerLines = new HashSet (); + + // INFO fields for overall data + headerLines.add(new VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, 
"Stop position of the interval")); + headerLines.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools")); + headerLines.add(new VCFInfoHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); + + // FORMAT fields for each genotype + headerLines.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools")); + headerLines.add(new VCFFormatHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + + // FILTER fields + + for (CallableStatus stat : CallableStatus.values()) { + headerLines.add(new VCFHeaderLine(stat.name(), stat.description)); + } + + return headerLines; } @Override public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); - while (currentInterval == null || currentInterval.isBefore(refLocus)) { + while (currentInterval == null || currentInterval.isBefore(refLocus)) { // do this for first time and while currentInterval is behind current locus if (!intervalListIterator.hasNext()) return 0L; + if (currentInterval != null) + processIntervalStats(currentInterval, Allele.create(ref.getBase(), true)); + currentInterval = intervalListIterator.next(); + addAndExpandIntervalToMap(currentInterval); currentIntervalStatistics = intervalMap.get(currentInterval); } - if (currentInterval.isPast(refLocus)) + if (currentInterval.isPast(refLocus)) // skip if we are behind the current interval return 0L; - byte[] mappingQualities = context.getBasePileup().getMappingQuals(); - byte[] baseQualities = context.getBasePileup().getQuals(); - int coverage = 
context.getBasePileup().getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage(); - int rawCoverage = context.size(); - - IntervalStatisticLocus locusData = new IntervalStatisticLocus(mappingQualities, baseQualities, coverage, rawCoverage); - currentIntervalStatistics.addLocus(refLocus, locusData); + currentIntervalStatistics.addLocus(context); // Add current locus to stats return 1L; } @@ -129,6 +186,13 @@ public class DiagnoseTargets extends LocusWalker { return 0L; } + /** + * Not sure what we are going to do here + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return a long + */ @Override public Long reduce(Long value, Long sum) { return sum + value; @@ -136,14 +200,25 @@ public class DiagnoseTargets extends LocusWalker { @Override public void onTraversalDone(Long result) { - super.onTraversalDone(result); - out.println("Interval\tCallStatus\tCOV\tAVG"); - for (GenomeLoc interval : intervalList) { - IntervalStatistics stats = intervalMap.get(interval); - out.println(String.format("%s\t%s\t%d\t%f", interval, stats.callableStatus(), stats.totalCoverage(), stats.averageCoverage())); - } + for (GenomeLoc interval : intervalMap.keySet()) + processIntervalStats(interval, Allele.create(" ", true)); } + @Override + public RodBinding getSnpEffRodBinding() {return null;} + + @Override + public RodBinding getDbsnpRodBinding() {return null;} + + @Override + public List > getCompRodBindings() {return null;} + + @Override + public List > getResourceRodBindings() {return null;} + + @Override + public boolean alwaysAppendDbsnpId() {return false;} + private GenomeLoc createIntervalBefore(GenomeLoc interval) { int start = Math.max(interval.getStart() - expandInterval, 0); int stop = Math.max(interval.getStart() - 1, 0); @@ -157,16 +232,75 @@ public class DiagnoseTargets extends LocusWalker { return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); } - private void 
addAndExpandIntervalToLists(GenomeLoc interval) { + /** + * Takes an interval and commits it to memory. + * It will expand it if so told by the -exp command line argument + * + * @param interval The new interval to process + */ + private void addAndExpandIntervalToMap(GenomeLoc interval) { if (expandInterval > 0) { GenomeLoc before = createIntervalBefore(interval); GenomeLoc after = createIntervalAfter(interval); intervalList.add(before); intervalList.add(after); - intervalMap.put(before, new IntervalStatistics(before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); - intervalMap.put(after, new IntervalStatistics(after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + intervalMap.put(before, new IntervalStatistics(samples, before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + intervalMap.put(after, new IntervalStatistics(samples, after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); } - intervalList.add(interval); - intervalMap.put(interval, new IntervalStatistics(interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + if (!intervalList.contains(interval)) + intervalList.add(interval); + intervalMap.put(interval, new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + } + + /** + * Takes the interval, finds it in the stash, prints it to the VCF, and removes it + * + * @param interval The interval in memory that you want to write out and clear + * @param allele the allele + */ + private void processIntervalStats(GenomeLoc interval, Allele allele) { + IntervalStatistics stats = intervalMap.get(interval); + + List alleles = new ArrayList (); + Map attributes = new HashMap (); + ArrayList genotypes = new ArrayList (); + + alleles.add(allele); + VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), 
interval.getStart(), interval.getStop(), alleles); + + vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF + vcb.filters(statusesToStrings(stats.callableStatuses())); + + attributes.put(VCFConstants.END_KEY, interval.getStop()); + attributes.put(VCFConstants.DEPTH_KEY, stats.totalCoverage()); + attributes.put("AV", stats.averageCoverage()); + + vcb = vcb.attributes(attributes); + + for (String sample : samples) { + Map infos = new HashMap (); + infos.put("DP", stats.getSample(sample).totalCoverage()); + infos.put("AV", stats.getSample(sample).averageCoverage()); + + Set filters = new HashSet (); + filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses())); + + + genotypes.add(new Genotype(sample, alleles, VariantContext.NO_LOG10_PERROR, filters, infos, false)); + } + vcb = vcb.genotypes(genotypes); + + vcfWriter.add(vcb.make()); + + intervalMap.remove(interval); + } + + private static Set statusesToStrings(Set statuses) { + Set output = new HashSet (statuses.size()); + + for (CallableStatus status : statuses) + output.add(status.name()); + + return output; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java deleted file mode 100644 index 5620c3902..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; - -/** - * The definition of a locus for the DiagnoseTargets walker statistics calculation - * - * @author Mauricio Carneiro - * @since 2/3/12 - */ -class IntervalStatisticLocus { - private final byte[] mappingQuality; - private final byte[] baseQuality; - private final int coverage; - private final int rawCoverage; - - public IntervalStatisticLocus(byte[] mappingQuality, 
byte[] baseQuality, int coverage, int rawCoverage) { - this.mappingQuality = mappingQuality; - this.baseQuality = baseQuality; - this.coverage = coverage; - this.rawCoverage = rawCoverage; - } - - public IntervalStatisticLocus() { - this(new byte[1], new byte[1], 0, 0); - } - - public int getCoverage() { - return coverage; - } - - public int getRawCoverage() { - return rawCoverage; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 8ee5f76fb..75f56808f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -1,44 +1,62 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; -/** - * Short one line description of the walker. - * - * @author Mauricio Carneiro - * @since 2/1/12 - */ -class IntervalStatistics { +public class IntervalStatistics { + + private final Map samples; private final GenomeLoc interval; - private final ArrayList loci; - private final int minimumCoverageThreshold; - private final int maximumCoverageThreshold; - private final int minimumMappingQuality; - private final int minimumBaseQuality; + private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - private IntervalStatistics(GenomeLoc interval, ArrayList loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + public IntervalStatistics(Set samples, GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { this.interval = interval; - this.loci = loci; - this.minimumCoverageThreshold = minimumCoverageThreshold; - this.maximumCoverageThreshold = maximumCoverageThreshold; - this.minimumMappingQuality = minimumMappingQuality; - 
this.minimumBaseQuality = minimumBaseQuality; + this.samples = new HashMap (samples.size()); + for (String sample : samples) + this.samples.put(sample, new SampleStatistics(interval, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality)); } - public IntervalStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { - this(interval, new ArrayList (interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); + public SampleStatistics getSample(String sample) { + return samples.get(sample); + } - // Initialize every loci (this way we don't have to worry about non-existent loci in the object - for (int i = 0; i < interval.size(); i++) - this.loci.add(i, new IntervalStatisticLocus()); + public void addLocus(AlignmentContext context) { + ReadBackedPileup pileup = context.getBasePileup(); + for (String sample : samples.keySet()) + getSample(sample).addLocus(context.getLocation(), pileup.getPileupForSample(sample)); } public long totalCoverage() { @@ -50,73 +68,27 @@ class IntervalStatistics { public double averageCoverage() { if (preComputedTotalCoverage < 0) calculateTotalCoverage(); - return (double) preComputedTotalCoverage / loci.size(); - } - - /** - * Calculates the callable status of the entire interval - * - * @return the callable status of the entire interval - */ - public CallableStatus callableStatus() { - long max = -1; - CallableStatus maxCallableStatus = null; - HashMap statusCounts = new HashMap (CallableStatus.values().length); - - // initialize the statusCounts with all callable states - for (CallableStatus key : CallableStatus.values()) - statusCounts.put(key, 0); - - // calculate the callable status for each locus - for (int i = 0; i < loci.size(); i++) { - CallableStatus status = callableStatus(i); - int count = statusCounts.get(status) + 1; - statusCounts.put(status, count); 
- - if (count > max) { - max = count; - maxCallableStatus = status; - } - } - - return maxCallableStatus; - } - - public void addLocus(GenomeLoc locus, IntervalStatisticLocus locusData) { - if (!interval.containsP(locus)) - throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus)); - - int locusIndex = locus.getStart() - interval.getStart(); - - loci.add(locusIndex, locusData); - } - - /** - * returns the callable status of this locus without taking the reference base into account. - * - * @param locusIndex location in the genome to inquire (only one locus) - * @return the callable status of a locus - */ - private CallableStatus callableStatus(int locusIndex) { - if (loci.get(locusIndex).getCoverage() > maximumCoverageThreshold) - return CallableStatus.EXCESSIVE_COVERAGE; - - if (loci.get(locusIndex).getCoverage() >= minimumCoverageThreshold) - return CallableStatus.CALLABLE; - - if (loci.get(locusIndex).getRawCoverage() >= minimumCoverageThreshold) - return CallableStatus.POOR_QUALITY; - - if (loci.get(locusIndex).getRawCoverage() > 0) - return CallableStatus.LOW_COVERAGE; - - return CallableStatus.NO_COVERAGE; + return (double) preComputedTotalCoverage / interval.size(); } private void calculateTotalCoverage() { preComputedTotalCoverage = 0; - for (IntervalStatisticLocus locus : loci) - preComputedTotalCoverage += locus.getCoverage(); + for (SampleStatistics sample : samples.values()) + preComputedTotalCoverage += sample.totalCoverage(); } + /** + * Return the Callable statuses for the interval as a whole + * todo -- add a voting system for sample flags and add interval specific statuses + * + * @return the callable status(es) for the whole interval + */ + public Set callableStatuses() { + Set output = new HashSet (); + + for (SampleStatistics sample : samples.values()) + output.addAll(sample.getCallableStatuses()); + + return output; + } } diff --git 
// ===== File: public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java =====
/*
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;

import java.util.HashSet;
import java.util.Set;

/**
 * Per-locus coverage statistics: the quality-filtered coverage and the raw
 * (unfiltered) coverage at a single position, plus the logic to translate
 * those two depths into callable statuses.
 */
public class LocusStatistics {
    // coverage after base- and mapping-quality filtering
    final int coverage;
    // coverage before any quality filtering
    final int rawCoverage;

    /** Creates an empty locus (no reads observed). */
    public LocusStatistics() {
        this.coverage = 0;
        this.rawCoverage = 0;
    }

    /**
     * @param coverage    depth after base/mapping quality filters
     * @param rawCoverage depth before any filtering
     */
    public LocusStatistics(int coverage, int rawCoverage) {
        this.coverage = coverage;
        this.rawCoverage = rawCoverage;
    }

    public int getCoverage() {
        return coverage;
    }

    public int getRawCoverage() {
        return rawCoverage;
    }

    /**
     * Generates all applicable statuses from the coverages in this locus.
     *
     * @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality
     * @param maximumCoverageThreshold the maximum threshold for determining excessive coverage
     * @return a set of all statuses that apply
     */
    public Set<CallableStatus> callableStatuses(int minimumCoverageThreshold, int maximumCoverageThreshold) {
        Set<CallableStatus> output = new HashSet<CallableStatus>();

        // too much coverage?
        if (getCoverage() > maximumCoverageThreshold)
            output.add(CallableStatus.EXCESSIVE_COVERAGE);

        // not enough filtered coverage?
        if (getCoverage() < minimumCoverageThreshold) {
            if (getRawCoverage() >= minimumCoverageThreshold) {
                // enough raw reads were present but they were filtered out
                output.add(CallableStatus.POOR_QUALITY);
            }
            else if (getRawCoverage() > 0) {
                // some reads, just not enough
                output.add(CallableStatus.LOW_COVERAGE);
            }
            else {
                // no reads at all
                output.add(CallableStatus.NO_COVERAGE);
            }
        }

        return output;
    }
}

// ===== File: public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java =====
/*
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;

import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;

import java.util.*;

/**
 * Aggregates per-locus coverage statistics for one sample over a whole
 * interval, and summarizes them into interval-wide callable statuses.
 *
 * @author Mauricio Carneiro
 * @since 2/1/12
 */
class SampleStatistics {
    private final GenomeLoc interval;
    // one LocusStatistics per position in the interval, index 0 == interval start
    private final ArrayList<LocusStatistics> loci;

    private final int minimumCoverageThreshold;
    private final int maximumCoverageThreshold;
    private final int minimumMappingQuality;
    private final int minimumBaseQuality;

    // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
    private int preComputedTotalCoverage = -1;

    private SampleStatistics(GenomeLoc interval, ArrayList<LocusStatistics> loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
        this.interval = interval;
        this.loci = loci;
        this.minimumCoverageThreshold = minimumCoverageThreshold;
        this.maximumCoverageThreshold = maximumCoverageThreshold;
        this.minimumMappingQuality = minimumMappingQuality;
        this.minimumBaseQuality = minimumBaseQuality;
    }

    public SampleStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
        this(interval, new ArrayList<LocusStatistics>(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality);

        // Initialize every locus (this way we don't have to worry about non-existent loci in the object)
        for (int i = 0; i < interval.size(); i++)
            this.loci.add(new LocusStatistics());
    }

    public long totalCoverage() {
        if (preComputedTotalCoverage < 0)
            calculateTotalCoverage();
        return preComputedTotalCoverage;
    }

    public double averageCoverage() {
        if (preComputedTotalCoverage < 0)
            calculateTotalCoverage();
        return (double) preComputedTotalCoverage / loci.size();
    }

    /**
     * Calculates the callable statuses of the entire interval.
     *
     * @return the callable statuses of the entire interval
     */
    public Set<CallableStatus> getCallableStatuses() {
        Map<CallableStatus, Integer> totals = new HashMap<CallableStatus, Integer>(CallableStatus.values().length);

        // initialize map
        for (CallableStatus status : CallableStatus.values())
            totals.put(status, 0);

        // sum up all the callable statuses for each locus
        for (int i = 0; i < interval.size(); i++) {
            for (CallableStatus status : callableStatus(i)) {
                int count = totals.get(status);
                totals.put(status, count + 1);
            }
        }

        Set<CallableStatus> output = new HashSet<CallableStatus>();

        // double to avoid integer division below
        double intervalSize = interval.size();

        // a status is reported for the interval only when it affects more than
        // the given fraction of its loci
        double coverageStatusThreshold = 0.20;
        if ((totals.get(CallableStatus.NO_COVERAGE) / intervalSize) > coverageStatusThreshold)
            output.add(CallableStatus.NO_COVERAGE);

        if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > coverageStatusThreshold)
            output.add(CallableStatus.LOW_COVERAGE);

        double excessiveCoverageThreshold = 0.20;
        if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > excessiveCoverageThreshold)
            output.add(CallableStatus.EXCESSIVE_COVERAGE);

        double qualityStatusThreshold = 0.50;
        if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > qualityStatusThreshold)
            output.add(CallableStatus.POOR_QUALITY);

        // any reference N at all is reported
        if (totals.get(CallableStatus.REF_N) > 0)
            output.add(CallableStatus.REF_N);

        if (output.isEmpty()) {
            output.add(CallableStatus.PASS);
        }
        return output;
    }

    /**
     * Adds a locus to the interval wide stats.
     *
     * @param locus  The locus given as a GenomeLoc
     * @param pileup The pileup of that locus
     * @throws ReviewedStingException if the locus is outside this interval
     */
    public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) {
        if (!interval.containsP(locus))
            throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus));

        // a null pileup means there is nothing to add
        if (pileup != null) {
            int locusIndex = locus.getStart() - interval.getStart();

            int rawCoverage = pileup.depthOfCoverage();
            int coverage = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();

            // BUGFIX: must be set(), not add(index, e) -- the list is pre-filled in the
            // constructor, so add() would insert-and-shift, misaligning every subsequent
            // locus and growing the list past interval.size() (skewing averageCoverage)
            loci.set(locusIndex, new LocusStatistics(coverage, rawCoverage));
        }
    }

    /**
     * Returns the callable status of this locus without taking the reference base into account.
     *
     * @param locusIndex location in the genome to inquire (only one locus)
     * @return the callable status of a locus
     */
    private Set<CallableStatus> callableStatus(int locusIndex) {
        LocusStatistics locus = loci.get(locusIndex);
        return locus.callableStatuses(minimumCoverageThreshold, maximumCoverageThreshold);
    }

    /** Sums the filtered coverage across all loci into the cached total. */
    private void calculateTotalCoverage() {
        preComputedTotalCoverage = 0;
        for (LocusStatistics locus : loci)
            preComputedTotalCoverage += locus.getCoverage();
    }
}
alleleCountsOfMAP; // The posteriors seen, not including that of AF=0 - // TODO -- better implementation needed here (see below) - private ArrayList log10PosteriorMatrixValues = new ArrayList (100000); + private static final int POSTERIORS_CACHE_SIZE = 5000; + private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE]; + private int currentPosteriorsCacheIndex = 0; private Double log10PosteriorMatrixSum = null; // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) @@ -69,14 +69,9 @@ public class AlleleFrequencyCalculationResult { return log10MAP; } - public double getLog10PosteriorMatrixSum() { + public double getLog10PosteriorsMatrixSumWithoutAFzero() { if ( log10PosteriorMatrixSum == null ) { - // TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory; - // TODO -- will discuss with the team what the best option is - final double[] tmp = new double[log10PosteriorMatrixValues.size()]; - for ( int i = 0; i < tmp.length; i++ ) - tmp[i] = log10PosteriorMatrixValues.get(i); - log10PosteriorMatrixSum = MathUtils.log10sumLog10(tmp); + log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); } return log10PosteriorMatrixSum; } @@ -103,7 +98,7 @@ public class AlleleFrequencyCalculationResult { alleleCountsOfMLE[i] = 0; alleleCountsOfMAP[i] = 0; } - log10PosteriorMatrixValues.clear(); + currentPosteriorsCacheIndex = 0; log10PosteriorMatrixSum = null; } @@ -116,7 +111,8 @@ public class AlleleFrequencyCalculationResult { } public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { - log10PosteriorMatrixValues.add(log10LofK); + addToPosteriorsCache(log10LofK); + if ( log10LofK > log10MAP ) { log10MAP = log10LofK; for ( int i = 0; i < alleleCountsForK.length; i++ ) @@ -124,6 +120,18 @@ public class 
AlleleFrequencyCalculationResult { } } + private void addToPosteriorsCache(final double log10LofK) { + // add to the cache + log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK; + + // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell + if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) { + final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); + log10PosteriorMatrixValues[0] = temporarySum; + currentPosteriorsCacheIndex = 1; + } + } + public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; if ( log10LikelihoodOfAFzero > log10MLE ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java index 5f374e597..5d6cf9f7d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java @@ -72,6 +72,8 @@ import static java.lang.Math.pow; */ public class DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering implements Cloneable { + public final static double DEFAULT_PCR_ERROR_RATE = 1e-4; + protected final static int FIXED_PLOIDY = 2; protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; protected final static double ploidyAdjustment = log10(FIXED_PLOIDY); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 58b8af493..9f606cdfb 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -45,7 +45,7 @@ public class UnifiedArgumentCollection { * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 */ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) - public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; + public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily @@ -53,7 +53,7 @@ public class UnifiedArgumentCollection { * effectively acts as a cap on the base qualities. */ @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) - public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; + public Double PCR_error = DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.DEFAULT_PCR_ERROR_RATE; /** * Specifies how to determine the alternate allele to use for genotyping diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index e3d0efaa1..79ec08558 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import 
org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.PrintStream; import java.util.*; @@ -126,8 +127,19 @@ public class UnifiedGenotyper extends LocusWalker , Unif @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); public RodBinding
getDbsnpRodBinding() { return dbsnp.dbsnp; } + + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). + */ + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List > comps = Collections.emptyList(); + public List > getCompRodBindings() { return comps; } + + // The following are not used by the Unified Genotyper public RodBinding getSnpEffRodBinding() { return null; } - public List > getCompRodBindings() { return Collections.emptyList(); } public List > getResourceRodBindings() { return Collections.emptyList(); } public boolean alwaysAppendDbsnpId() { return false; } @@ -216,7 +228,7 @@ public class UnifiedGenotyper extends LocusWalker , Unif verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP"); annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, UnifiedGenotyperEngine.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, VariantContextUtils.DEFAULT_PLOIDY); // initialize the header Set
headerInfo = getHeaderInfo(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index d4206e8ef..f26dfe22e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -50,8 +50,9 @@ import java.util.*; public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - - public static final int DEFAULT_PLOIDY = 2; + + public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; + public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; public enum OUTPUT_MODE { /** produces calls only at variant sites */ @@ -111,7 +112,7 @@ public class UnifiedGenotyperEngine { // --------------------------------------------------------------------------------------------------------- @Requires({"toolkit != null", "UAC != null"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), DEFAULT_PLOIDY*(SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size())); + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY*(SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size())); } @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) @@ -326,7 +327,7 @@ public class UnifiedGenotyperEngine { } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { - final double sum = AFresult.getLog10PosteriorMatrixSum(); + final double sum = 
AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } } @@ -369,7 +370,7 @@ public class UnifiedGenotyperEngine { // the overall lod //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum(); + double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List alternateAllelesToUse = builder.make().getAlternateAlleles(); @@ -380,7 +381,7 @@ public class UnifiedGenotyperEngine { afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); - double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum(); + double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod @@ -389,7 +390,7 @@ public class UnifiedGenotyperEngine { afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); - double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum(); + double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -424,7 +425,7 @@ public class UnifiedGenotyperEngine { public static double[] generateNormalizedPosteriors(final 
AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); - normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum(); + normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); return MathUtils.normalizeFromLog10(normalizedPosteriors); } @@ -622,8 +623,6 @@ public class UnifiedGenotyperEngine { } - public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; - public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) { if( model.name().contains("SNP") ) return HUMAN_SNP_HETEROZYGOSITY; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java index 786b7296b..c22f82969 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -207,7 +207,9 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { break; default: - throw new UserException.BadInput("Unexpected variant context type: " + eval); + // TODO - MIXED, SYMBOLIC, and MNP records are skipped over + //throw new UserException.BadInput("Unexpected variant context type: " + eval); + break; } return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 50fafa202..43aa273c6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import 
net.sf.picard.PicardException; import net.sf.picard.liftover.LiftOver; import net.sf.picard.util.Interval; import net.sf.samtools.SAMFileHeader; @@ -73,7 +72,7 @@ public class LiftoverVariants extends RodWalker { public void initialize() { try { liftOver = new LiftOver(CHAIN); - } catch (PicardException e) { + } catch (RuntimeException e) { throw new UserException.BadInput("there is a problem with the chain file you are using: " + e.getMessage()); } @@ -82,7 +81,7 @@ public class LiftoverVariants extends RodWalker { try { final SAMFileHeader toHeader = new SAMFileReader(NEW_SEQ_DICT).getFileHeader(); liftOver.validateToSequences(toHeader.getSequenceDictionary()); - } catch (PicardException e) { + } catch (RuntimeException e) { throw new UserException.BadInput("the chain file you are using is not compatible with the reference you are trying to lift over to; please use the appropriate chain file for the given reference"); } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index c4b0165ca..5e3160452 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -237,7 +237,7 @@ public class MathUtils { public static double log10sumLog10(double[] log10p, int start, int finish) { double sum = 0.0; - double maxValue = Utils.findMaxEntry(log10p); + double maxValue = arrayMax(log10p, finish); if(maxValue == Double.NEGATIVE_INFINITY) return sum; @@ -554,7 +554,7 @@ public class MathUtils { // for precision purposes, we need to add (or really subtract, since they're // all negative) the largest value; also, we need to convert to normal-space. 
- double maxValue = Utils.findMaxEntry(array); + double maxValue = arrayMax(array); // we may decide to just normalize in log space without converting to linear space if (keepInLogSpace) { @@ -627,10 +627,14 @@ public class MathUtils { return maxI; } - public static double arrayMax(double[] array) { + public static double arrayMax(final double[] array) { return array[maxElementIndex(array)]; } + public static double arrayMax(final double[] array, final int endIndex) { + return array[maxElementIndex(array, endIndex)]; + } + public static double arrayMin(double[] array) { return array[minElementIndex(array)]; } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 4817966fe..c2c608903 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -290,32 +290,6 @@ public class Utils { return m; } - - // returns the maximum value in the array - public static double findMaxEntry(double[] array) { - return findIndexAndMaxEntry(array).first; - } - - // returns the index of the maximum value in the array - public static int findIndexOfMaxEntry(double[] array) { - return findIndexAndMaxEntry(array).second; - } - - // returns the the maximum value and its index in the array - private static Pair findIndexAndMaxEntry(double[] array) { - if ( array.length == 0 ) - return new Pair (0.0, -1); - int index = 0; - double max = array[0]; - for (int i = 1; i < array.length; i++) { - if ( array[i] > max ) { - max = array[i]; - index = i; - } - } - return new Pair (max, index); - } - /** * Splits expressions in command args by spaces and returns the array of expressions. * Expressions may use single or double quotes to group any individual expression, but not both. 
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 7aa0b2605..a6b2bbb21 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -278,6 +278,10 @@ public class GenotypeLikelihoods { public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) { + // fast, closed form solution for diploid samples (most common use case) + if (ploidy==2) + return numAlleles*(numAlleles+1)/2; + if (numAlleles == 1) return 1; else if (ploidy == 1) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index c220a597b..cbaf705b4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -30,7 +30,6 @@ import org.apache.commons.jexl2.JexlEngine; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -48,6 +47,8 @@ public class VariantContextUtils { public final static String MERGE_FILTER_PREFIX = "filterIn"; final public static JexlEngine engine = new JexlEngine(); + public static final int DEFAULT_PLOIDY = 2; + static { engine.setSilent(false); // will throw errors now for selects that don't evaluate properly engine.setLenient(false); @@ -1123,7 +1124,7 @@ public class 
VariantContextUtils { } // calculateNumLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, UnifiedGenotyperEngine.DEFAULT_PLOIDY); + final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, DEFAULT_PLOIDY); for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); // consider this entry only if both of the alleles are good diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 78167e7e9..e7c10a623 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -142,6 +142,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test SLOD", spec); } + @Test + public void testCompTrack() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("71251d8893649ea9abd5d9aa65739ba1")); + executeTest("test using comp track", spec); + } + @Test public void testOutputParameter() { HashMap e = new HashMap (); diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index adc7927a7..5327d4cf2 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -284,6 +284,18 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -2.0}), new double[] {0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211})); } + @Test + public void testLog10sumLog10() { + final double log3 = 0.477121254719662; + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3), 0); + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3), 0); + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3), 0); + + final double log2 = 0.301029995663981; + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2), 0); + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 1), 0.0), 0); + } + @Test public void testDotProduct() { Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0,-3.0,2.0}, new Double[]{6.0,7.0,8.0}),-35.0,1e-3);