From 10e74a71ebb101cd46827861d403bf68f73266fd Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 10 Apr 2012 12:30:35 -0400 Subject: [PATCH 01/51] We now allow arbitrary annotations other than dbSNP (e.g. HM3) to come out of the Unified Genotyper. This was already set up in the Variant Annotator Engine and was just a matter of hooking UG up to it. Added integration test to ensure correct behavior. --- .../gatk/walkers/genotyper/UnifiedGenotyper.java | 13 ++++++++++++- .../genotyper/UnifiedGenotyperIntegrationTest.java | 8 ++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 8df501e1b..79ec08558 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -127,8 +127,19 @@ public class UnifiedGenotyper extends LocusWalker, Unif @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). 
+ */ + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + // The following are not used by the Unified Genotyper public RodBinding getSnpEffRodBinding() { return null; } - public List> getCompRodBindings() { return Collections.emptyList(); } public List> getResourceRodBindings() { return Collections.emptyList(); } public boolean alwaysAppendDbsnpId() { return false; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 78167e7e9..e7c10a623 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -142,6 +142,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test SLOD", spec); } + @Test + public void testCompTrack() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("71251d8893649ea9abd5d9aa65739ba1")); + executeTest("test using comp track", spec); + } + @Test public void testOutputParameter() { HashMap e = new HashMap(); From a4634624b7485a745dd5829619186fd3ea762a45 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 10 Apr 2012 14:48:23 -0400 Subject: [PATCH 02/51] There are now three triggering options in the HaplotypeCaller. 
The default (mismatches, insertions, deletions, high quality soft clips), an external alleles file (from the UG for example), or extended triggers which include low quality soft clips, bad mates and unmapped mates. Added better algorithm for band pass filtering an ActivityProfile and breaking them apart when they get too big. Greatly increased the specificity of the caller by battening down the hatches on things like base quality and mapping quality thresholds for both the assembler and the likelihood function. --- .../traversals/TraverseActiveRegions.java | 3 +- .../gatk/walkers/ActiveRegionExtension.java | 1 + .../gatk/walkers/ActiveRegionWalker.java | 9 +-- .../utils/activeregion/ActiveRegion.java | 7 +- .../utils/activeregion/ActivityProfile.java | 66 ++++++++++++------- .../sting/utils/pileup/PileupElement.java | 2 - ...ntReadsInActiveRegionsIntegrationTest.java | 2 +- 7 files changed, 57 insertions(+), 33 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 22d23f216..76c1ce8c5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -47,6 +47,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension ); + final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); // add active regions to queue of regions to process workQueue.addAll( activeRegions ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java index bb007893c..d27148884 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java @@ -16,4 +16,5 @@ import java.lang.annotation.RetentionPolicy; public @interface ActiveRegionExtension { public int extension() default 0; + public int maxRegion() default 1500; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 8ff4b2f6f..f217268d2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -7,10 +7,7 @@ import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; -import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -33,8 +30,8 @@ import java.util.List; @By(DataSource.READS) @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.READ) -@ActiveRegionExtension(extension=50) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) +@ActiveRegionExtension(extension=50,maxRegion=1500) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, 
FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) public abstract class ActiveRegionWalker extends Walker { @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false) diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 37822dc84..764be2ac7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -15,7 +15,7 @@ import java.util.ArrayList; * Date: 1/4/12 */ -public class ActiveRegion implements HasGenomeLocation { +public class ActiveRegion implements HasGenomeLocation, Comparable { private final ArrayList reads = new ArrayList(); private final GenomeLoc activeRegionLoc; @@ -73,6 +73,11 @@ public class ActiveRegion implements HasGenomeLocation { Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); } + @Override + public int compareTo( final ActiveRegion other ) { + return this.getLocation().compareTo(other.getLocation()); + } + @Override public GenomeLoc getLocation() { return activeRegionLoc; } public GenomeLoc getExtendedLoc() { return extendedLoc; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 1499f639d..6ef5a2af2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -24,8 +24,10 @@ package org.broadinstitute.sting.utils.activeregion; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import 
org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; @@ -45,8 +47,16 @@ public class ActivityProfile { final boolean presetRegions; GenomeLoc regionStartLoc = null; final List isActiveList; - private GenomeLoc lastLoc = null; + private static final int FILTER_SIZE = 65; + private static final Double[] GaussianKernel; + + static { + GaussianKernel = new Double[2*FILTER_SIZE + 1]; + for( int iii = 0; iii < 2*FILTER_SIZE + 1; iii++ ) { + GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 40.0, iii); + } + } // todo -- add upfront the start and stop of the intervals // todo -- check that no regions are unexpectedly missing @@ -85,15 +95,13 @@ public class ActivityProfile { public ActivityProfile bandPassFilter() { final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]); final Double[] filteredProbArray = new Double[activeProbArray.length]; - final int FILTER_SIZE = ( presetRegions ? 
0 : 50 ); // TODO: needs to be set-able by the walker author - for( int iii = 0; iii < activeProbArray.length; iii++ ) { - double maxVal = 0; - for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) { - if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } + if( !presetRegions ) { + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + final Double[] kernel = (Double[]) ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); + final Double[] activeProbSubArray = (Double[]) ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); + filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); } - filteredProbArray[iii] = maxVal; } - return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc); } @@ -102,9 +110,9 @@ public class ActivityProfile { * @param activeRegionExtension * @return */ - public List createActiveRegions( final int activeRegionExtension ) { - final int MAX_ACTIVE_REGION = ( presetRegions ? 
16001 : 425 ); // TODO: needs to be set-able by the walker author - final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author + public List createActiveRegions( final int activeRegionExtension, final int maxRegionSize ) { + final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author + final ArrayList returnList = new ArrayList(); if( isActiveList.size() == 0 ) { // no elements in the active list, just return an empty one @@ -112,25 +120,22 @@ public class ActivityProfile { } else if( isActiveList.size() == 1 ) { // there's a single element, it's either active or inactive boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; - final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension ); - return Collections.singletonList(region); + returnList.addAll(createActiveRegion(isActive, 0, 0, activeRegionExtension, maxRegionSize)); } else { // there are 2+ elements, divide these up into regions - final ArrayList returnList = new ArrayList(); boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; int curStart = 0; for(int iii = 1; iii < isActiveList.size(); iii++ ) { final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD; - if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { - returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) ); + if( isActive != thisStatus ) { + returnList.addAll(createActiveRegion(isActive, curStart, iii - 1, activeRegionExtension, maxRegionSize)); isActive = thisStatus; curStart = iii; } } - returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); // close out the current active region - - return returnList; + returnList.addAll(createActiveRegion(isActive, curStart, isActiveList.size() - 1, activeRegionExtension, maxRegionSize)); // close out the current active region } + return returnList; } /** @@ -141,8 +146,25 @@ public 
class ActivityProfile { * @param activeRegionExtension * @return a fully initialized ActiveRegion with the above properties */ - private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) { - final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); - return new ActiveRegion( loc, isActive, parser, activeRegionExtension ); + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { + return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); + } + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { + if( !isActive || curEnd - curStart < maxRegionSize ) { + final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); + returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension)); + return returnList; + } + // find the best place to break up the large active region + Double minProb = Double.MAX_VALUE; + int cutPoint = -1; + for( int iii = curStart + 45; iii < curEnd - 45; iii++ ) { // BUGBUG: assumes maxRegionSize >> 45 + if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; } + } + final List leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList()); + final List rightList = createActiveRegion(isActive, cutPoint, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); + returnList.addAll( leftList ); + returnList.addAll( rightList ); + return returnList; } } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 771721169..81ba00888 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -32,8 +32,6 @@ public class PileupElement implements Comparable { protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases - - /** * Creates a new pileup element. * diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java index 44cf87b45..7d1fc637b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java @@ -38,7 +38,7 @@ public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s", 1, - Arrays.asList("fcd581aa6befe85c7297509fa7b34edf")); + Arrays.asList("1e9e8d637d2acde23fa99fe9dc07e3e2")); executeTest("CountReadsInActiveRegions:", spec); } } \ No newline at end of file From 1df0adf86237e238ec352c8e53d8c2ddfe1fa441 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 10 Apr 2012 15:28:27 -0400 Subject: [PATCH 03/51] Fixing ActivityProfile unit test. 
--- .../sting/utils/activeregion/ActivityProfileUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index 7d478d063..282f19d8a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -130,7 +130,7 @@ public class ActivityProfileUnitTest extends BaseTest { Assert.assertEquals(profile.size(), cfg.probs.size()); Assert.assertEquals(profile.isActiveList, cfg.probs); - assertRegionsAreEqual(profile.createActiveRegions(0), cfg.expectedRegions); + assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions); } private void assertRegionsAreEqual(List actual, List expected) { From cd842b650e07c860a12c28938f4188b120fea6ef Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 10 Apr 2012 09:46:29 -0400 Subject: [PATCH 04/51] Optimizing DiagnoseTargets * Fixed output format to get a valid vcf * Optimized the per sample pileup routine O(n^2) => O(n) pileup for samples * Added support for overlapping intervals * Removed expand target functionality (for now) * Removed total depth (pointless metric) --- .../diagnostics/targets/DiagnoseTargets.java | 198 ++++++++---------- .../targets/IntervalStatistics.java | 25 ++- .../pileup/AbstractReadBackedPileup.java | 36 +++- .../sting/utils/pileup/ReadBackedPileup.java | 11 + 4 files changed, 153 insertions(+), 117 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index b6a40f167..d73b22664 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import net.sf.picard.util.PeekableIterator; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -32,8 +33,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocComparator; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -79,10 +78,7 @@ public class DiagnoseTargets extends LocusWalker implements Annotato private IntervalBinding intervalTrack = null; @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter vcfWriter = null; - - @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false) - private int expandInterval = 50; + private VCFWriter vcfWriter = null; @Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false) private int minimumBaseQuality = 20; @@ -96,13 +92,11 @@ public class DiagnoseTargets extends LocusWalker implements Annotato @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false) private int maximumCoverage = 700; - private TreeSet intervalList = null; // The list of intervals of interest (plus expanded intervals if user wants them) private HashMap intervalMap = null; // interval => statistics - private Iterator intervalListIterator; // An iterator to go over all the intervals provided as we traverse the genome - private 
GenomeLoc currentInterval = null; // The "current" interval loaded - private IntervalStatistics currentIntervalStatistics = null; // The "current" interval being filled with statistics - private Set samples = null; // All the samples being processed - private GenomeLocParser parser; // just an object to allow us to create genome locs (for the expanded intervals) + private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome + private Set samples = null; // all the samples being processed + + private final Allele SYMBOLIC_ALLELE = Allele.create("
", false); // avoid creating the symbolic allele multiple times @Override public void initialize() { @@ -111,72 +105,22 @@ public class DiagnoseTargets extends LocusWalker implements Annotato if (intervalTrack == null) throw new UserException("This tool currently only works if you provide an interval track"); - parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below - - List originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided - intervalList = new TreeSet(new GenomeLocComparator()); intervalMap = new HashMap(); - for (GenomeLoc interval : originalList) - intervalList.add(interval); - //addAndExpandIntervalToMap(interval); + intervalListIterator = new PeekableIterator(intervalTrack.getIntervals(getToolkit()).listIterator()); - intervalListIterator = intervalList.iterator(); - - // get all of the unique sample names - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - // initialize the header - Set headerInfo = getHeaderInfo(); - - vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); - } - - /** - * Gets the header lines for the VCF writer - * - * @return A set of VCF header lines - */ - private Set getHeaderInfo() { - Set headerLines = new HashSet(); - - // INFO fields for overall data - headerLines.add(new VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - headerLines.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools")); - headerLines.add(new VCFInfoHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. 
Sum of the depth in a lci divided by interval size.")); - headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); - - // FORMAT fields for each genotype - headerLines.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools")); - headerLines.add(new VCFFormatHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); - - // FILTER fields - - for (CallableStatus stat : CallableStatus.values()) { - headerLines.add(new VCFHeaderLine(stat.name(), stat.description)); - } - - return headerLines; + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header + vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header } @Override public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); - while (currentInterval == null || currentInterval.isBefore(refLocus)) { // do this for first time and while currentInterval is behind current locus - if (!intervalListIterator.hasNext()) - return 0L; - if (currentInterval != null) - processIntervalStats(currentInterval, Allele.create(ref.getBase(), true)); + removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore + addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus - currentInterval = intervalListIterator.next(); - addAndExpandIntervalToMap(currentInterval); - currentIntervalStatistics = intervalMap.get(currentInterval); - } - - if (currentInterval.isPast(refLocus)) // skip if we are behind the current interval - return 0L; - - currentIntervalStatistics.addLocus(context); // Add current locus to stats + for (IntervalStatistics 
intervalStatistics : intervalMap.values()) + intervalStatistics.addLocus(context); // Add current locus to stats return 1L; } @@ -198,10 +142,15 @@ public class DiagnoseTargets extends LocusWalker implements Annotato return sum + value; } + /** + * Process all remaining intervals + * + * @param result number of loci processed by the walker + */ @Override public void onTraversalDone(Long result) { - for (GenomeLoc interval : intervalMap.keySet()) - processIntervalStats(interval, Allele.create("
", true)); + for (GenomeLoc interval : intervalMap.keySet()) + processIntervalStats(intervalMap.get(interval), Allele.create("A")); } @Override @@ -219,82 +168,111 @@ public class DiagnoseTargets extends LocusWalker implements Annotato @Override public boolean alwaysAppendDbsnpId() {return false;} - private GenomeLoc createIntervalBefore(GenomeLoc interval) { - int start = Math.max(interval.getStart() - expandInterval, 0); - int stop = Math.max(interval.getStart() - 1, 0); - return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); - } + /** + * Removes all intervals that are behind the current reference locus from the intervalMap + * + * @param refLocus the current reference locus + * @param refBase the reference allele + */ + private void removePastIntervals(GenomeLoc refLocus, byte refBase) { + List toRemove = new LinkedList(); + for (GenomeLoc interval : intervalMap.keySet()) + if (interval.isBefore(refLocus)) { + processIntervalStats(intervalMap.get(interval), Allele.create(refBase, true)); + toRemove.add(interval); + } - private GenomeLoc createIntervalAfter(GenomeLoc interval) { - int contigLimit = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(interval.getContigIndex()).getSequenceLength(); - int start = Math.min(interval.getStop() + 1, contigLimit); - int stop = Math.min(interval.getStop() + expandInterval, contigLimit); - return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); + for (GenomeLoc interval : toRemove) + intervalMap.remove(interval); + + GenomeLoc interval = intervalListIterator.peek(); // clean up all intervals that we might have skipped because there was no data + while(interval != null && interval.isBefore(refLocus)) { + interval = intervalListIterator.next(); + processIntervalStats(createIntervalStatistic(interval), Allele.create(refBase, true)); + interval = intervalListIterator.peek(); + } } /** - * Takes an interval and commits it to memory. 
- * It will expand it if so told by the -exp command line argument + * Adds all intervals that overlap the current reference locus to the intervalMap * - * @param interval The new interval to process + * @param refLocus the current reference locus */ - private void addAndExpandIntervalToMap(GenomeLoc interval) { - if (expandInterval > 0) { - GenomeLoc before = createIntervalBefore(interval); - GenomeLoc after = createIntervalAfter(interval); - intervalList.add(before); - intervalList.add(after); - intervalMap.put(before, new IntervalStatistics(samples, before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); - intervalMap.put(after, new IntervalStatistics(samples, after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + private void addNewOverlappingIntervals(GenomeLoc refLocus) { + GenomeLoc interval = intervalListIterator.peek(); + while (interval != null && !interval.isPast(refLocus)) { + System.out.println("LOCUS : " + refLocus + " -- " + interval); + intervalMap.put(interval, createIntervalStatistic(interval)); + intervalListIterator.next(); // discard the interval (we've already added it to the map) + interval = intervalListIterator.peek(); } - if (!intervalList.contains(interval)) - intervalList.add(interval); - intervalMap.put(interval, new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); } /** * Takes the interval, finds it in the stash, prints it to the VCF, and removes it * - * @param interval The interval in memory that you want to write out and clear - * @param allele the allele + * @param stats The statistics of the interval + * @param refAllele the reference allele */ - private void processIntervalStats(GenomeLoc interval, Allele allele) { - IntervalStatistics stats = intervalMap.get(interval); - + private void processIntervalStats(IntervalStatistics stats, Allele refAllele) { + GenomeLoc interval = stats.getInterval(); + 
List alleles = new ArrayList(); Map attributes = new HashMap(); ArrayList genotypes = new ArrayList(); - alleles.add(allele); - VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); + alleles.add(refAllele); + alleles.add(SYMBOLIC_ALLELE); + VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF vcb.filters(statusesToStrings(stats.callableStatuses())); attributes.put(VCFConstants.END_KEY, interval.getStop()); - attributes.put(VCFConstants.DEPTH_KEY, stats.totalCoverage()); - attributes.put("AV", stats.averageCoverage()); + attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); vcb = vcb.attributes(attributes); for (String sample : samples) { Map infos = new HashMap(); - infos.put("DP", stats.getSample(sample).totalCoverage()); - infos.put("AV", stats.getSample(sample).averageCoverage()); + infos.put(VCFConstants.DEPTH_KEY, stats.getSample(sample).averageCoverage()); Set filters = new HashSet(); filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses())); - genotypes.add(new Genotype(sample, alleles, VariantContext.NO_LOG10_PERROR, filters, infos, false)); + genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false)); } vcb = vcb.genotypes(genotypes); vcfWriter.add(vcb.make()); - intervalMap.remove(interval); } + /** + * Gets the header lines for the VCF writer + * + * @return A set of VCF header lines + */ + private static Set getHeaderInfo() { + Set headerLines = new HashSet(); + + // INFO fields for overall data + headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, 
VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); + + // FORMAT fields for each genotype + headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + + // FILTER fields + for (CallableStatus stat : CallableStatus.values()) + headerLines.add(new VCFHeaderLine(stat.name(), stat.description)); + + return headerLines; + } + + private static Set statusesToStrings(Set statuses) { Set output = new HashSet(statuses.size()); @@ -303,4 +281,8 @@ public class DiagnoseTargets extends LocusWalker implements Annotato return output; } + + private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { + return new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 75f56808f..f3246407b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.HashMap; @@ -52,18 +53,28 @@ public class IntervalStatistics { return samples.get(sample); } + public GenomeLoc getInterval() { + return interval; + 
} + public void addLocus(AlignmentContext context) { ReadBackedPileup pileup = context.getBasePileup(); - for (String sample : samples.keySet()) - getSample(sample).addLocus(context.getLocation(), pileup.getPileupForSample(sample)); + Map samplePileups = pileup.getPileupsForSamples(samples.keySet()); + + for (Map.Entry entry : samplePileups.entrySet()) { + String sample = entry.getKey(); + ReadBackedPileup samplePileup = entry.getValue(); + SampleStatistics sampleStatistics = samples.get(sample); + + if (sampleStatistics == null) + throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); + + sampleStatistics.addLocus(context.getLocation(), samplePileup); + } + } - public long totalCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return preComputedTotalCoverage; - } public double averageCoverage() { if (preComputedTotalCoverage < 0) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index ea6901bb3..e3107c195 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -677,11 +677,11 @@ public abstract class AbstractReadBackedPileup filteredElements = tracker.getElements(sampleNames); return filteredElements != null ? 
(RBP) createNewPileup(loc, filteredElements) : null; } else { - HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop + HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. + if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) filteredTracker.add(p); } else { @@ -693,6 +693,38 @@ public abstract class AbstractReadBackedPileup getPileupsForSamples(Collection sampleNames) { + Map result = new HashMap(); + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (String sample : sampleNames) { + PileupElementTracker filteredElements = tracker.getElements(sampleNames); + if (filteredElements != null) + result.put(sample, createNewPileup(loc, filteredElements)); + } + } else { + Map> trackerMap = new HashMap>(); + + for (String sample : sampleNames) { // initialize pileups for each sample + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + trackerMap.put(sample, filteredTracker); + } + for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup + GATKSAMRecord read = p.getRead(); + if (read.getReadGroup() != null) { + String sample = read.getReadGroup().getSample(); + UnifiedPileupElementTracker tracker = trackerMap.get(sample); + if (tracker != null) // we only add the pileup the requested samples. 
Completely ignore the rest + tracker.add(p); + } + } + for (Map.Entry> entry : trackerMap.entrySet()) // create the RBP for each sample + result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); + } + return result; + } + @Override public RBP getPileupForSample(String sampleName) { diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 110199f06..f15468840 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Map; /** * A data retrieval interface for accessing parts of the pileup. @@ -159,6 +160,16 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForSamples(Collection sampleNames); + /** + * Gets the particular subset of this pileup for each given sample name. + * + * Same as calling getPileupForSample for all samples, but in O(n) instead of O(n^2). + * + * @param sampleNames Name of the sample to use. + * @return A subset of this pileup containing only reads with the given sample. + */ + public Map getPileupsForSamples(Collection sampleNames); + /** * Gets the particular subset of this pileup with the given sample name. From f46f7d059048ef820b38508d420777bc7674ad22 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 10 Apr 2012 22:26:10 -0400 Subject: [PATCH 05/51] Fix the stats coming out of FlagStat. 
I will add an integration test in unstable --- .../sting/gatk/walkers/FlagStatWalker.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java index ab1e452d7..0777037bf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStatWalker.java @@ -127,7 +127,7 @@ public class FlagStatWalker extends ReadWalker { if (read.getDuplicateReadFlag()) { myStat.duplicates++; } - if (read.getReferenceIndex() >= 0) { + if (!read.getReadUnmappedFlag()) { myStat.mapped++; } if (read.getReadPairedFlag()) { @@ -139,21 +139,21 @@ public class FlagStatWalker extends ReadWalker { myStat.read1++; } if (read.getProperPairFlag()) { - myStat.properly_paired++; } - if (!read.getMateUnmappedFlag() && read.getReferenceIndex() >= 0) { + if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) { myStat.with_itself_and_mate_mapped++; - } - if (read.getMateUnmappedFlag()) { - myStat.singletons++; - } - } - if (read.getReferenceIndex() >= 0 && read.getMateReferenceIndex() >= 0 && ! 
read.getReferenceIndex().equals(read.getMateReferenceIndex())) { - myStat.with_mate_mapped_to_a_different_chr++; - if (read.getMappingQuality() >= 5) { - myStat.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++; + if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) { + myStat.with_mate_mapped_to_a_different_chr++; + + if (read.getMappingQuality() >= 5) { + myStat.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++; + } + } + } + if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) { + myStat.singletons++; } } return 1; From d2142c3aa7656a639d075b78d5235efd55e04a08 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 10 Apr 2012 22:40:38 -0400 Subject: [PATCH 06/51] Adding integration test for Flag Stat --- .../gatk/walkers/FlagStatIntegrationTest.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100755 public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java new file mode 100755 index 000000000..d2acaa588 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java @@ -0,0 +1,20 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class FlagStatIntegrationTest extends WalkerTest { + + @Test + public void testFlagStat() { + String md5 = "9c4039662f24bfd23ccf67973cb5df29"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T FlagStat -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", + 1, + Arrays.asList(md5)); + executeTest("test flag stat", spec); + } +} From dc90508104ca2c33319b1513be8724fe9c99d2a0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: 
Wed, 11 Apr 2012 13:47:10 -0400 Subject: [PATCH 07/51] Adding a new annotation to UG calls: NDA = number of discovered (but not necessarily genotyped) alleles for the site. This could help downstream analysis esp. of indels for wonky sites (since we only use the top 2-3 alleles). Not enabled by default but we can change that if this turns out to be useful. --- .../genotyper/UnifiedArgumentCollection.java | 12 ++++++++-- .../walkers/genotyper/UnifiedGenotyper.java | 2 ++ .../genotyper/UnifiedGenotyperEngine.java | 5 ++++ .../UnifiedGenotyperIntegrationTest.java | 23 +++++++++++-------- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 9f606cdfb..1a7900a6c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -82,15 +82,22 @@ public class UnifiedArgumentCollection { public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; /** - * This argument is not enabled by default because it increases the runtime by an appreciable amount. + * Note that calculating the SLOD increases the runtime by an appreciable amount. */ @Argument(fullName = "noSLOD", shortName = "nosl", doc = "If provided, we will not calculate the SLOD", required = false) public boolean NO_SLOD = false; + /** + * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping. + * Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site. 
+ */ + @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) + public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding */ - @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when in GENOTYPE_MODE = GENOTYPE_GIVEN_ALLELES", required=false) + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) public RodBinding alleles; /** @@ -171,6 +178,7 @@ public class UnifiedArgumentCollection { uac.GenotypingMode = GenotypingMode; uac.OutputMode = OutputMode; uac.NO_SLOD = NO_SLOD; + uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 79ec08558..45d509cf8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -249,6 +249,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif // annotation (INFO) fields from UnifiedGenotyper if ( !UAC.NO_SLOD ) headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); + if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) + headerInfo.add(new 
VCFInfoHeaderLine(UnifiedGenotyperEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site")); headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); // also, check to see whether comp rods were included diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index f26dfe22e..94d340926 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -51,6 +51,8 @@ import java.util.*; public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; + public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; @@ -365,6 +367,9 @@ public class UnifiedGenotyperEngine { if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); + if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) + attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); + if ( !UAC.NO_SLOD && !limitedContext && !bestGuessIsRef ) { //final boolean DEBUG_SLOD = false; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index e7c10a623..5095940a3 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -122,16 +122,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test - public void testCallingParameters() { - HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "258c1b33349eb3b2d395ec4d69302725" ); - - for ( Map.Entry entry : e.entrySet() ) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 " + entry.getKey(), 1, - Arrays.asList(entry.getValue())); - executeTest(String.format("test calling parameter[%s]", entry.getKey()), spec); - } + public void testMinBaseQualityScore() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, + Arrays.asList("258c1b33349eb3b2d395ec4d69302725")); + executeTest("test min_base_quality_score 26", spec); } @Test @@ -142,6 +137,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test SLOD", spec); } + @Test + public void testNDA() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("")); + executeTest("test NDA", spec); + } + @Test public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( From 7aa654d13f1cf734bb9faa7c9d5053327b61cdb0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 11 Apr 2012 13:49:09 -0400 Subject: [PATCH 08/51] New interface for some dev work that Ryan and I are doing; only accessible from private walkers right now 
--- .../ActiveRegionBasedAnnotation.java | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java new file mode 100755 index 000000000..2c1bb0974 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java @@ -0,0 +1,18 @@ +package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; + +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.List; +import java.util.Map; + +// TODO -- make this an abstract class when we move away from InfoFieldAnnotation +public interface ActiveRegionBasedAnnotation { + // return annotations for the given contexts split by sample and then allele + public abstract Map annotate(final Map>> stratifiedContexts, final VariantContext vc); + + // return the descriptions used for the VCF INFO meta field + public abstract List getDescriptions(); +} \ No newline at end of file From 5b7da3831f5f7c79b7f8e62bd0b0cf8964328a62 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 11 Apr 2012 13:49:50 -0400 Subject: [PATCH 09/51] Not sure why this didn't make it into the last push, but here's a working MD5 for the NDA annotation in UG --- .../gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 5095940a3..015f11048 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -141,7 +141,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("")); + Arrays.asList("443b2f8882393c4c65277c34cdb6060c")); executeTest("test NDA", spec); } From 5bf9dd2def3e5aa166468e79beb3e236a94fff7c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 11 Apr 2012 14:43:40 -0400 Subject: [PATCH 10/51] A framework to get annotations working in the HaplotypeCaller (and ART walkers in general). Adding support for active-region-based annotation for most standard annotations. I need to discuss with Ryan what to do about tests that require offsets into the reads (since I don't have access to the offsets) like e.g. the ReadPosRankSumTest. IMPORTANT NOTE: this is still very much a dev effort and can only be accessed through private walkers (i.e. the HaplotypeCaller). The interface is in flux and so we are making no attempt at all to make it clean or to merge this with the Locus-Traversal-based annotation system. When we are satisfied that it's working properly and have settled on the proper interface, we will clean it up then. 
--- .../annotator/BaseQualityRankSumTest.java | 31 ++++++++-- .../walkers/annotator/ChromosomeCounts.java | 12 +++- .../walkers/annotator/DepthOfCoverage.java | 21 ++++++- .../gatk/walkers/annotator/FisherStrand.java | 57 ++++++++++++++++++- .../walkers/annotator/InbreedingCoeff.java | 12 +++- .../annotator/MappingQualityRankSumTest.java | 23 ++++++-- .../gatk/walkers/annotator/QualByDepth.java | 41 ++++++++++++- .../walkers/annotator/RMSMappingQuality.java | 36 +++++++++++- .../gatk/walkers/annotator/RankSumTest.java | 51 +++++++++++++++-- .../walkers/annotator/ReadPosRankSumTest.java | 27 +++++++-- .../annotator/VariantAnnotatorEngine.java | 29 ++++++++-- .../ActiveRegionBasedAnnotation.java | 2 +- 12 files changed, 309 insertions(+), 33 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 97a4ac468..6eea12e2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -5,12 +5,10 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** @@ -31,8 +29,31 @@ public class BaseQualityRankSumTest extends RankSumTest { altQuals.add((double)p.getQual()); } } - } + protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + // TODO -- 
implement me; how do we pull out the correct offset from the read? + return; + +/* + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + + if ( isUsableBase(p) ) { + if ( matchesRef ) + refQuals.add((double)p.getQual()); + else + altQuals.add((double)p.getQual()); + } + } + } +*/ + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 0acd3e841..b3a8dbebd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -35,6 +36,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import 
org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -49,7 +52,7 @@ import java.util.Map; * allele Frequency, for each ALT allele, in the same order as listed; total number * of alleles in called genotypes. */ -public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation { +public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), @@ -63,6 +66,13 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( ! 
vc.hasGenotypes() ) + return null; + + return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); + } + public List getKeyNames() { return Arrays.asList(keyNames); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index b744fec46..f94d48893 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -33,7 +36,7 @@ import java.util.Map; * Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with * -dcov D is N * D */ -public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation { +public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, 
ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -47,6 +50,22 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + int depth = 0; + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final List alleleBin : alleleBins.values() ) { + depth += alleleBin.size(); + } + } + + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%d", depth)); + return map; + } + public List getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 817d6b1ff..0d3bd11a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -28,6 +28,7 @@ import cern.jet.math.Arithmetic; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -37,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -49,7 +51,7 @@ import java.util.*; * indicative of false positive calls. Note that the fisher strand test may not be * calculated for certain complex indel cases or for multi-allelic sites. */ -public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation { +public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; @@ -78,6 +80,22 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( !vc.isVariant() ) + return null; + + int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + + Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); + if ( pvalue == null ) + return null; + + Map map = new HashMap(); + map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); + return map; + + } + public List getKeyNames() { return Arrays.asList(FS); } @@ -193,6 +211,38 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return sum; } + /** + Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + private static int[][] getContingencyTable(Map>> stratifiedContexts, Allele ref, Allele alt) { + int[][] table = new int[2][2]; + + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alt.equals(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + boolean isFW = read.getReadNegativeStrandFlag(); + + int row = matchesRef ? 0 : 1; + int column = isFW ? 0 : 1; + + table[row][column]++; + } + } + } + + return table; + } + /** Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: * fw rc @@ -214,8 +264,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat Allele base = Allele.create(p.getBase(), false); boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - boolean matchesRef = ref.equals(base, true); - boolean matchesAlt = alt.equals(base, true); + final boolean matchesRef = ref.equals(base, true); + final boolean matchesAlt = alt.equals(base, true); if ( matchesRef || matchesAlt ) { int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; @@ -227,6 +277,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return table; } + /** Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: * fw rc diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 6366890d5..57561a277 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -27,12 +30,19 @@ import java.util.Map; * more information. Note that the Inbreeding Coefficient will not be calculated for files * with fewer than a minimum (generally 10) number of samples. 
*/ -public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation { +public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final int MIN_SAMPLES = 10; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + return calculateIC(vc); + } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + return calculateIC(vc); + } + + private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index aa4f26ef3..520b0f232 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -6,12 +6,10 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** @@ -35,6 +33,23 @@ public class MappingQualityRankSumTest extends RankSumTest { } } } + + protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean 
matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + if ( matchesRef ) + refQuals.add((double)read.getMappingQuality()); + else + altQuals.add((double)read.getMappingQuality()); + } + } + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index bf60dec6b..24a107235 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -3,11 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import 
org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -23,7 +26,7 @@ import java.util.Map; * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing * reads associated with the samples with polymorphic genotypes. */ -public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation { +public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -62,4 +65,40 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + final GenotypesContext genotypes = vc.getGenotypes(); + if ( genotypes == null || genotypes.size() == 0 ) + return null; + + int depth = 0; + + for ( final Genotype genotype : genotypes ) { + + // we care only about variant calls with likelihoods + if ( !genotype.isHet() && !genotype.isHomVar() ) + continue; + + final Map> alleleBins = stratifiedContexts.get(genotype.getSampleName()); + if ( alleleBins == null ) + continue; + + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + if ( !alleleBin.getKey().equals(Allele.NO_CALL) ) + depth += alleleBin.getValue().size(); + } + } + + if ( depth == 0 ) + return null; + + double QD = -10.0 * vc.getLog10PError() / (double)depth; + + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", QD)); + return map; + } + } \ 
No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 50ade5334..97c15e747 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -13,6 +14,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -24,7 +27,7 @@ import java.util.Map; /** * Root Mean Square of the mapping quality of the reads across all samples. 
*/ -public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation { +public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -34,7 +37,7 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn for ( AlignmentContext context : stratifiedContexts.values() ) totalSize += context.size(); - int[] qualities = new int[totalSize]; + final int[] qualities = new int[totalSize]; int index = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { @@ -54,6 +57,35 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + int depth = 0; + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + depth += alleleBin.getValue().size(); + } + } + + final int[] qualities = new int[depth]; + int index = 0; + + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final List reads : alleleBins.values() ) { + for ( final GATKSAMRecord read : reads ) { + if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) + qualities[index++] = read.getMappingQuality(); + } + } + } + + double rms = MathUtils.rms(qualities); + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", rms)); + return map; + } + public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index ff5f8f144..80d248ac2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -12,6 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; @@ -26,7 +28,7 @@ import java.util.Map; /** * Abstract root for all RankSum based annotations */ -public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation { +public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { static final double INDEL_LIKELIHOOD_THRESH = 0.1; static final boolean DEBUG = false; @@ -38,7 +40,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements 
Standar if (genotypes == null || genotypes.size() == 0) return null; - final ArrayList refQuals = new ArrayList(); final ArrayList altQuals = new ArrayList(); @@ -104,12 +105,52 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if (!Double.isNaN(testResults.first)) map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); return map; - } - protected abstract void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals); + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if (stratifiedContexts.size() == 0) + return null; - protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals); + final GenotypesContext genotypes = vc.getGenotypes(); + if (genotypes == null || genotypes.size() == 0) + return null; + + final ArrayList refQuals = new ArrayList(); + final ArrayList altQuals = new ArrayList(); + + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + final Map> context = stratifiedContexts.get(genotype.getSampleName()); + if ( context == null ) + continue; + + fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), context, refQuals, altQuals); + } + + if ( refQuals.size() == 0 || altQuals.size() == 0 ) + return null; + + final MannWhitneyU mannWhitneyU = new MannWhitneyU(); + for (final Double qual : altQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); + } + for (final Double qual : refQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); + } + + // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) + final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); + + final Map map = new HashMap(); + if (!Double.isNaN(testResults.first)) + map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); + return map; + } + + protected abstract void fillQualsFromPileup(final Allele ref, final List alts, final 
Map> stratifiedContext, final List refQuals, List altQuals); + + protected abstract void fillQualsFromPileup(final byte ref, final List alts, final ReadBackedPileup pileup, final List refQuals, final List altQuals); + + protected abstract void fillIndelQualsFromPileup(final ReadBackedPileup pileup, final List refQuals, final List altQuals); protected static boolean isUsableBase(final PileupElement p) { return !(p.isInsertionAtBeginningOfRead() || diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index a998cd08b..e013f0e08 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -11,12 +11,10 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error). @@ -49,6 +47,27 @@ public class ReadPosRankSumTest extends RankSumTest { } } + protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + // TODO -- implement me; how do we pull out the correct offset from the read? 
+ return; + +/* + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + if ( matchesRef ) + refQuals.add((double)read.getMappingQuality()); + else + altQuals.add((double)read.getMappingQuality()); + } + } +*/ + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 90d0ad740..413c32a24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -33,10 +33,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -94,6 +92,13 @@ public class VariantAnnotatorEngine { 
initializeDBs(); } + // experimental constructor for active region traversal + public VariantAnnotatorEngine(GenomeAnalysisEngine toolkit) { + this.walker = null; + this.toolkit = toolkit; + requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(Arrays.asList("ActiveRegionBasedAnnotation"), Collections.emptyList()); + } + // select specific expressions to use public void initializeExpressions(List expressionsToUse) { // set up the expressions @@ -169,7 +174,7 @@ public class VariantAnnotatorEngine { this.requireStrictAlleleMatch = requireStrictAlleleMatch; } - public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences @@ -192,6 +197,20 @@ public class VariantAnnotatorEngine { return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); } + public VariantContext annotateContext(final Map>> stratifiedContexts, VariantContext vc) { + Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); + + // go through all the requested info annotationTypes + for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + Map annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc); + if ( annotationsFromCurrentType != null ) + infoAnnotations.putAll(annotationsFromCurrentType); + } + + // generate a new annotated VC + return new VariantContextBuilder(vc).attributes(infoAnnotations).make(); + } + private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java index 2c1bb0974..de61c7741 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java @@ -9,7 +9,7 @@ import java.util.List; import java.util.Map; // TODO -- make this an abstract class when we move away from InfoFieldAnnotation -public interface ActiveRegionBasedAnnotation { +public interface ActiveRegionBasedAnnotation extends AnnotationType { // return annotations for the given contexts split by sample and then allele public abstract Map annotate(final Map>> stratifiedContexts, final VariantContext vc); From cc71baf691f5b4105c5eb56f39a7d168a06021e7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 12 Apr 2012 09:18:44 -0400 Subject: [PATCH 11/51] Don't allow users to try to genotype more than the max possible value (catch and throw a User Error at startup). Better docs explaining that users shouldn't play with this value unless they know what they are doing. 
--- .../gatk/walkers/genotyper/UnifiedArgumentCollection.java | 5 ++++- .../sting/gatk/walkers/genotyper/UnifiedGenotyper.java | 6 ++++++ .../sting/utils/variantcontext/GenotypeLikelihoods.java | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 82e411c25..1af8b8e8e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -105,8 +105,11 @@ public class UnifiedArgumentCollection { /** * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), - * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive. + * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it + * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend + * that you not play around with this parameter. 
*/ + @Advanced @Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 3; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 1106fcb52..9d4dbfe7c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -39,6 +39,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; @@ -207,6 +209,10 @@ public class UnifiedGenotyper extends LocusWalker GenotypeLikelihoods.MAX_ALLELES_THAT_CAN_BE_GENOTYPED ) + throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALLELES_THAT_CAN_BE_GENOTYPED); + // warn the user for misusing EMIT_ALL_SITES if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY && diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 621397fdb..0fd611b04 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ 
-223,7 +223,7 @@ public class GenotypeLikelihoods { /** * The maximum number of alleles that we can represent as genotype likelihoods */ - final static int MAX_ALLELES_THAT_CAN_BE_GENOTYPED = 500; + public final static int MAX_ALLELES_THAT_CAN_BE_GENOTYPED = 500; /* * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles From b659b16b31a747a3134a75d0ec50ec47d09c7d6e Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 12 Apr 2012 09:49:35 -0400 Subject: [PATCH 12/51] Generate User Error for bad POS value --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 3c2ed18e4..65e42c1d3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -204,7 +204,12 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { final List alleles = parseAlleles(ref, alts, lineNo); // find out our location - final int start = Integer.valueOf(locParts[1]); + int start = 0; + try { + start = Integer.valueOf(locParts[1]); + } catch (Exception e) { + generateException("the value in the POS field must be an integer but it was " + locParts[1], lineNo); + } int stop = start; // ref alleles don't need to be single bases for monomorphic sites From f77a6d18b8f45a9a83bf2bb8684025123159899b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 12 Apr 2012 09:56:49 -0400 Subject: [PATCH 13/51] Bad conflict merge before --- .../sting/utils/variantcontext/GenotypeLikelihoods.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java 
b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index a6b2bbb21..63241f621 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -223,7 +223,7 @@ public class GenotypeLikelihoods { /** * The maximum number of alleles that we can represent as genotype likelihoods */ - final static int MAX_ALLELES_THAT_CAN_BE_GENOTYPED = 50; + public final static int MAX_ALLELES_THAT_CAN_BE_GENOTYPED = 50; /* * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles From 0dd571928d15dcbb5c537c0f599d49bfaf787624 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 12 Apr 2012 15:16:29 -0400 Subject: [PATCH 15/51] Let's not have the indel model emit more than the max possible number of genotypable alt alleles (since we may not be able to subset down to the best ones). --- .../IndelGenotypeLikelihoodsCalculationModel.java | 15 +++++++-------- .../gatk/walkers/genotyper/UnifiedGenotyper.java | 4 ++-- .../utils/variantcontext/GenotypeLikelihoods.java | 6 +++--- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 21f11d2ff..e41998cb3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -261,15 +261,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood builder.alleles(vcAlleles); builder.referenceBaseForIndel(ref.getBase()); builder.noGenotypes(); - if (doMultiAllelicCalls) + if (doMultiAllelicCalls) { + 
vcs.add(builder.make()); + if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) + break; + } else if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); vcs.add(builder.make()); - else { - if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - vcs.clear(); - vcs.add(builder.make()); - } - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 9d4dbfe7c..6f8851017 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -210,8 +210,8 @@ public class UnifiedGenotyper extends LocusWalker GenotypeLikelihoods.MAX_ALLELES_THAT_CAN_BE_GENOTYPED ) - throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALLELES_THAT_CAN_BE_GENOTYPED); + if ( UAC.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) + throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); // warn the user for misusing EMIT_ALL_SITES if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES && diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 0fd611b04..cc14585ac 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -223,12 +223,12 @@ public class GenotypeLikelihoods { /** * The maximum number of alleles that we can represent as genotype likelihoods */ - public final static int 
MAX_ALLELES_THAT_CAN_BE_GENOTYPED = 500; + public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 500; /* * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles */ - private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALLELES_THAT_CAN_BE_GENOTYPED); // start with data for 10 alternate alleles + private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); // start with data for 10 alternate alleles private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) { final int numLikelihoods = calculateNumLikelihoods(altAlleles); @@ -274,7 +274,7 @@ public class GenotypeLikelihoods { public static GenotypeLikelihoodsAllelePair getAllelePair(final int PLindex) { // make sure that we've cached enough data if ( PLindex >= PLIndexToAlleleIndex.length ) - throw new ReviewedStingException("GATK limitation: cannot genotype more than " + MAX_ALLELES_THAT_CAN_BE_GENOTYPED + " alleles"); + throw new ReviewedStingException("GATK limitation: cannot genotype more than " + MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + " alleles"); return PLIndexToAlleleIndex[PLindex]; } From 297afc79119cfd6c0def1063b9e005c8b7ab6b70 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 12 Apr 2012 15:43:14 -0400 Subject: [PATCH 16/51] Added unit test to ensure that we genotype correctly cases with really large GLs --- .../genotyper/ExactAFCalculationModelUnitTest.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index 31c7a4e83..964d768c4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -94,4 +94,18 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } + + @Test + public void testLargeGLs() { + + final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0}; + GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB)); + + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); + + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); + + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0]; + Assert.assertEquals(calculatedAlleleCount, 6); + } } From e85e9a8cf5f76c3806116a1e8932339d8e3d9651 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 12 Apr 2012 14:42:40 -0400 Subject: [PATCH 17/51] More extensive testing of type of error thrown in multi-threaded walker test -- Unfortunately the result of the multi-threaded test is non-deterministic so run the test 10x times to see if the right expection is always thrown -- Now prints the stack trace and exception message of the caught exception of the wrong type, if this occurs --- .../org/broadinstitute/sting/WalkerTest.java | 15 +++++++-------- .../gatk/EngineFeaturesIntegrationTest.java | 16 ++++++++++------ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 7f5212ba3..6e8689e82 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -26,18 +26,17 @@ package org.broadinstitute.sting; import org.apache.commons.lang.StringUtils; -import org.broad.tribble.FeatureCodec; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; import 
org.broad.tribble.index.IndexFactory; -import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.testng.Assert; import org.testng.annotations.BeforeMethod; @@ -315,9 +314,9 @@ public class WalkerTest extends BaseTest { // it's the type we expected System.out.println(String.format(" => %s PASSED", name)); } else { - e.printStackTrace(); - Assert.fail(String.format("Test %s expected exception %s but got %s instead", - name, expectedException, e.getClass())); + e.printStackTrace(System.out); // must print to stdout to see the message + Assert.fail(String.format("Test %s expected exception %s but instead got %s with error message %s", + name, expectedException, e.getClass(), e.getMessage())); } } else { // we didn't expect an exception but we got one :-( diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java index 192c86fe3..68bd28d7a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -86,13 +86,15 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { // 
-------------------------------------------------------------------------------- private class EngineErrorHandlingTestProvider extends TestDataProvider { - Class expectedException; - boolean multiThreaded; + final Class expectedException; + final boolean multiThreaded; + final int iterationsToTest; public EngineErrorHandlingTestProvider(Class exceptedException, final boolean multiThreaded) { super(EngineErrorHandlingTestProvider.class); this.expectedException = exceptedException; this.multiThreaded = multiThreaded; + this.iterationsToTest = multiThreaded ? 10 : 1; setName(String.format("Engine error handling: expected %s, is-multithreaded %b", exceptedException, multiThreaded)); } } @@ -113,9 +115,11 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { // @Test(dataProvider = "EngineErrorHandlingTestProvider") public void testEngineErrorHandlingTestProvider(EngineErrorHandlingTestProvider cfg) { - final String root = "-T ErrorThrowing -R " + b37KGReference; - final String args = root + (cfg.multiThreaded ? " -nt 2" : "") + " -E " + cfg.expectedException.getSimpleName(); - WalkerTestSpec spec = new WalkerTestSpec(args, 0, cfg.expectedException); - executeTest(cfg.toString(), spec); + for ( int i = 0; i < cfg.iterationsToTest; i++ ) { + final String root = "-T ErrorThrowing -R " + b37KGReference; + final String args = root + (cfg.multiThreaded ? " -nt 2" : "") + " -E " + cfg.expectedException.getSimpleName(); + WalkerTestSpec spec = new WalkerTestSpec(args, 0, cfg.expectedException); + executeTest(cfg.toString(), spec); + } } } \ No newline at end of file From 27e7e17dc766c3dbbd477a6cf04163eb8a41b4ea Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 13 Apr 2012 09:23:20 -0400 Subject: [PATCH 18/51] New way to handle exceptions in multi-threaded GATK -- HMS no longer tries to grab and throw all exceptions. Exceptions are just thrown directly now. 
-- Proper error handling is handled by functions in HMS, which are used by ShardTraverser and TreeReducer -- Better printing of stack traces in WalkerTest --- .../executive/HierarchicalMicroScheduler.java | 82 +++++++++++-------- .../sting/gatk/executive/ShardTraverser.java | 28 +++---- .../sting/gatk/executive/TreeReducer.java | 33 ++------ .../org/broadinstitute/sting/WalkerTest.java | 3 +- 4 files changed, 67 insertions(+), 79 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 1cea14a9d..b821b98e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -18,10 +18,7 @@ import java.util.Collection; import java.util.Iterator; import java.util.LinkedList; import java.util.Queue; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.FutureTask; +import java.util.concurrent.*; /** * A microscheduler that schedules shards according to a tree-like structure. @@ -44,11 +41,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar private final Queue reduceTasks = new LinkedList(); - /** - * An exception that's occurred in this traversal. If null, no exception has occurred. - */ - private RuntimeException error = null; - /** * Queue of incoming shards. */ @@ -99,11 +91,13 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar ReduceTree reduceTree = new ReduceTree(this); initializeWalker(walker); + // + // exception handling here is a bit complex. We used to catch and rethrow exceptions all over + // the place, but that just didn't work well. 
Now we have a specific execution exception (inner class) + // to use for multi-threading specific exceptions. All RuntimeExceptions that occur within the threads are rethrown + // up the stack as their underlying causes + // while (isShardTraversePending() || isTreeReducePending()) { - // Check for errors during execution. - if(hasTraversalErrorOccurred()) - throw getTraversalError(); - // Too many files sitting around taking up space? Merge them. if (isMergeLimitExceeded()) mergeExistingOutput(false); @@ -130,12 +124,8 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar result = reduceTree.getResult().get(); notifyTraversalDone(walker,result); } - catch (ReviewedStingException ex) { - throw ex; - } - catch (Exception ex) { - throw new ReviewedStingException("Unable to retrieve result", ex); - } + catch( InterruptedException ex ) { handleException(ex); } + catch( ExecutionException ex ) { handleException(ex); } // do final cleanup operations outputTracker.close(); @@ -338,32 +328,41 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar } /** - * Detects whether an execution error has occurred. - * @return True if an error has occurred. False otherwise. + * Handle an exception that occurred in a worker thread as needed by this scheduler. 
+ * + * The way to use this function in a worker is: + * + * try { doSomeWork(); + * catch ( InterruptedException ex ) { hms.handleException(ex); } + * catch ( ExecutionException ex ) { hms.handleException(ex); } + * + * @param ex the exception that occurred in the worker thread */ - private synchronized boolean hasTraversalErrorOccurred() { - return error != null; - } - - private synchronized RuntimeException getTraversalError() { - if(!hasTraversalErrorOccurred()) - throw new ReviewedStingException("User has attempted to retrieve a traversal error when none exists"); - return error; + protected final void handleException(InterruptedException ex) { + throw new HierarchicalMicroScheduler.ExecutionFailure("Hierarchical reduce interrupted", ex); } /** - * Allows other threads to notify of an error during traversal. + * Handle an exception that occurred in a worker thread as needed by this scheduler. + * + * The way to use this function in a worker is: + * + * try { doSomeWork(); + * catch ( InterruptedException ex ) { hms.handleException(ex); } + * catch ( ExecutionException ex ) { hms.handleException(ex); } + * + * @param ex the exception that occurred in the worker thread */ - protected synchronized void notifyOfTraversalError(Throwable error) { - // If the error is already a Runtime, pass it along as is. Otherwise, wrap it. - if (error instanceof RuntimeException) - this.error = (RuntimeException)error; + protected final void handleException(ExecutionException ex) { + if ( ex.getCause() instanceof RuntimeException ) + // if the cause was a runtime exception that's what we want to send up the stack + throw (RuntimeException )ex.getCause(); else - this.error = new ReviewedStingException("An error occurred during the traversal.", error); - + throw new HierarchicalMicroScheduler.ExecutionFailure("Hierarchical reduce failed", ex); } + /** A small wrapper class that provides the TreeReducer interface along with the FutureTask semantics. 
*/ private class TreeReduceTask extends FutureTask { private TreeReducer treeReducer = null; @@ -382,6 +381,17 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar } } + /** + * A specific exception class for HMS-specific failures such as + * Interrupted or ExecutionFailures that aren't clearly the fault + * of the underlying walker code + */ + public static class ExecutionFailure extends ReviewedStingException { + public ExecutionFailure(final String s, final Throwable throwable) { + super(s, throwable); + } + } + /** * Used by the ShardTraverser to report time consumed traversing a given shard. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index badd39860..9920213a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -27,16 +27,15 @@ import java.util.concurrent.Callable; * Carries the walker over a given shard, in a callable interface. */ public class ShardTraverser implements Callable { - private HierarchicalMicroScheduler microScheduler; - private Walker walker; - private Shard shard; - private TraversalEngine traversalEngine; - private ThreadLocalOutputTracker outputTracker; + final private HierarchicalMicroScheduler microScheduler; + final private Walker walker; + final private Shard shard; + final private TraversalEngine traversalEngine; + final private ThreadLocalOutputTracker outputTracker; private OutputMergeTask outputMergeTask; /** our log, which we want to capture anything from this class */ - protected static Logger logger = Logger.getLogger(ShardTraverser.class); - + final protected static Logger logger = Logger.getLogger(ShardTraverser.class); /** * Is this traversal complete? 
@@ -58,11 +57,10 @@ public class ShardTraverser implements Callable { public Object call() { try { traversalEngine.startTimersIfNecessary(); - long startTime = System.currentTimeMillis(); + final long startTime = System.currentTimeMillis(); Object accumulator = walker.reduceInit(); - LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(), + final WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(), microScheduler.getReadIterator(shard), shard.getGenomeLocs(), microScheduler.engine.getSampleDB().getSampleNames()); // todo: microScheduler.engine is protected - is it okay to user it here? @@ -76,18 +74,12 @@ public class ShardTraverser implements Callable { windowMaker.close(); outputMergeTask = outputTracker.closeStorage(); - long endTime = System.currentTimeMillis(); + final long endTime = System.currentTimeMillis(); microScheduler.reportShardTraverseTime(endTime-startTime); return accumulator; - } - catch(Throwable t) { - // Notify that an exception has occurred and rethrow it. - microScheduler.notifyOfTraversalError(t); - throw new ReviewedStingException("An error has occurred during traversal",t); - } - finally { + } finally { synchronized(this) { complete = true; notifyAll(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java b/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java index 6acaadd50..fc8a89c64 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/TreeReducer.java @@ -25,20 +25,11 @@ import java.util.concurrent.Future; * interface to force the reduce. 
*/ public class TreeReducer implements Callable { - private HierarchicalMicroScheduler microScheduler; + final private HierarchicalMicroScheduler microScheduler; private TreeReducible walker; private Future lhs; private Future rhs; - /** - * Create a one-sided reduce. Result will be a simple pass-through of the result. - * @param microScheduler The parent hierarchical microscheduler for this reducer. - * @param lhs The one side of the reduce. - */ - public TreeReducer( HierarchicalMicroScheduler microScheduler, Future lhs ) { - this( microScheduler, lhs, null ); - } - /** * Create a full tree reduce. Combine this two results using an unspecified walker at some point in the future. * @param microScheduler The parent hierarchical microscheduler for this reducer. @@ -67,10 +58,7 @@ public class TreeReducer implements Callable { if( lhs == null ) throw new IllegalStateException(String.format("Insufficient data on which to reduce; lhs = %s, rhs = %s", lhs, rhs) ); - if( rhs == null ) - return lhs.isDone(); - - return lhs.isDone() && rhs.isDone(); + return lhs.isDone() && (rhs == null || rhs.isDone()); } /** @@ -80,24 +68,21 @@ public class TreeReducer implements Callable { public Object call() { Object result = null; - long startTime = System.currentTimeMillis(); + final long startTime = System.currentTimeMillis(); try { if( lhs == null ) result = lhs.get(); + // todo -- what the hell is this above line? Shouldn't it be the two below? 
+// if( lhs == null ) +// throw new IllegalStateException(String.format("Insufficient data on which to reduce; lhs = %s, rhs = %s", lhs, rhs) ); else result = walker.treeReduce( lhs.get(), rhs.get() ); } - catch( InterruptedException ex ) { - microScheduler.notifyOfTraversalError(ex); - throw new ReviewedStingException("Hierarchical reduce interrupted", ex); - } - catch( ExecutionException ex ) { - microScheduler.notifyOfTraversalError(ex); - throw new ReviewedStingException("Hierarchical reduce failed", ex); - } + catch( InterruptedException ex ) { microScheduler.handleException(ex); } + catch( ExecutionException ex ) { microScheduler.handleException(ex); } - long endTime = System.currentTimeMillis(); + final long endTime = System.currentTimeMillis(); // Constituent bits of this tree reduces are no longer required. Throw them away. this.lhs = null; diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 6e8689e82..f477fedc9 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -314,7 +314,8 @@ public class WalkerTest extends BaseTest { // it's the type we expected System.out.println(String.format(" => %s PASSED", name)); } else { - e.printStackTrace(System.out); // must print to stdout to see the message + if ( e.getCause() != null ) + e.getCause().printStackTrace(System.out); // must print to stdout to see the message Assert.fail(String.format("Test %s expected exception %s but instead got %s with error message %s", name, expectedException, e.getClass(), e.getMessage())); } From bfa966a4e94f440f5eeb5bb338c09dd04198bafb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 9 Apr 2012 15:50:20 -0400 Subject: [PATCH 19/51] Bugfix for OneBPIndel -- Previously was only including 1 bp insertions in stratification --- .../gatk/walkers/varianteval/stratifications/OneBPIndel.java | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java index fe4f7641f..65633bc2b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java @@ -50,7 +50,7 @@ public class OneBPIndel extends VariantStratifier { public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (eval != null && eval.isIndel()) { for ( int l : eval.getIndelLengths() ) - if ( l > 1 ) + if ( Math.abs(l) > 1 ) return TWO_PLUS_BP; // someone is too long return ONE_BP; // all lengths are one } else From e6d5cb46d25c627818278a0ef68746678f9d4ec7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 9 Apr 2012 15:51:34 -0400 Subject: [PATCH 20/51] Improvements and bugfixes to IndelSummary -- Now properly includes both bi and multi-allelic variants. 
These are actually counted as well, and emitted as counts and % of sites with multiple alleles -- Bug fix for gold standard rate --- .../varianteval/evaluators/IndelSummary.java | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java index c22f82969..b4062fb10 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -62,8 +62,8 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { @DataPoint(description = "Percent of indels overlapping gold standard sites") public String gold_standard_matching_rate; - // counts 1 for each site where the number of alleles > 2 - public int nMultiIndelSites = 0; + @DataPoint(description = "Number of sites with where the number of alleles is greater than 2") + public int n_multiallelic_indel_sites = 0; @DataPoint(description = "Percent of indel sites that are multi-allelic") public String percent_of_sites_with_more_than_2_alleles; @@ -158,10 +158,9 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { break; case INDEL: final VariantContext gold = getWalker().goldStandard == null ? null : tracker.getFirstValue(getWalker().goldStandard); - if ( eval.isComplexIndel() ) break; // don't count complex substitutions - + nIndelSites++; - if ( ! eval.isBiallelic() ) nMultiIndelSites++; + if ( ! 
eval.isBiallelic() ) n_multiallelic_indel_sites++; // collect information about het / hom ratio for ( final Genotype g : eval.getGenotypes() ) { @@ -216,11 +215,11 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { } public void finalizeEvaluation() { - percent_of_sites_with_more_than_2_alleles = Utils.formattedRatio(nMultiIndelSites, nIndelSites); + percent_of_sites_with_more_than_2_alleles = Utils.formattedRatio(n_multiallelic_indel_sites, nIndelSites); SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels); SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels); - gold_standard_matching_rate = Utils.formattedNoveltyRate(n_indels_matching_gold_standard, n_indels); + gold_standard_matching_rate = Utils.formattedPercent(n_indels_matching_gold_standard, n_indels); indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels); frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting); From 285e61a227e87904daef86ae4423f44406568c23 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 10 Apr 2012 08:31:09 -0400 Subject: [PATCH 22/51] Bugfix for IndelSummary -- multi allelic count should be % not ratio --- .../sting/gatk/walkers/varianteval/evaluators/IndelSummary.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java index b4062fb10..198172411 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -215,7 +215,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { } public void finalizeEvaluation() { - 
percent_of_sites_with_more_than_2_alleles = Utils.formattedRatio(n_multiallelic_indel_sites, nIndelSites); + percent_of_sites_with_more_than_2_alleles = Utils.formattedPercent(n_multiallelic_indel_sites, nIndelSites); SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels); SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels); From ab06d53867f065e693b79d606034db5526263d7a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 10 Apr 2012 09:34:28 -0400 Subject: [PATCH 23/51] Useful test constructor or Unit tests in RefMetaDataTracker --- .../sting/gatk/refdata/RefMetaDataTracker.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 0e13e4ad9..2c2ee51bb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -47,6 +47,14 @@ public class RefMetaDataTracker { // // ------------------------------------------------------------------------------------------ + /** + * Only for testing -- not accesssible in any other context + */ + public RefMetaDataTracker() { + ref = null; + map = Collections.emptyMap(); + } + public RefMetaDataTracker(final Collection allBindings, final ReferenceContext ref) { this.ref = ref; From 38986e42400402d872f6d1c58d3cd13b02e8333d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 10 Apr 2012 13:54:20 -0400 Subject: [PATCH 24/51] Documentation for StratificationManager --- .../manager/StratificationManager.java | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java index 86821fbc1..c674a0146 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java @@ -54,16 +54,27 @@ public class StratificationManager implements Map strats) { - stratifiers = new ArrayList(strats); + this.stratifiers = new ArrayList(strats); + + // construct and store the full tree of strats this.root = buildStratificationTree(new LinkedList(strats)); + // assign the linear key ordering to the leafs assignKeys(root); + // cache the size, and check for a bad state this.size = root.size(); if ( this.size == 0 ) throw new ReviewedStingException("Size == 0 in StratificationManager"); + // prepare the assocated data vectors mapping from key -> data this.valuesByKey = new ArrayList(size()); this.stratifierValuesByKey = new ArrayList>(size()); this.keyStrings = new ArrayList(size()); @@ -72,9 +83,20 @@ public class StratificationManager implements Map buildStratificationTree(final Queue strats) { final K first = strats.poll(); if ( first == null ) { @@ -97,6 +119,10 @@ public class StratificationManager implements Map root) { int key = 0; @@ -106,15 +132,23 @@ public class StratificationManager implements Map root) { + /** + * Entry point to recursive tool that fills in the list of state values corresponding + * to each key. 
After this function is called you can map from key -> List of StateValues + * instead of walking the tree to find the key and reading the list of state values + * + * @param root + */ + private void assignStratifierValuesByKey(final StratNode root) { assignStratifierValuesByKey(root, new LinkedList()); - + + // do a last sanity check that no key has null value after assigning for ( List stateValues : stratifierValuesByKey ) if ( stateValues == null ) throw new ReviewedStingException("Found a null state value set that's null"); } - public void assignStratifierValuesByKey(final StratNode node, final LinkedList states) { + private void assignStratifierValuesByKey(final StratNode node, final LinkedList states) { if ( node.isLeaf() ) { // we're here! if ( states.isEmpty() ) throw new ReviewedStingException("Found a leaf node with an empty state values vector"); @@ -134,13 +168,17 @@ public class StratificationManager implements Map= 0") public int size() { return size; } @Ensures("result != null") - public StratNode getRoot() { + protected StratNode getRoot() { return root; } @@ -299,7 +337,7 @@ public class StratificationManager implements Map> combineStates(final List first, final List second) { - List> combined = new ArrayList>(first.size()); + final List> combined = new ArrayList>(first.size()); for ( int i = 0; i < first.size(); i++ ) { final Object firstI = first.get(i); final Object secondI = second.get(i); From 84d1e8713a888af88448e5efbf54a9298f414723 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 11 Apr 2012 09:41:45 -0400 Subject: [PATCH 25/51] Infrastructure for combining VariantEvaluations -- Not hooked up yet, so the output of VariantEval should be the same as before -- Implemented a VariantEvalUnitTest that tests the low level strat / eval combinatorics and counting routines -- Better docs throughout --- .../varianteval/VariantEvalReportWriter.java | 2 +- .../varianteval/VariantEvalWalker.java | 24 +- .../evaluators/VariantEvaluator.java | 38 +++ 
.../DynamicStratification.java | 69 +++++ .../manager/StratificationManager.java | 79 ++++- .../varianteval/util/EvaluationContext.java | 31 +- .../org/broadinstitute/sting/utils/Utils.java | 14 + .../varianteval/VariantEvalUnitTest.java | 277 ++++++++++++++++++ 8 files changed, 521 insertions(+), 13 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java index d4bbacdf1..8887e3c4f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java @@ -68,7 +68,7 @@ public class VariantEvalReportWriter { */ public final void writeReport(final PrintStream out) { for ( int key = 0; key < stratManager.size(); key++ ) { - final String stratStateString = stratManager.getStratsAndStatesForKeyString(key); + final String stratStateString = stratManager.getStratsAndStatesStringForKey(key); final List> stratsAndStates = stratManager.getStratsAndStatesForKey(key); final EvaluationContext nec = stratManager.get(key); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 6c7922ea5..a73bc2c70 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -17,6 +17,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import 
org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.DynamicStratification; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.IntervalStratification; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; @@ -221,6 +222,7 @@ public class VariantEvalWalker extends RodWalker implements Tr // The set of all possible evaluation contexts StratificationManager stratManager; + //Set dynamicStratifications = Collections.emptySet(); /** * Initialize the stratifications, evaluations, evaluation contexts, and reporting object @@ -360,6 +362,14 @@ public class VariantEvalWalker extends RodWalker implements Tr if (tracker != null) { String aastr = (ancestralAlignments == null) ? 
null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases()); +// // update the dynamic stratifications +// for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) { +// // don't worry -- DynamicStratification only work with one eval object +// for ( final DynamicStratification ds : dynamicStratifications ) { +// ds.update(vc); +// } +// } + // --------- track --------- sample - VariantContexts - HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); @@ -456,13 +466,13 @@ public class VariantEvalWalker extends RodWalker implements Tr * @param sampleName * @return */ - private Collection getEvaluationContexts(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final VariantContext eval, - final String evalName, - final VariantContext comp, - final String compName, - final String sampleName ) { + protected Collection getEvaluationContexts(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final VariantContext eval, + final String evalName, + final VariantContext comp, + final String compName, + final String sampleName ) { final List> states = new LinkedList>(); for ( final VariantStratifier vs : stratManager.getStratifiers() ) { states.add(vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java index bb4cab750..df4c3e860 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; public abstract class VariantEvaluator implements Comparable { @@ -67,4 +68,41 @@ public abstract class VariantEvaluator implements Comparable { public int compareTo(final VariantEvaluator variantEvaluator) { return getSimpleName().compareTo(variantEvaluator.getSimpleName()); } + + /** + * Evaluation modules that override this function to indicate that they support + * combining the results of two independent collections of eval data into + * a single meaningful result. The purpose of this interface is to + * allow us to cut up the input data into many independent stratifications, and then + * at the end of the eval run decide which stratifications to combine. This is + * important in the case of AC, where you may have thousands of distinct AC + * values that chop up the number of variants to too small a number of variants, + * and you'd like to combine the AC values into ranges containing some percent + * of the data. + * + * For example, suppose you have an eval that + * counts variants in a variable nVariants. If you want to be able to combine + * multiple evaluations of this type, overload the combine function + * with a function that sets this.nVariants += other.nVariants. + * + * Add in the appropriate fields of the VariantEvaluator T + * (of the same type as this object) to the values of this object. + * + * The values in this and other are implicitly independent, so that + * the values can be added together. 
+ * + * @param other a VariantEvaluator of the same type of this object + */ + public void combine(final VariantEvaluator other) { + throw new ReviewedStingException(getSimpleName() + " doesn't support combining results, sorry"); + } + + /** + * Must be overloaded to return true for evaluation modules that support the combine operation + * + * @return + */ + public boolean supportsCombine() { + return false; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java new file mode 100644 index 000000000..21255f7b3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Tag this stratification as dynamically determining the final strat based on the input data + * + * The paradigm here is simple. We upfront create a strat with N states that reflect the finest grained + * possible division of the data. The data is processed, and statistics collected for each of the N states. + * An update call is made to the stratification for evaluation VariantContext during each map call, + * allowing the strat to collect data about the usage of each state. A final call requests that + * the stratification map down the N states into M states (typically less than N, not necessarily + * a subset of N). This is provided by returning a map from each of M state -> N states and + * the VariantEval walker will combine all of the evaluations for N into a single value for + * each M. + * + * For example, suppose I have a dynamic strat called AC, adopting 7 possible values 0,1,2,3,4,5,6. This + * strats tracks the number of eval vcs for each state, with final counts 0=1, 1=100, 2=10, 3=5, 4=3, 5=2, 6=1. + * The stratification attempts to combine the strats down to so that each state has approximately the same + * fraction of the data in each bin. Overall there is 1+100+10+5+3+2+1=124 observations and 7 bins so we really + * want ~ 18 observations in each bin. So we merge 3-6 with 5+3+2+1 = 11 and keep 2, 1, and 0 as distinct bins. We + * return a map from 0 -> 0, 1 -> 1, 2 -> 2, 3-6 -> {3,4,5,6}. + * + * TODO - some open implementation questions + * -- We should only create one stratifier overall. How do we track this? When we create the stratifiers + * perhaps we can look at them and create a tracker? + * -- How do we create a new stratifier based on the finalStratifications() given the framework? 
Conceptually + * this new thing is itself a stratifier, just like before, but it's states are determined at the end. We'd + * then like to call not getRelevantStates but a different function that accepts an old state and returns + * the new state. Perhaps the process should look like: + * finalizeStratification -> new Stratifier whose states are the final ones + * getNewState(old state) -> new state (one of those in getFinalStratification) + * + * @author Mark DePristo + * @since 4/9/12 + */ +public interface DynamicStratification { + public void update(final VariantContext eval); + public VariantStratifier finalizeStratification(); + public Object getFinalState(final Object oldState); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java index c674a0146..5e8db8107 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manage import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -226,7 +228,7 @@ public class StratificationManager implements Map implements Map { + /** take two values of type V and return a combined value of type V */ + public V combine(final V lhs, final V rhs); + } + + /** + * Remaps the stratifications from one stratification set to another, combining + * the values in V according to 
the combiner function. + * + * stratifierToReplace defines a set of states S1, while newStratifier defines + * a new set S2. remappedStates is a map from all of S1 into at least some of + * S2. This function creates a new, fully initialized manager where all of the + * data in this new manager is derived from the original data in this object + * combined according to the mapping remappedStates. When multiple + * elements of S1 can map to the same value in S2, these are sequentially + * combined by the function combiner. Suppose for example at states s1, s2, and + * s3 all map to N1. Eventually the value associated with state N1 would be + * + * value(N1) = combine(value(s1), combine(value(s2), value(s3)) + * + * in some order for s1, s2, and s3, which is not defined. Note that this function + * only supports combining one stratification at a time, but in principle a loop over + * stratifications and this function could do the multi-dimensional collapse. + * + * @param stratifierToReplace + * @param newStratifier + * @param combiner + * @param remappedStates + * @return + */ + public StratificationManager combineStrats(final K stratifierToReplace, + final K newStratifier, + final Combiner combiner, + final Map remappedStates) { + // make sure the mapping is reasonable + if ( ! newStratifier.getAllStates().containsAll(remappedStates.values()) ) + throw new ReviewedStingException("combineStrats: remapped states contains states not found in newStratifer state set"); + + if ( ! 
remappedStates.keySet().containsAll(stratifierToReplace.getAllStates()) ) + throw new ReviewedStingException("combineStrats: remapped states missing mapping for some states"); + + // the new strats are the old ones with the single replacement + final List newStrats = new ArrayList(getStratifiers()); + final int stratOffset = newStrats.indexOf(stratifierToReplace); + if ( stratOffset == -1 ) + throw new ReviewedStingException("Could not find strat to replace " + stratifierToReplace + " in existing strats " + newStrats); + newStrats.set(stratOffset, newStratifier); + + // create an empty but fully initialized new manager + final StratificationManager combined = new StratificationManager(newStrats); + + // for each key, get its state, update it according to the map, and update the combined manager + for ( int key = 0; key < size(); key++ ) { + // the new state is just the old one with the replacement + final List newStates = new ArrayList(getStatesForKey(key)); + final Object oldState = newStates.get(stratOffset); + final Object newState = remappedStates.get(oldState); + newStates.set(stratOffset, newState); + + // look up the new key given the new state + final int combinedKey = combined.getKey(newStates); + if ( combinedKey == -1 ) throw new ReviewedStingException("Couldn't find key for states: " + Utils.join(",", newStates)); + + // combine the old value with whatever new value is in combined already + final V combinedValue = combiner.combine(combined.get(combinedKey), get(key)); + + // update the value associated with combined key + combined.set(combinedKey, combinedValue); + } + + return combined; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java index 9363bbd79..390682837 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java 
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -14,15 +15,23 @@ import java.util.*; public final class EvaluationContext { // NOTE: must be hashset to avoid O(log n) cost of iteration in the very frequently called apply function - private final HashSet evaluationInstances; + final VariantEvalWalker walker; + private final ArrayList evaluationInstances; + private final Set> evaluationClasses; public EvaluationContext(final VariantEvalWalker walker, final Set> evaluationClasses) { - evaluationInstances = new HashSet(evaluationClasses.size()); + this(walker, evaluationClasses, true); + } + + private EvaluationContext(final VariantEvalWalker walker, final Set> evaluationClasses, final boolean doInitialize) { + this.walker = walker; + this.evaluationClasses = evaluationClasses; + this.evaluationInstances = new ArrayList(evaluationClasses.size()); for ( final Class c : evaluationClasses ) { try { final VariantEvaluator eval = c.newInstance(); - eval.initialize(walker); + if ( doInitialize ) eval.initialize(walker); evaluationInstances.add(eval); } catch (InstantiationException e) { throw new ReviewedStingException("Unable to instantiate eval module '" + c.getSimpleName() + "'", e); @@ -62,4 +71,20 @@ public final class EvaluationContext { } } } + + public void combine(final 
EvaluationContext rhs) { + for ( int i = 0; i < evaluationInstances.size(); i++ ) + evaluationInstances.get(i).combine(rhs.evaluationInstances.get(i)); + } + + public final static EvaluationContextCombiner COMBINER = new EvaluationContext.EvaluationContextCombiner(); + private static class EvaluationContextCombiner implements StratificationManager.Combiner { + @Override + public EvaluationContext combine(EvaluationContext lhs, final EvaluationContext rhs) { + if ( lhs == null ) + lhs = new EvaluationContext(rhs.walker, rhs.evaluationClasses, false); + lhs.combine(rhs); + return lhs; + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index c2c608903..7b627fba2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -750,4 +750,18 @@ public class Utils { public static String formattedRatio(final long num, final long denom) { return denom == 0 ? 
"NA" : String.format("%.2f", num / (1.0 * denom)); } + + /** + * Create a constant map that maps each value in values to itself + * @param values + * @param + * @return + */ + public static Map makeIdentityFunctionMap(Collection values) { + Map map = new HashMap(values.size()); + for ( final T value : values ) + map.put(value, value); + return Collections.unmodifiableMap(map); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalUnitTest.java new file mode 100644 index 000000000..218af3b62 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalUnitTest.java @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +// our package +package org.broadinstitute.sting.gatk.walkers.varianteval; + + +// the imports for unit testing. + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class VariantEvalUnitTest extends BaseTest { + VariantEvalWalker VEwalker; + VariantContext eval; + + + @BeforeMethod + public void init() { + VEwalker = new VariantEvalWalker(); + eval = new VariantContextBuilder("x", "chr1", 1, 1, Collections.singleton(Allele.create("A", true))).make(); + } + + // -------------------------------------------------------------------------------- + // + // Test stratifications / evaluations + // + // -------------------------------------------------------------------------------- + + private class StratifiedEvalTestProvider extends TestDataProvider { + final List stratificationObjects = new ArrayList(); + final Set> evaluationObjects = new HashSet>(); + final List expectedCounts; + final int maxI; + + /** + * + * @param maxI test integers from 1 ... 
maxI + * @param expectedCounts the expected number of integers from 1 ... maxI divisible by each combination, in order, of allStates + * @param allStates all stratification tests, in order + */ + public StratifiedEvalTestProvider(int maxI, + final List expectedCounts, + final List ... allStates) { + super(StratifiedEvalTestProvider.class); + + this.maxI = maxI; + this.expectedCounts = expectedCounts; + this.evaluationObjects.add(CounterEval.class); + + String stateName = ""; + for ( List states : allStates ) { + stratificationObjects.add(new IntegerStratifier(states)); + stateName = stateName + Utils.join(",", states) + " "; + } + + setName(String.format("maxI=%d expectedCounts=%s states=%s", maxI, Utils.join(",", expectedCounts), stateName)); + } + } + + /** + * Test stratifier -> holds a list of integers, and the states are if the integer value of evalName is divisable + * by that number + */ + public static class IntegerStratifier extends VariantStratifier { + final List integers; + + private IntegerStratifier(final List integers) { + this.integers = integers; + initialize(); + } + + @Override + public void initialize() { + states.addAll(integers); + } + + @Override + public List getRelevantStates(final ReferenceContext ref, final RefMetaDataTracker tracker, final VariantContext comp, final String compName, final VariantContext eval, final String evalName, final String sampleName) { + int i = Integer.valueOf(evalName); // a terrible hack, but we can now provide accessible states + List states = new ArrayList(); + for ( int state : integers ) + if ( i % state == 0 ) + states.add(state); + return states; + } + } + + /** + * Test evaluator -> just counts the number of calls to update1 + */ + public static class CounterEval extends VariantEvaluator { + public int count = 0; + + @Override public int getComparisonOrder() { return 1; } + + @Override + public void update1(final VariantContext eval, final RefMetaDataTracker tracker, final ReferenceContext ref, final 
AlignmentContext context) { + count++; + } + + @Override + public boolean supportsCombine() { + return true; + } + + @Override + public void combine(final VariantEvaluator other) { + this.count += ((CounterEval)other).count; + } + } + + private void initialize(StratifiedEvalTestProvider cfg) { + VEwalker.createStratificationStates(cfg.stratificationObjects, cfg.evaluationObjects); + + final RefMetaDataTracker tracker = new RefMetaDataTracker(); + final ReferenceContext ref = null; + final VariantContext comp = null; + final String compName = null, sampleName = null; + + // increment eval counts for each stratification of divisors of i from from 1...maxI + for ( int i = 1; i <= cfg.maxI; i++ ) { + final String evalName = String.valueOf(i); // terrible hack to stratify by divisor + for ( EvaluationContext nec : VEwalker.getEvaluationContexts(tracker, ref, eval, evalName, comp, compName, sampleName) ) { + synchronized (nec) { + nec.apply(tracker, ref, null, comp, eval); + } + } + } + } + + @DataProvider(name = "StratifiedEvalTestProvider") + public Object[][] makeStratifiedEvalTestProvider() { + + new StratifiedEvalTestProvider(4, // test 1, 2, 3, 4 + Arrays.asList(4, 2), // 4 divisible by 1, 2 by 2 + Arrays.asList(1, 2)); + + new StratifiedEvalTestProvider(6, // test 1, 2, 3, 4, 5, 6 + Arrays.asList(6, 3, 2), // 6 divisible by 1, 3 by 2, 2 by 3 + Arrays.asList(1, 2, 3)); + + // test that some states can be empty -- does this work in VE? 
+ new StratifiedEvalTestProvider(6, + Arrays.asList(3, 2), + Arrays.asList(2, 3)); + + // test a single stratification + new StratifiedEvalTestProvider(6, + Arrays.asList(3), + Arrays.asList(2)); + + // test a meaningless state + new StratifiedEvalTestProvider(4, // test 1, 2, 3, 4 + Arrays.asList(4, 2), // 4 divisible by 1, 2 by 2 + Arrays.asList(1, 2), Arrays.asList(1)); + + // test a adding a state that divides space in half + new StratifiedEvalTestProvider(4, + Arrays.asList(2, 2), + Arrays.asList(1, 2), Arrays.asList(2)); + + // test pairs of strats + new StratifiedEvalTestProvider(12, + Arrays.asList(4, 3, 2, 3), + Arrays.asList(1, 2), Arrays.asList(3, 4)); + + return StratifiedEvalTestProvider.getTests(StratifiedEvalTestProvider.class); + } + + /** + * Ensures that counting and stratifications all are working properly by iterating + * over integers 1...cfg.N and stratify according to cfg, and that the counts in + * each bin are as expected. + * + * @param cfg + */ + @Test(dataProvider = "StratifiedEvalTestProvider") + public void testBasicOperation(StratifiedEvalTestProvider cfg) { + initialize(cfg); + checkStratificationCountsAreExpected(VEwalker.stratManager, cfg.expectedCounts); + } + + private final void checkStratificationCountsAreExpected(final StratificationManager manager, + final List expectedCounts) { + for ( int key = 0; key < manager.size(); key++ ) { + final String stratStateString = manager.getStratsAndStatesStringForKey(key); + final EvaluationContext nec = manager.get(key); + + for ( final VariantEvaluator ve : nec.getVariantEvaluators() ) { + // test for count here + final CounterEval counterEval = (CounterEval)ve; + final int expected = expectedCounts.get(key); + Assert.assertEquals(counterEval.count, expected, "Count seen of " + counterEval.count + " not expected " + expected + " at " + stratStateString); + } + } + } + + /** + * A derived test on testBasicOperation that checks that combining stratifications + * works as expected by 
ensuring the results are the same when the remapped + * strats are the identity map (A -> A, B -> B, etc) + */ + @Test(dataProvider = "StratifiedEvalTestProvider", dependsOnMethods = {"testBasicOperation"}) + public void testIdentityCombine(StratifiedEvalTestProvider cfg) { + for ( int i = 0; i < cfg.stratificationObjects.size(); i++ ) { + initialize(cfg); + final VariantStratifier toReplace = cfg.stratificationObjects.get(i); + final VariantStratifier newStrat = cfg.stratificationObjects.get(i); + final Map remappedStates = Utils.makeIdentityFunctionMap(newStrat.getAllStates()); + StratificationManager combined = + VEwalker.stratManager.combineStrats(toReplace, newStrat, EvaluationContext.COMBINER, remappedStates); + checkStratificationCountsAreExpected(combined, cfg.expectedCounts); + } + } + +// /** +// * A derived test on testBasicOperation that checks that combining stratifications +// * works as expected. We look into cfg, and if there are multiple states we create +// * dynamically create a combinations of the stratifications, and ensure that the +// * combined results are as we expected. 
+// */ +// @Test(dataProvider = "StratifiedEvalTestProvider", dependsOnMethods = {"testBasicOperation"}) +// public void testCombinedEachStrat(StratifiedEvalTestProvider cfg) { +// for ( int i = 0; i < cfg.stratificationObjects.size(); i++ ) { +// initialize(cfg); +// final VariantStratifier toReplace = cfg.stratificationObjects.get(i); +// +// // TODO -- replace this code with something that combines values in strat +// final VariantStratifier newStrat = cfg.stratificationObjects.get(i); +// final Map remappedStates = Utils.makeIdentityFunctionMap(newStrat.getAllStates()); +// final List expected = cfg.expectedCounts; +// +// StratificationManager combined = +// VEwalker.stratManager.combineStrats(toReplace, newStrat, EvaluationContext.COMBINER, remappedStates); +// checkStratificationCountsAreExpected(combined, expected); +// } +// } +} \ No newline at end of file From 23ccf772d4f644214eebd9d4e5fcc9358e5d55bf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 13 Apr 2012 11:21:02 -0400 Subject: [PATCH 27/51] IndelSummary now emits all of the underlying counts for ratios, percentages, etc it computes --- .../varianteval/evaluators/IndelSummary.java | 70 ++++++++++++------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java index 198172411..dda7e8611 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import 
org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -41,51 +40,81 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; public class IndelSummary extends VariantEvaluator implements StandardEval { final protected static Logger logger = Logger.getLogger(IndelSummary.class); + // + // counts of snps and indels + // @DataPoint(description = "Number of SNPs", format = "%d") public int n_SNPs = 0; @DataPoint(description = "Number of singleton SNPs", format = "%d") public int n_singleton_SNPs = 0; - @DataPoint(description = "Number of Indels", format = "%d") + @DataPoint(description = "Number of indels", format = "%d") public int n_indels = 0; - // Number of Indels Sites (counts one for any number of alleles at site) - public int nIndelSites = 0; - - @DataPoint(description = "Number of singleton Indels", format = "%d") + @DataPoint(description = "Number of singleton indels", format = "%d") public int n_singleton_indels = 0; + // + // gold standard + // @DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d") public int n_indels_matching_gold_standard = 0; @DataPoint(description = "Percent of indels overlapping gold standard sites") public String gold_standard_matching_rate; + // + // multi-allelics + // + // Number of Indels Sites (counts one for any number of alleles at site) + public int nIndelSites = 0; + @DataPoint(description = "Number of sites with where the number of alleles is greater than 2") public int n_multiallelic_indel_sites = 0; @DataPoint(description = "Percent of indel sites that are multi-allelic") public String percent_of_sites_with_more_than_2_alleles; + // + // snp : indel ratios + // @DataPoint(description = "SNP to indel ratio") public String SNP_to_indel_ratio; @DataPoint(description = "Singleton 
SNP to indel ratio") public String SNP_to_indel_ratio_for_singletons; + // + // novelty + // + @DataPoint(description = "Number of novel indels", format = "%d") + public int n_novel_indels = 0; + @DataPoint(description = "Indel novelty rate") public String indel_novelty_rate; - @DataPoint(description = "Frameshift percent") - public String frameshift_rate_for_coding_indels; - // // insertions to deletions // + @DataPoint(description = "Number of insertion indels") + public int n_insertions = 0; + + @DataPoint(description = "Number of deletion indels") + public int n_deletions = 0; + @DataPoint(description = "Insertion to deletion ratio") public String insertion_to_deletion_ratio; + @DataPoint(description = "Number of large (>10 bp) deletions") + public int n_large_deletions = 0; + + @DataPoint(description = "Number of large (>10 bp) insertions") + public int n_large_insertions = 0; + + @DataPoint(description = "Ratio of large (>10 bp) insertions to deletions") + public String insertion_to_deletion_ratio_for_large_indels; + // // Frameshifts // @@ -95,6 +124,9 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { @DataPoint(description = "Number of indels in protein-coding regions not labeled as frameshift") public int n_coding_indels_in_frame = 0; + @DataPoint(description = "Frameshift percent") + public String frameshift_rate_for_coding_indels; + // // Het : hom ratios // @@ -106,8 +138,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0; - int nKnownIndels = 0, nInsertions = 0; - int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used @@ -129,15 +159,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { public final static int LARGE_INDEL_SIZE_THRESHOLD = 10; - @DataPoint(description = 
"Number of large (>10 bp) deletions") - public int n_large_deletions = 0; - - @DataPoint(description = "Number of large (>10 bp) insertions") - public int n_large_insertions = 0; - - @DataPoint(description = "Ratio of large (>10 bp) insertions to deletions") - public String insertion_to_deletion_ratio_for_large_indels; - @Override public int getComparisonOrder() { return 2; } public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -171,13 +192,14 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { for ( Allele alt : eval.getAlternateAlleles() ) { n_indels++; // +1 for each alt allele if ( variantWasSingleton(eval) ) n_singleton_indels++; - if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific? + if ( comp == null ) n_novel_indels++; // TODO -- make this test allele specific? if ( gold != null ) n_indels_matching_gold_standard++; // ins : del ratios final int alleleSize = alt.length() - eval.getReference().length(); if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference()); - if ( alleleSize > 0 ) nInsertions++; + if ( alleleSize > 0 ) n_insertions++; + if ( alleleSize < 0 ) n_deletions++; // requires snpEFF annotations if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) { @@ -220,7 +242,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels); gold_standard_matching_rate = Utils.formattedPercent(n_indels_matching_gold_standard, n_indels); - indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels); + indel_novelty_rate = Utils.formattedNoveltyRate(n_indels - n_novel_indels, n_indels); frameshift_rate_for_coding_indels = 
Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting); ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]); @@ -229,7 +251,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms); indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms); - insertion_to_deletion_ratio = Utils.formattedRatio(nInsertions, n_indels - nInsertions); + insertion_to_deletion_ratio = Utils.formattedRatio(n_insertions, n_deletions); insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions); } From f9190b6fcd5a042b81834bda238323ed09992701 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 13 Apr 2012 11:24:33 -0400 Subject: [PATCH 28/51] VariantEvalUnitTest is better named VariantEvalWalkerUnitTest --- ...{VariantEvalUnitTest.java => VariantEvalWalkerUnitTest.java} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/{VariantEvalUnitTest.java => VariantEvalWalkerUnitTest.java} (99%) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java index 218af3b62..ca06ca699 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java @@ -48,7 +48,7 @@ import org.testng.annotations.Test; import java.util.*; -public 
class VariantEvalUnitTest extends BaseTest { +public class VariantEvalWalkerUnitTest extends BaseTest { VariantEvalWalker VEwalker; VariantContext eval; From 3f6b2423d8cb741c477281b1859fcef23eaddfbe Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 13 Apr 2012 11:56:48 -0400 Subject: [PATCH 29/51] Update VE IT to reflect new fields and bugfixes --- .../varianteval/VariantEvalIntegrationTest.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 035bf4020..1ab7b679e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -302,7 +302,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("4c00cfa0fd343fef62d19af0edeb4f65")); + 1, Arrays.asList("8d4530e9cef8531c46bbb693b84d04c7")); executeTestParallel("testSelect1", spec); } @@ -330,7 +330,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("4df6654860ad63b7e24e6bc5fbbbcb00")); + WalkerTestSpec spec = 
new WalkerTestSpec(extraArgs,1,Arrays.asList("bb076f7239039191fde883c5e68483ea")); executeTestParallel("testCompVsEvalAC",spec); } @@ -360,7 +360,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3b85cd0fa37539ff51d34e026f26fef2")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("9d24f34d94d74417e00e3b7bcf84650f")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -372,7 +372,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("bed8751c773b9568218f78c90f13348a")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("7329b0bc73c9ccaf5facd754f3410c38")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -488,7 +488,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("9726c0c8f19d271cf680f5f16f0926b3") + Arrays.asList("aad01b26198b30da5d59a05c08d863bb") ); executeTest("testModernVCFWithLargeIndels", spec); } @@ -508,7 +508,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("c89705147ef4233d5de3a539469bd1d1") + Arrays.asList("4fa2557663ef8fb4cdeecd667791985c") ); executeTest("testStandardIndelEval", spec); } From 87be63c7e46a4ab14832d0ad2314d4f942f77dac Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 13 Apr 2012 16:59:15 -0400 Subject: [PATCH 30/51] Improve variantCallQC.R -- Refactor plotting utilities into master utility in gsalib. 
Everyone can use it now -- Better plots for standard variantCallQC --- .../utils/R/gsalib/R/gsa.variantqc.utils.R | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R new file mode 100644 index 000000000..88fc48e2a --- /dev/null +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R @@ -0,0 +1,236 @@ +library(gplots) +library(ggplot2) + +# ------------------------------------------------------- +# Utilities for displaying multiple plots per page +# ------------------------------------------------------- + +distributeGraphRows <- function(graphs, heights = c()) { + # Viewport layout 2 graphs top to bottom with given relative heights + # + # + if (length(heights) == 0) { + heights <- rep.int(1, length(graphs)) + } + heights <- heights[!is.na(graphs)] + graphs <- graphs[!is.na(graphs)] + numGraphs <- length(graphs) + Layout <- grid.layout(nrow = numGraphs, ncol = 1, heights=heights) + grid.newpage() + pushViewport(viewport(layout = Layout)) + subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1) + for (i in 1:numGraphs) { + print(graphs[[i]], vp = subplot(i)) + } +} + +distributeLogGraph <- function(graph, xName) { + continuousGraph <- graph + scale_x_continuous(xName) + logGraph <- graph + scale_x_log10(xName) + opts(title="") + distributeGraphRows(list(continuousGraph, logGraph)) +} + +distributePerSampleGraph <- function(perSampleGraph, distGraph, ratio=c(2,1)) { + distributeGraphRows(list(perSampleGraph, distGraph), ratio) +} + +removeExtraStrats <- function(variantEvalDataFrame, moreToRemove=c()) { + # Remove the standard extra stratification columns FunctionalClass, Novelty, and others in moreToRemove from the variantEvalDataFrame + # + # Only keeps 
the column marked with "all" for each removed column + # + for ( toRemove in c("FunctionalClass", "Novelty", moreToRemove) ) { + if (toRemove %in% colnames(variantEvalDataFrame)) { + variantEvalDataFrame <- variantEvalDataFrame[variantEvalDataFrame[[toRemove]] == "all",] + } + } + variantEvalDataFrame +} + +openPDF <- function(outputPDF) { + # Open the outputPDF file with standard dimensions, if outputPDF is not NA + if ( ! is.na(outputPDF) ) { + pdf(outputPDF, height=8.5, width=11) + } +} + +closePDF <- function(outputPDF) { + # close the outputPDF file if not NA, and try to compact the PDF if possible + if ( ! is.na(outputPDF) ) { + dev.off() + if (exists("compactPDF")) { + compactPDF(outputPDF) + } + } +} + +makeRatioDataFrame <- function(ACs, num, denom, widths = NULL) { + if ( is.null(widths) ) widths <- rep(1, length(ACs)) + + value = NULL + titv <- data.frame(AC=ACs, width = widths, num=num, denom = denom, ratio = num / denom) +} + +.reduceACs <- function(binWidthForAC, ACs) { + # computes data structures necessary to reduce the full range of ACs + # + # binWidthForAC returns the number of upcoming bins that should be merged into + # that AC bin. 
ACs is a vector of all AC values from 0 to 2N that should be + # merged together + # + # Returns a list containing the reduced ACs starts, their corresponding widths, + # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc) + maxAC <- max(ACs) + newACs <- c() + widths <- c() + newACMap <- c() + ac <- 0 + while ( ac < maxAC ) { + newACs <- c(newACs, ac) + width <- binWidthForAC(ac) + widths <- c(widths, width) + newACMap <- c(newACMap, rep(ac, width)) + ac <- ac + width + } + list(ACs = newACs, widths=widths, newACMap = newACMap) +} + +# geometricACs <- function(k, ACs) { +# nBins <- round(k * log10(max(ACs))) +# +# binWidthForAC <- function(ac) { +# max(ceiling(ac / nBins), 1) +# } +# +# return(reduceACs(binWidthForAC, ACs)) +# } + +reduce.AC.on.LogLinear.intervals <- function(scaleFactor, ACs) { + # map the full range of AC values onto a log linear scale + # + # Reduce the full AC range onto one where the width of each new AC increases at a rate of + # 10^scaleFactor in size with growing AC values. 
This is primarily useful for accurately + # computing ratios or other quantities by AC that aren't well determined when the AC + # values are very large + # + # Returns a list containing the reduced ACs starts, their corresponding widths, + # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc) + maxAC <- max(ACs) + afs <- ACs / maxAC + breaks <- 10^(seq(-4, -1, scaleFactor)) + widths <- c() + lastBreak <- 1 + for ( i in length(breaks):1 ) { + b <- breaks[i] + width <- sum(afs < lastBreak & afs >= b) + widths <- c(widths, width) + lastBreak <- b + } + widths <- rev(widths) + + binWidthForAC <- function(ac) { + af <- ac / maxAC + value = 1 + for ( i in length(breaks):1 ) + if ( af >= breaks[i] ) { + value = widths[i] + break + } + + return(value) + } + + return(.reduceACs(binWidthForAC, ACs)) +} + +.remapACs <- function(remapper, k, df) { + newACs <- remapper(k, df$AC) + + n = length(newACs$ACs) + num = rep(0, n) + denom = rep(0, n) + for ( i in 1:dim(df)[1] ) { + rowI = df$AC == i + row = df[rowI,] + newAC = newACs$newACMap[row$AC] + newRowI = newACs$ACs == newAC + num[newRowI] = num[newRowI] + df$num[rowI] + denom[newRowI] = denom[newRowI] + df$denom[rowI] + } + + newdf <- makeRatioDataFrame(newACs$ACs, num, denom, newACs$widths ) + newdf +} + +compute.ratio.on.LogLinear.AC.intervals <- function(ACs, num, denom, scaleFactor = 0.1) { + df = makeRatioDataFrame(ACs, num, denom, 1) + return(.remapACs(reduce.AC.on.LogLinear.intervals, scaleFactor, df)) +} + +plotVariantQC <- function(metrics, measures, requestedStrat = "Sample", + fixHistogramX=F, anotherStrat = NULL, nObsField = "n_indels", + onSamePage=F, facetVariableOnXPerSample = F, facetVariableOnXForDist = T, moreTitle="") { + metrics$strat = metrics[[requestedStrat]] + + otherFacet = "." + id.vars = c("strat", "nobs") + metrics$nobs <- metrics[[nObsField]] + + # keep track of the other strat and it's implied facet value + if (! 
is.null(anotherStrat)) { + id.vars = c(id.vars, anotherStrat) + otherFacet = anotherStrat + } + + molten <- melt(metrics, id.vars=id.vars, measure.vars=c(measures)) + perSampleGraph <- ggplot(data=molten, aes(x=strat, y=value, group=variable, color=variable, fill=variable)) + title <- opts(title=paste(paste(paste(measures, collapse=", "), "by", requestedStrat), moreTitle)) + + determineFacet <- function(onX) { + if ( onX ) { + paste(otherFacet, "~ variable") + } else { + paste("variable ~", otherFacet) + } + } + + sampleFacet = determineFacet(facetVariableOnXPerSample) + distFacet = determineFacet(facetVariableOnXForDist) + + if ( requestedStrat == "Sample" ) { + perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale + perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "") + } else { + perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs))) + perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount") + } + perSampleGraph <- perSampleGraph + ylab("Variable value") + title + perSampleGraph <- perSampleGraph + facet_grid(sampleFacet, scales="free") + + nValues = length(unique(molten$value)) + if (nValues > 2) { + if ( requestedStrat == "Sample" ) { + distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable)) + } else { + distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable, weight=nobs)) + } + distGraph <- distGraph + geom_histogram(aes(y=..ndensity..)) + distGraph <- distGraph + geom_density(alpha=0.5, aes(y=..scaled..)) + distGraph <- distGraph + geom_rug(aes(y=NULL, color=variable, position="jitter")) + scale = "free" + if ( fixHistogramX ) scale = "fixed" + distGraph <- distGraph + facet_grid(distFacet, scales=scale) + distGraph <- distGraph + ylab("Relative frequency") + distGraph <- distGraph + xlab("Variable value (see facet for variable by color)") + 
distGraph <- distGraph + opts(axis.text.x=theme_text(angle=-45)) # , legend.position="none") + } else { + distGraph <- NA + } + + if ( onSamePage ) { + suppressMessages(distributePerSampleGraph(perSampleGraph, distGraph)) + } else { + suppressMessages(print(perSampleGraph)) + suppressMessages(print(distGraph + title)) + } +} From 282b4afdca3d3af961742dc2237275d7f91bcb21 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 17 Apr 2012 11:40:02 +0100 Subject: [PATCH 31/51] Licenses for GATK 1 and 2 beta --- licensing/GATK1_LICENSE | 22 ++++++++++++++++++++++ licensing/GATK2_beta_license.doc | Bin 0 -> 43520 bytes licensing/LICENSE | 22 ++++++++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 licensing/GATK1_LICENSE create mode 100644 licensing/GATK2_beta_license.doc create mode 100644 licensing/LICENSE diff --git a/licensing/GATK1_LICENSE b/licensing/GATK1_LICENSE new file mode 100644 index 000000000..648ec8fc3 --- /dev/null +++ b/licensing/GATK1_LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2012 The Broad Institute + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/licensing/GATK2_beta_license.doc b/licensing/GATK2_beta_license.doc new file mode 100644 index 0000000000000000000000000000000000000000..4fa04a3f62b33c4f8e60f886483f6a35e99e5e29 GIT binary patch literal 43520 zcmeHw349bq7I#f95|ThT5^m^(a3_<4aLAcSG9iH^6LWAZ2uw1QWMDEAW+n*{SWylI zFBC-tTtILY5JUk*Km-v41O+i1f^sPkMdc9CbtT{bRd>&oL_l}H-{*em{QlG3)zz=6 z-h1`x)$8icp--EfS^j40v&?N9#5~#UYCq=Vl&-*WhI}5x*!?)BwA<$TS$GPc@-u}_d5g7mMkM-mHYgP&UV7o~Lfy|D{p zAj3@9({?xe2;_UOJL-fR&EWpaINyx>gNcCKMtnfk7?q1#YHaTHX@W*lJ0tOf&dU~o zZhP#Pb`Wos`6}<__KH`<_MgPbLEJ}tk?(Wg4+MQ9B=1hg{havYyvcN&)9&V2Zf9Ne zh^_^M)^`%+eK$1kCQnjl*Id-;DP>2&n{Ur*d(f`?a{8)vjNL@47w&81bn8g)f%Nx0 z_R-jr{?XPH`C_qeqU5J`e$zS0Q7-?F@=c^F(xLL*x4qD>-bfrt?Xg)HZKTZ ztYJ&WFc7c?EI`qi1>(rhs|m)H`tAQ^4kTo0R7pZ=Ms`kWPHv9cKUh#n^Aw!c9nW#xmS7#-r zs?vlkb+%fSm6#%A=4NGTvenswCL?WBpnp!0!78WcTLaNDXGuTRP`Uo86D}{1{tw_)cB^HAz-%z483R<1cqPJQFtxf17Bx;NE zECyYnUPuI4y(Pkt5+S6kgy`6qxLBd5)Zh zz52Os5j-2Js4pnc=i3apjKZjjI;~AF9+=BbMpV~eDx|jLn~O_~dYfLy?<}#HOF-D7 zFEm(f7Ol--Hbof?`FfL8uNRz3Hkb;`mSTRLl8X&CqR@4|IyqULn3FnOEhMROP#?jEE3&NJ{7(>ImW+u3HT{w3qV8 zErf0q8?0!YNTEz`LU~eW5akrXIZ>ju*bK-IpB8+H5tKTJrT` z`4dg(M%1ct;>MX`(HnUUb>{q1l09c17+Qd<6BXt~6r+6PgAGBM^2uUKO19=q!S54mGWm6R)fSpD?dLRr5 z1ZahVqHmR2$*gf+J1*wK+!mt?qgzJ8Tovmr`36{1^k%)*l3ygChn1M2nBXwaM;Kmg0`fDtcN2LDGhABp+s+l z<)Y3Gi>nnNL&y&-pq9Z{;0_BZuPA&wN%XBUXfL<$R*?-*Yl~>HXs|U8oQAig{UVu0s~>X1R=vrdLSAavMeB@(uo!Y zg302BQHeni;t=#O6gH_t%Pk=rY&TOD0X(;1L?aP6Rh3(!@*-#n(Fc3Uute}t)m&<$ zVHL^c78-hR4Yti-wGv-=XW`b-uET243RzJfZih>7zf(!n4WK$0(9M>K7$Ec(Snmib zRg{<2P;4-2Avb+~k;!Z{7n0<`S5gG9Qk$QTQC$=>Iy$PVGeDZ9U_~AB_C`fO&P-Bp zsR)t`#UxbTW@s*&O%+A+_<(DI{Xu8bq8+42#d=$jS?8_vhgfIV~+uu8y~9@ zvcm94V?9xF?xT`S&;T;`HcKg~pP5cD(xHe7K^oyoOTZty=%CA}BBUo-H;k#~5@@}H 
zUl^Tv2OtvUUO=uE^!5TGZGoTzQ$KtyWe(r3h$ z$(BemKy*L62JTsr??+l{w2MNv|3aSD3Nerri90e4nLpI{BCSP7ydsw9O7mfL?c6UY zwQ$MvQMH!t?NVQhUhWJ)Iy!(<4n)FB} zi($^QK`%+C9L@%%lQK)*ki&A;RdcB)T3{Y5xKD2?gxQ3ifDX)o7X7K1oGmI?A_i_@ z;O8a_?3Z|sp{l1+NQ4`anxxKBkwX#apIV0pBbgkln`StJaAcCg(TV!Q+WXhLZH9Bz1n7NG{Pr{cNU z0&8rqvs@YKO5is5DN{gKh3|_(;6%e{fHnyI9<$3y!?;HRF{9Ifi}qq13N6|uo)3oB zVt|2lsjBFul2=t~q-hQBp6I=T< zIWVI(&h-<0E^=l`hhWN!g>qPTKK0g?X)zv)=D}hvEiB^wv`}x-Tg+Ar2P6f$k$C5H zzvKzkI#~}4w6n}%LaEi%Ec=K>{fVClCT%enwqsbLjAdHlD+CX@ksXzeahEnP4?S4E zi28Anlcs^y&O{+{$_`>w1Rp2N1?a!9k`Y3IHXp*laA&oZ(p(2gdW^;93b=aWePDt( zClyuIF)UieiK!Se;6~I*O=S-nLUUsSx-cJN(YZ*OG1!Cen0<2{gsPG-++6&~zGO$cKQf z1qEs@EQqXY;Df{FkdsE8i8EU=ZZxjKy73SITEzJT&)QCaYYle|NC=K(=a>jN^maxS zB1(YSZ7Ire>-sflW3=i8zc>T&r;OC()I@2#vXc^35B{UR*kmYxoDm1nn{=WTfF}%J zMr$cHQVdB_JW_G61~wEOw}f~{uH#)Q3`SWqrcyNA8%$_~%}}Vd!Ax-$z%$?qR%;1h zDEYj|8v*x<&%sd<&@&X8kkoa4s@MjagYc+SAhxJ`2#s+n|ywHx_fr5GjoWQ&xrnC65!ag?EA zNo$}(P8B1yf_0`5oivWydoFXM)=-T4SWBVF;61;L1M*r;I<9ULN_B-a!>37_f#$9P zIeRFA?ng0Z^az~Uo9fgc#|*~BHxvtzH67iknAh0Mr#xBoQ7a6ufTi49Or z_>5dJP^{c-xY~==hvE>!Fx|#EdE49z?M7~tz3b6s-btt!d3q*0(WMY5?3Bw40_xlw z6ATSPr}GFzeIIb27X4M22thrP>d5Ps znx2`Ks!l?fHX|`DHwpJf2qRK+QZ%_a=+^0}Vl$9tZvz!capDv-NtKYAmYOpv0@13R z45G+`%0i|pD1rZQdrUx6 zGe~ZrhQ?&4sM6A;I->fiIoT9k&y*v>saZJEWTB0kWEep>dXNr8q8X)D<4}dfv{clL zBK^bFi1iOqQFh2Y$#F58>Vs@x2NwWDi5UQ-lSaIx34!|lT%1#6KS`CY8ba}Whu+*# z_G*yO&{G2Y&1gu*(A=z15=TWZl##o<_D3=|+~ zcxpCjpqP@PLd0BTI>pL08C1qF9+)NmWeCZtM47FWOf4Zj!eff;R8bw}B#Pk<(Lm$S zIr}ID6*3ITpeG`=J;=a(n{2C*hIacPME1mP3b?g1W5~zI03!(-!fm{0X6?6-=_CV4 z2G%t$L!kxONwQ@k3&bG~ZiR^tm?*2vY|x>yeU(BwbR3?(DnqTwh1uwfxTAV<2~41n zNKUlb=;S|(qb_Pw3eyP3n~&HA8>~2Yp5YZ$NERLEuEBYX8AOvi=4PW{)Rv2535;kd z0=T@H(nZAb5I0AwqRx$~(ZHtXR zLBRDyF;D>4UR7+S(FV2P*4R{NWibdt42xFB-wR?J5}J&T{zj=e0nrQ3e5dL9ulMp5+A@sPpSaZ z2R;>s$3V^?CST%>Tw}PiX+R)K=-M;78+?=suvk=H3iwgPn;Kf9>ITGebmUDLE0jVm z#mwZa;E7X=LF9?#q)LRj?;!_k+?1LW&J50>PNR5AQFJo=W1m z5IRhS9=@YA2cQlTQ5!=Mm?#4M2f6H=qNh$(rgp>Arl~3gtxBY*J%aD#ZaQWU(1e}{ z4p_^YvEzo`h5im(Y*A{!P 
zm;oNLjRJrOnBqYiuN5X(CaRp?fM_b&3+Rzvhlqj-N_ThM($Rx*c;@ye&^*v1(NrRC z(^uIGW(@NhZrwm#j1pj*txJCp}yzP0!CaTXbB(?Qv3` zodzlHiAd}s&&6yi*JF7_5mhoBoWKx`(3*`V*SX4UrAq1Wp2UePX%T8vfJrFlJ0}P? zfY#B#?jua)85rm#{{{v&!|ZE_T4){{9EeRmi=i{`R5Ht^i9&HH;t!%H;)w3UxX7j- z-U*GFC=tUobrEVbEwE7rdaI09GiD4DC2r<%r_F2zRdi(Tw8)Q9K}CElE&EM@-XqX| z@Idxin*gtsOHcBZr4ta=T0ma&h=qVUA;M(EQC^87;(nA>PWBf^z9(`@F zKpv3QY_A@2<>UD>;@tEYk*t{{3g<;TXc=hcl@T9TvJ)?_U;w8B|6f`V z=F3=Ppa&2E!~-fo4P*ch09L>TlmgEI>wu4eQ@~fikHA&nC*T(##E-Gdl{?qISUKy( zO4Io4)a=y3+4NVV-AzM^ZnW6Kv_o66;uJq-1sKxu4f9;nS_T(wns9@K6dmeqbG_2W z`ARR}hn-19$k%pu3uCyJ&|RS!iq{qZIa^5))rpteIi%=s#rb!Y?}rJK?keBbiNo(I zAMxOh^|QBzcyPzuXbo}kPWggqoj=K)WKJ^P010dajyA+00OWKSPr}atOwo#NFTNWw*g;}ssuoKsG=+lMCbny8+UlkAO;m z^ynDy1#l9$1hj%qwFeRa(y^OW;@_!Hsy?hbRVDwPs@k}+>V>&e&3f}(XD&J&!*1*u zmTT|XyieQhMvW_WyFph5byvs+fo{WHlv+DI@e1{9=UZ5NH(TRI(s5ntz3$g*vxN3K zO7}PyXm1&b+xY_3epBtX5@qBL7~zt;quj4k!2A1wIl!~Pe1KMUzX&V>76a#i-+
k!j7M?&81Opq3GhAO zjaQPI0ynEpSMJ@ucYD>w%8iv(l{Z&cExWlLfAeoXIe)rUUp0DJ6&MYU)zAW$-s0BI zy7^+iQiIP8#oiJ<6ffJ@rExqev4%Wr@EWx%_`9`6KJqTyb>XsNt!tXSq#9R1^$De? zYi^`v`<-=7vs0+iBII_je@EhZYoI$21;hY*fqlS!-~@0P_#OztSRVqk09pdA02=?> z108@^;8%b(#oKkjBw!}69k>Pj4EzFwHe>8fpcw`r$h+#Zz1!cUzrCMTt%Qw7ilwTE z|BasBthaCGj4X(iM0r{(jXI<G)*8cExQ9@1*q5PQ9-#_;UjjGV(t)o77jC)`K4P~yGpuW!LI z`RXI_=3gF=5A4J%rXK*ifJ)#f;Ey*>g8&Us1kA&mjn4qj0-J!hffK-K-~w<7@WWe} zjer(FE1(mQ3M>K^18afn7fxL`vVZ56ooioua_tQKm5B#N92(c=tsS|RTmC0n4z9tg z8gd8!+h{hp-(-yyX}Ikc62$+vKvkq$UgZr+~`<>B$ek zRe+(`=b>*(@$vo6i;RL5nE1yr-k_*ow3HjMT0sPLo4VEwx8yKO|~Lk z_Ri=`oZ(RpcbU}Syj_>bTX!3_Ym=cpKT&eaU$@5AEqVJDcdhD9cG;Qd9dRBGbOJg9 zF+eQP6DSAX2CR{o-vji0kl)b7?b{9e5e{6g(4@`2xm@`p_qY(aXs3&Vx=&p0`O5j8 zzwU8Oo3CgG_pFH={IkdU^poMHH3&kwhc?`!P)Ewm6li#3>Q`Zav&_5TJQ7d>Q9v{x z>p&ciNe?Ciqzmr=!O#N>;H(!nu2x;WasI~f8&^Nyzi0oRZJRf4{(SXcm(%xDbM)vM zRsj}->=DDBW(W@J<1}BOJeUVKt~t#CWeY@coc^nS&#u*CySPoO&EB!#Wabym8b&KZ z<|(GP%~Y_?K);gE#30sT{h=sf{h5KG>z8?l0UhV{^$MT2%)8TiSm^bKJX!()^A33= zm5?IUZ+}e=%=HRGxJJR66IZ zZ%uVYV0wIKkFhhK`>g>@M-qo3itfB6G1d5M)yr8DzP%);hu}=l3eeh8aL$IdX5!ca z6!PB^R)n-<%3%LsbqG4kB_!aqP!V!(i&3KP!zH%}i?vuuG z4Qa`Vf{TOEt^|cMHyf}-$kWX0NWFl1Rdd#O=9!?efNPBsSUm2}QY1Ll$d6^Cb%m&l z7Q89t^&%RWNk(+ z(l~0pP>VK^1*BF`&HJ+U;K65#zMv6m=cGFg=RVkudExHnu(K254=X-Xz~0(RZ(O7F zY}7dq-|f^xDr9v$@C`~?_9NL?lzmMx(nX5SHb;OB8G(EASU22@ISOmGQpcJDqr(+r z!WF>`YrLp3@-sDYeHSFVl|~ki6xIlJqlaM8KE9k2W!(N4xQ0+|1bkcZ_ePZju4zp? 
zz+=p{`BaY82S;6W>8@4=_xJ5XX zef@Fe=)IQvKe@1~JoL*5zaQUPbg6YthoD~qlg~y+B%b^9dT>hePK+c{}>~ri~vvq1l$TErIxpq_yg!WuGlQdF-3>Gx~gTx_RZ|VS%F}mNW>Qt3RlG?42u9BVKLu+7~HpcSjh{ zeiVG6YI4hc8=q`;I(4u0{%IX6=gnHX=}Y10rXH1{KSq2x_TBIa10NqU;?j{@-Oj1M z>(RNPN8j2jANseKXNESwLjgpiLTamg3|0H`D2=&|y7~JZueM8`67^=YF6l4#%e{O& zaHCwCqvzHQi&}NL8n|glczBYk?Q0)C>FLpZQ_~l&89daPEC25O)q%_F z%Fj)j{~QaBd-Q1kU#FPA`t`lR*9+emGjM3jes~k;AH8S%Ysx0}Ox)K^(uW*8Q}+H8 zL$CO(vvD6h@#@M}X^#c0Gj03zli+RZ@}3Ryy0&V}M+b%B-A9ibIWIII|CjckrZoI< z_L%UIJ6E)fO!DaP$X7EXBaTk!;qUiP!}P2r)4v$KD>TsW+lkNbOkTci*O{2ytj06b ze%d(UTuMvh#&O4d?|Y`W^C3mSL3Uxz&)TJJS0R=fysgNm*Yy?=#xbLIeD`ipwpG`| z^1$^MKK{nz@a>mh%znb>o0T_A`QKj~zUnkvxM+82zn98h9s6xh)^_%pRhwQ}z2Wc! zYu{=3Qt-Ur?pw3izi3*x?9s`_qv+17V&HjVw-c+!8I_sWFcIq3QC7xpdqA$Cdk0b$+lPkI0RuQMnAyk*#`UEehd^nPLL zYlj|~`?rM;cB?w_@ci`b<;9EQ$8;X#8TwVDKK;uxAMd&J*yyi!s1tI>Ux?YRS(tdf z*MQ-Dudcd$erB%U`w!<{?)g@qCfeX*&om4h81}@9jHH!6?K{=J$8UWzXl2*rtz&*U)Bn`kpx*)}?Wj6c za(elPez&T=Jkz_~eVx;rCB5IuUlZ|0lOMh|n?IPnaN%&z^e?L_{nE!(Zfcn{$|nxEkCzh z_%L;U_Q;T3ng{kpKRlz2pHVmO%(scF&y9Ve=kbllJlE>Nt(6_;#469OFPi<{mR{|9 z-`?N8*9$j7KP+oBr?S!0m#;kAUinGuS6b&+?rmH2_?!hBAK8%cw(Y(V4Y$AdiLSKx z^A`)(v^kQNZ7b;Uz|rQ56Yt9$yTxZlr^V}6sn!R3EEsX-z#Fg5-qole{IhK*TLx=m z?*Dm8!gEuK+s&DjWPEJN*@lZtn{5r7Rq*d4Kb&ZLUx&q=CdPkJmAr7{!li=?!{5z} z?$pXlGrZZbHJf&4*^UPsUC`nM&*tY|ZxwKMq5t6d0grUP(K~2ElT*R7h8}-?;HHrD zXEFwkydlP} zyIp%@lwx^V;;`9?O;=A?-=pKA{=Y6PPZ%^j>8Ep-f&!0^ysY&dT(bJNIj_w;*)aRi zzZC0^#+|(I{K?VRGqT@16x+A|Q=>x8ZyWvXnipR_7jf#R`Jee*&xvU~_4*g4CH>#; zvE^LU{^uvOuKsk);g1z#+DEASd4!(2-0%t8D{-F6oCC}K^)Z24J&sO0(q9;T=E0D$ zp1+j5Z@k^&>-cK(#81AB)&~~GZyGWrC39p(PK57?S#Rs!`mwqG#?-1eyN_D8SC_MU z%!{KoH28Y@`t3hF++$0F23=?8^!YN}?CDb;TYc){wnuzAzBgpm(Zxsh?SJLS(H(=| zoAl=HfQ*-p54-g5$e*LqBSze+`nK?!Cnvv|JiOiEsDEx=VA$P$SQ!lO<)g$CP|F2$KaQz>`=+R{hCNG_T+0yKr zjEkS2Q$PDs&&wsnA6>`_d}iRNm!sn%J_+p{eWAyQ8;Q+3oSG4D-uZm>!V{Y8*cn~2 z8=P1deWKTy!{OEYRz6Tte6*_P6UlGco(^d8}GW6GqmB+){Yz#RvXLv*9 zFO34%jeqLc%VnD)X0_0?P9XnXq#u27?c;04&wb^g 
zS#R}h`Qxhxrd+$PhyLcWpZCYV=cE7jQsr~sq$w&k%xSQ+kN4~qBR>6U+(?`8pNpUS z@@Ke#H9dmB!}FHDjlr}H{Dm%VgR1BTaqqy%FOb}#^oAcqH=lXyk#b+N`N)=_XF7lR z*rBzbd(Qm2S77hK;rCzuK5$)xukumt_7@FXUwCO>v;5evVv1I`_U%{o{G>%Yi#K0> zDn2VDW6p{Gy7l89dCJ`9`{2T7hILKv9G=xx8MDyRsLRm#OAgJ=`DOaLzAK&{{&m=j zf2#IwJ(O~*blS43ixpFMhrgE>G^LWYT9CP@dc$vhThA$s>Jak4?fX^M6SKU#zP#ta znU0HBr+Muky=BXXr_nI6=?afXOs?v3l9A16tHou`9Og>E-Enl>9SMFqxsmQSx;tWy z8$;l*J2_;cbLPZ;cS`PFk%Q8a+9+`!$(@F~5;$mdMP$smHov11nZvvh6{TQ|H+ulB z??|3B8!#9?nLyu;!`oT|!(x=tVfauHz7U7ks|JSU<|Ie<4a28W@Yb5vh_Cq$46D#v z!v+ruY&byMFW+cM*Omw%V6yhp4wS#8YQ>A=#agQpUm-N%8hwIAYs0CfFiL09(ie1# zjZv}D(Y>Sa_N*yP$SaH)7-qD^gbB9EIMPkTQC?vzrNvSjj&u`oM4yK=*^omzkW+A7 zOtI(bDW~+5^Tf$1adJv8Ii;7J(pygH9VQgvg9Q`OBHD$~yVogVf_xMfH7I~ljHej$ zYcZBZF*X}!loqxZQ#^XZ;(Q>UmUp5@t}0@6E!%0&4gPgs_4WF_tGjXai@9DyT*;oS zk#ep^3f++e&`bkUZP2zKOinb8sr$Sgf#%C*CvF`!U z6qnkch)lH7oK_ptic`Mo5UY%6^+{-ixRL;i4;dC5^8aapr`4Pe<`4XTR0!CI0v!RO zNvqaqJ+0SDr;u5c9o) zT8cMdL*p!%Euz5sSXl|BT{0`#In0C>^}$OT3Kvw+#a z0^kK;C-4E#1rqKC(2ED_fYA^_L$s0F5(qQ~f`Fy~ElvvtLV#GHCs5Xj{=I?se;Ey+ z6E#6{pe_{w!~-h83X}qGLBtz@N(ku~kd3~Y3+x6y0xE%HK!O*#J5U4}O#o~_8SpT8 zPVykx_X7?Bt)WQG!0TY36VL_t+Kt~bA3p~#D}ld(pS$pAq7pa;%tNnx7T5&511v%h zTnt2_pGN@$fI+|)z$xG|@B{EX`u_{SQuP1jz+fo53iuHHd=Jne4CVlc>X5zf#?p9(;*gZP$ug4r{gl%k@4r zuLW^yL59+O^M^%jJd+;2K_~WjQ$C-uIHom5nxP7WzZE=)eg^`5y_(Eic$*VAkcF)H@1@qY6KFK8{acOQkvRHoCyMn zl%{`(A1#CJWY#wWnU=vd8gnOpTxww_F+Bt%F14sZLga@-bsZx1;lg#&kG@_YA?imj zUj<}JDy`&F_2j>fd+ErN-#-;^z0MT%YpMeGPgRV^xy#KhXy}zTda_DeFu$3e-b%{J zHES6j;kqd2qeYbYu@tJmq^9#IMcia!tvIm~Upq0bUnRb^h{c(pq_DLR6l)jnZzssL zE#BXayUxTUQPoK-sF9r**Snxbf1;^pg6zb&2A&D}lTBUU%ubN&;`(NPqN&qE?8LZM zP7nDLP0eg&Czja?`jy%0Pc~I(YbPkQ1wo5y7swU~Itaf%&FDHepY64#a zFUHa;LwXB+-M|)%7Ut6v&I$bmzVhQYTPLO+sboHpQH)-qqGj%Jiu>5|;C_r=d}+~c z5Th4?EdDA+FEyTTlfj?s(VGwS;!mU*}Kz==DBYL{AIAX?fUf*Jj3&C zu#co9o?o}`7x4rS*@)3JXFN><;wd{7(}3k-DmoD`vvT}WDv3ef#9Wk{rZ>_g#_5=% z0Ye(i11S#=A5U*DZx2te>hA1}YBJkFvkTgILD=VD%1e)y#f1%elt&Mk>4CErB{SB` 
zXC7AvIGN6yTzN{HPCwarz)z2|&3MobUt28Q-^a^?HePySEv2h4#WtWMx@T0=d5Uzo zdN_x>>#`ANXv;utsg|QqmW3}QxKkVMLmP-wavSQROb_Otk;_Snq|l`>^Gs1r{%}Y^l_c;b?~iVF%)~7G?EvyRWWV(sG_6NkZ_f#k(4K`gwSG zdwA8!X<7h43q1_HR9Xl`Tx6_2?ssqo3F0PKmd>Wr&j`+wLe!UdQ^;p7DzR)YZ#2}$ z6D{>%XR5=oib2}smL{Ucq#L5F1(uF;8`^*u6kuX@G2fQcZyb`{UDi9t<5dcDbY5|M zwm+ZtYazH*m_2x59!Luj7cP?)2FNH0d(z~Z%QUz@VkQ9kgsAeX0NCd?CC>wfrbL82mhy=4w5qcP`4IQ zjvz#ncvOXFpNwCEye*Rq29K^H@f1*9O*%+hFjKQguFD2kjebnPUpgXgdh`xjFk*v6 zjECMEa3pSR_|mt0Xg*mNTEJDv7sSyLI$9JaE; zc@!c|%!WOpc5FKKN&K&i^c4DlyR9M~ zUk^k&g-higOvj|=IgWomm4@$^;ve+kn|>PprT-fFsa_fM{KBZznQg`VI@JXqA(p;D zY_{OkeqRih8Or6P8}jjAH1OY=@TtCheA3U1fA%l}-}qlw_KOQrE&D_xa z3V!_I&(Al%bU-|AyZ=S{ufoJZnUr8>N!x)fqD+qbD*9B^&F_@Ks^WQ zIZ)4mdJfccpq>MDaiFgGKg~xyDi2mJQML)1{{rU!5x=ZULYlW*nhtYtn$Oex7&{)8 zrMY?uM{{G9w5PZOol`JUoHIA#^*~ac`}U|HwTWbeJI7_FCXGv>|45(B(Z!i!=Up5D8q@M0;TN3L-$eMNeGbTTPnt*4b7h)C(ma!%+tV|9dR|Y@ z=xMG_^HZ9W(=&OR3KXYL~RYYztKhWQ~?MDXujGJ z2nTLsbxvpOG3}-AMe*;vbjQ925CKF2N+1e|2Jo1jFWBjceH_pW=neD%XpT*5PWu7< zfdK%~AA~*D1F?8O1tb880L3@xdNTGyfD|AV7zzvn(g3>t8=%2)rgTne+0ro%|56(* z)}$|730RIq-v_c_NzuJ*A5KB^w%={}YARSNKOIX!&8P#lRSRB-@6GBd$6al$5lxfJC%aYKHuA?+T<@A&-sF4d zJO@}M`xq7^19}+8_G9sKC2DaPc0q&wJ(L~AqP#Egdm1SN@#nt9GKIbUWHW2q-jd2U zOsZLaDq5_E^#ADAP~tz?>bvDXl|}JY+A^VaX2c0;F)?WyE#|VE~Xg`snaqDi|PvhBm;6Js=`ggbG|EJ28>HL}UiFaf_$Q;u+Mtr1x zl2v} z6CE9??wNRBQr|>%Lht0nX)+^q)<4?(eEb7ubMeSxW7wbpC|>;BMwY(7>Uv65E43K@ zXelo8iUOD2ilTEDR@^o7At42ojCA+0C=Y|YybcN literal 0 HcmV?d00001 diff --git a/licensing/LICENSE b/licensing/LICENSE new file mode 100644 index 000000000..648ec8fc3 --- /dev/null +++ b/licensing/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2012 The Broad Institute + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to 
permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. From adad76b36fb869f6c89d6c6f63387df3c2ae885b Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 17 Apr 2012 10:20:43 -0400 Subject: [PATCH 32/51] Fixing NPE in VQSR for the case of very small callsets. --- .../gatk/walkers/variantrecalibration/VariantDataManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index a2782fe34..a957bfd85 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -165,9 +165,9 @@ public class VariantDataManager { bottomPercentage = ((float) numToAdd) / ((float) data.size()); } int index = 0, numAdded = 0; - while( numAdded < numToAdd ) { + while( numAdded < numToAdd && index < data.size() ) { final VariantDatum datum = data.get(index++); - if( !datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) { + if( datum != null && !datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) { 
datum.atAntiTrainingSite = true; trainingData.add( datum ); numAdded++; From 91cb6547911b2564cd2843e787a08805c84bd83b Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Tue, 17 Apr 2012 11:45:32 -0400 Subject: [PATCH 33/51] AggregateMetrics: - By porting from jython to java now accessible to Queue via automatic extension generation. - Better handling for problematic sample names by using PicardAggregationUtils. GATKReportTable looks up keys using arrays instead of dot-separated strings, which is useful when a sample has a period in the name. CombineVariants has option to suppress the header with the command line, which is now invoked during VCF gathering. Added SelectHeaders walker for filtering headers for dbGAP submission. Generated command line for read filters now correctly prefixes the argument name as --read_filter instead of -read_filter. Latest WholeGenomePipeline. Other minor cleanup to utility methods. --- .../sting/gatk/io/stubs/VCFWriterStub.java | 50 ++-- .../sting/gatk/report/GATKReportTable.java | 59 ++--- .../walkers/variantutils/CombineVariants.java | 10 +- .../walkers/variantutils/SelectHeaders.java | 250 ++++++++++++++++++ .../gatk/GATKExtensionsGenerator.java | 3 +- .../broadinstitute/sting/utils/R/RUtils.java | 90 +++++++ .../sting/utils/SampleUtils.java | 99 +------ .../sting/utils/codecs/vcf/VCFHeader.java | 73 ++++- .../sting/utils/text/ListFileUtils.java | 176 +++++++++++- .../sting/utils/text/XReadLines.java | 136 ++++++---- .../sting/gatk/report/GATKReportUnitTest.java | 47 +++- .../sting/utils/R/RUtilsUnitTest.java | 64 +++++ .../utils/text/ListFileUtilsUnitTest.java | 77 +++++- .../qscripts/examples/ExampleReadFilter.scala | 47 ++++ .../queue/extensions/gatk/GATKIntervals.scala | 1 - .../extensions/gatk/VcfGatherFunction.scala | 3 +- .../sting/queue/pipeline/PipelineTest.scala | 2 +- .../ExampleReadFilterPipelineTest.scala | 90 +++++++ 18 files changed, 1050 insertions(+), 227 deletions(-) create mode 100755 
public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/R/RUtils.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java create mode 100644 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala create mode 100644 public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index 82cb43634..94051cc7f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,7 +12,6 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND @@ -99,8 +98,13 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Create a new stub given the requested file. + * + * @param engine engine. * @param genotypeFile file to (ultimately) create. * @param isCompressed should we compress the output stream? + * @param argumentSources sources. + * @param skipWritingHeader skip writing header. + * @param doNotWriteGenotypes do not write genotypes. 
*/ public VCFWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) { this.engine = engine; @@ -114,8 +118,13 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Create a new stub given the requested file. + * + * @param engine engine. * @param genotypeStream stream to (ultimately) write. * @param isCompressed should we compress the output stream? + * @param argumentSources sources. + * @param skipWritingHeader skip writing header. + * @param doNotWriteGenotypes do not write genotypes. */ public VCFWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, boolean isCompressed, Collection argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) { this.engine = engine; @@ -154,7 +163,7 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Gets the master sequence dictionary from the engine associated with this stub * @link GenomeAnalysisEngine.getMasterSequenceDictionary - * @return + * @return the master sequence dictionary from the engine associated with this stub */ public SAMSequenceDictionary getMasterSequenceDictionary() { return engine.getMasterSequenceDictionary(); @@ -188,22 +197,25 @@ public class VCFWriterStub implements Stub, VCFWriter { vcfHeader = header; // Check for the command-line argument header line. If not present, add it in. 
- if ( !skipWritingHeader ) { - VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine(); - boolean foundCommandLineHeaderLine = false; - for (VCFHeaderLine line: vcfHeader.getMetaData()) { - if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) ) - foundCommandLineHeaderLine = true; + if (!skipWritingHeader && header.isWriteEngineHeaders()) { + + if (header.isWriteCommandLine()) { + VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine(); + boolean foundCommandLineHeaderLine = false; + for (VCFHeaderLine line: vcfHeader.getMetaData()) { + if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) ) + foundCommandLineHeaderLine = true; + } + if ( !foundCommandLineHeaderLine ) + vcfHeader.addMetaDataLine(commandLineArgHeaderLine); } - if ( !foundCommandLineHeaderLine ) - vcfHeader.addMetaDataLine(commandLineArgHeaderLine); // also put in the reference contig header lines String assembly = getReferenceAssembly(engine.getArguments().referenceFile.getName()); for ( SAMSequenceRecord contig : engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences() ) vcfHeader.addMetaDataLine(getContigHeaderLine(contig, assembly)); - vcfHeader.addMetaDataLine(new VCFHeaderLine("reference", "file://" + engine.getArguments().referenceFile.getAbsolutePath())); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + engine.getArguments().referenceFile.getAbsolutePath())); } outputTracker.getStorage(this).writeHeader(vcfHeader); @@ -225,7 +237,7 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Gets a string representation of this object. - * @return + * @return a string representation of this object. 
*/ @Override public String toString() { @@ -247,20 +259,20 @@ public class VCFWriterStub implements Stub, VCFWriter { val = String.format("", contig.getSequenceName(), contig.getSequenceLength(), assembly); else val = String.format("", contig.getSequenceName(), contig.getSequenceLength()); - return new VCFHeaderLine("contig", val); + return new VCFHeaderLine(VCFHeader.CONTIG_KEY, val); } private String getReferenceAssembly(String refPath) { // This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot String assembly = null; - if ( refPath.indexOf("b37") != -1 || refPath.indexOf("v37") != -1 ) + if (refPath.contains("b37") || refPath.contains("v37")) assembly = "b37"; - else if ( refPath.indexOf("b36") != -1 ) + else if (refPath.contains("b36")) assembly = "b36"; - else if ( refPath.indexOf("hg18") != -1 ) + else if (refPath.contains("hg18")) assembly = "hg18"; - else if ( refPath.indexOf("hg19") != -1 ) + else if (refPath.contains("hg19")) assembly = "hg19"; return assembly; } -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 58002bd14..6551bf376 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -250,53 +250,40 @@ public class GATKReportTable { } /** - * Returns the first primary key matching the dotted column values. - * Ex: dbsnp.eval.called.all.novel.all - * - * @param dottedColumnValues Period concatenated values. + * Returns the first primary key matching the column values. + * Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all" + * @param columnValues column values. * @return The first primary key matching the column values or throws an exception. 
*/ - public Object getPrimaryKeyByData(String dottedColumnValues) { - Object key = findPrimaryKey(dottedColumnValues); + public Object getPrimaryKeyByData(Object... columnValues) { + Object key = findPrimaryKeyByData(columnValues); if (key == null) - throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + dottedColumnValues); + throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + Arrays.asList(columnValues)); return key; } - /** - * Returns true if there is at least on row with the dotted column values. - * Ex: dbsnp.eval.called.all.novel.all - * - * @param dottedColumnValues Period concatenated values. - * @return true if there is at least one row matching the columns. - */ - public boolean containsPrimaryKey(String dottedColumnValues) { - return findPrimaryKey(dottedColumnValues) != null; - } - - /** - * Returns the first primary key matching the dotted column values. - * Ex: dbsnp.eval.called.all.novel.all - * - * @param dottedColumnValues Period concatenated values. - * @return The first primary key matching the column values or null. - */ - private Object findPrimaryKey(String dottedColumnValues) { - return findPrimaryKey(dottedColumnValues.split("\\.")); - } - /** * Returns the first primary key matching the column values. - * Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" } + * Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all" * * @param columnValues column values. - * @return The first primary key matching the column values. + * @return The first primary key matching the column values or null if the key does not exist. */ - private Object findPrimaryKey(Object[] columnValues) { + public Object findPrimaryKeyByData(Object... 
columnValues) { + if (columnValues == null) + throw new NullPointerException("Column values is null"); + if (columnValues.length == 0) + throw new IllegalArgumentException("Column values is empty"); + int columnCount = columns.size(); for (Object primaryKey : primaryKeyColumn) { boolean matching = true; - for (int i = 0; matching && i < columnValues.length; i++) { - matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i + 1)); + // i --> index into columnValues parameter + // j --> index into columns collection + for (int i = 0, j = 0; matching && i < columnValues.length && j < columnCount; j++) { + if (!columns.getByIndex(j).isDisplayable()) + continue; + matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i)); + i++; } if (matching) return primaryKey; @@ -360,8 +347,8 @@ public class GATKReportTable { * output file), and the format string used to display the data. * * @param columnName the name of the column - * @param defaultValue the default value of a blank cell - * @param display if true - the column will be displayed; if false - the column will be hidden + * @param defaultValue if true - the column will be displayed; if false - the column will be hidden + * @param display display the column * @param format the format string used to display data */ public void addColumn(String columnName, Object defaultValue, boolean display, String format) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 3066b0bc6..18b8424b2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -157,6 +157,12 @@ public class CombineVariants extends RodWalker { @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at 
least N input files.", required=false) public int minimumN = 1; + /** + * This option allows the suppression of the command line in the VCF header. This is most often usefully when combining variants for dozens or hundreds of smaller VCFs. + */ + @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false) + public boolean SUPPRESS_COMMAND_LINE_HEADER = false; + @Hidden @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; @@ -183,7 +189,9 @@ public class CombineVariants extends RodWalker { Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); if ( SET_KEY != null ) headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); - vcfWriter.writeHeader(new VCFHeader(headerLines, sitesOnlyVCF ? Collections.emptySet() : samples)); + VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? 
Collections.emptySet() : samples); + vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); + vcfWriter.writeHeader(vcfHeader); if ( vcfWriter instanceof VCFWriterStub) { sitesOnlyVCF = ((VCFWriterStub)vcfWriter).doNotWriteGenotypes(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java new file mode 100755 index 000000000..714fb938e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.apache.commons.io.FilenameUtils; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; +import org.broadinstitute.sting.utils.text.ListFileUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.io.File; +import java.util.*; + +/** + * Selects headers from a VCF source. + *

+ *

+ * Often, a VCF containing many headers will need to be subset in order to facilitate certain formatting guidelines. + * SelectHeaders can be used for this purpose. Given a single VCF file, one or more headers can be extracted from the + * file (based on a complete header name or a pattern match). + *

+ *

Input

+ *

+ * A set of VCFs. + *

+ *

+ *

Output

+ *

+ * A header selected VCF. + *

+ *

+ *

Examples

+ *
+ * Select only the FILTER, FORMAT, and INFO headers:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectHeaders \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -hn FILTER \
+ *   -hn FORMAT \
+ *   -hn INFO
+ *
+ * Select only the FILTER, FORMAT, and INFO headers and add in the reference file names:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectHeaders \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -hn FILTER \
+ *   -hn FORMAT \
+ *   -hn INFO \
+ *   -irn \
+ *   -iln
+ *
+ * Select only the FILTER, FORMAT, and INFO headers, plus any headers with SnpEff:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectHeaders \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -hn FILTER \
+ *   -hn FORMAT \
+ *   -hn INFO \
+ *   -he '.*SnpEff.*'
+ * 
+ */ +@SuppressWarnings("unused") +public class SelectHeaders extends RodWalker implements TreeReducible { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Output(doc = "File to which variants should be written", required = true) + protected VCFWriter vcfWriter; + + @Argument(fullName = "header_name", shortName = "hn", doc = "Include header. Can be specified multiple times", required = false) + public Set headerNames; + + @Argument(fullName = "header_expression", shortName = "he", doc = "Regular expression to select many headers from the tracks provided. Can be specified multiple times", required = false) + public Set headerExpressions; + + /** + * Note that header exclusion takes precedence over inclusion, so that if a header is in both lists it will be excluded. + */ + @Argument(fullName = "exclude_header_name", shortName = "xl_hn", doc = "Exclude header. Can be specified multiple times", required = false) + public Set XLheaderNames; + + /** + * Note that reference inclusion takes precedence over other header matching. If set other reference lines may be excluded but the file name will still be added. + */ + @Argument(fullName = "include_reference_name", shortName = "irn", doc = "If set the reference file name minus the file extension will be added to the headers", required = false) + public boolean includeReference; + + /** + * Note that interval name inclusion takes precedence over other header matching. If set other interval lines may be excluded but the intervals will still be added. + */ + @Argument(fullName = "include_interval_names", shortName = "iln", doc = "If set the interval file name minus the file extension, or the command line intervals, will be added to the headers", required = false) + public boolean includeIntervals; + + /** + * Note that engine header inclusion takes precedence over other header matching. 
If set other engine lines may be excluded but the intervals will still be added. + */ + @Hidden // TODO: Determine if others find this valuable and either remove @Hidden or remove -ieh. + @Argument(fullName = "include_engine_headers", shortName = "ieh", doc = "If set the headers normally output by the engine will be added to the headers", required = false) + public boolean includeEngineHeaders; + + private static final ListFileUtils.StringConverter headerKey = new ListFileUtils.StringConverter() { + @Override + public String convert(VCFHeaderLine value) { + return value.getKey(); + } + }; + + /** + * Set up the VCF writer, the header expressions and regexps + */ + @Override + public void initialize() { + // Get list of samples to include in the output + List rodNames = Arrays.asList(variantCollection.variants.getName()); + + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); + + headerLines.add(new VCFHeaderLine(VCFHeader.SOURCE_KEY, "SelectHeaders")); + + // Select only the headers requested by name or expression. + headerLines = new LinkedHashSet(getSelectedHeaders(headerLines)); + + // Optionally add in the reference. + if (includeReference && getToolkit().getArguments().referenceFile != null) + headerLines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, FilenameUtils.getBaseName(getToolkit().getArguments().referenceFile.getName()))); + + // Optionally add in the intervals. 
+ if (includeIntervals && getToolkit().getArguments().intervals != null) { + for (IntervalBinding intervalBinding : getToolkit().getArguments().intervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + } + } + } + + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + VCFHeader vcfHeader = new VCFHeader(headerLines, vcfSamples); + vcfHeader.setWriteEngineHeaders(includeEngineHeaders); + vcfWriter.writeHeader(vcfHeader); + } + + private Set getSelectedHeaders(Set headerLines) { + Set selectedHeaders = new TreeSet(); + if (headerNames == null && headerExpressions == null) { + // Include everything if nothing was explicitly included. + selectedHeaders.addAll(headerLines); + } else { + // Only include the selected headers. + if (headerNames != null) + selectedHeaders.addAll(ListFileUtils.includeMatching(headerLines, headerKey, headerNames, true)); + if (headerExpressions != null) + selectedHeaders.addAll(ListFileUtils.includeMatching(headerLines, headerKey, headerExpressions, false)); + } + + // Remove any excluded headers. 
+ if (XLheaderNames != null) + selectedHeaders = ListFileUtils.excludeMatching(selectedHeaders, headerKey, XLheaderNames, true); + return selectedHeaders; + } + + /** + * Pass through the VC record + * + * @param tracker the ROD tracker + * @param ref reference information + * @param context alignment info + * @return number of records processed + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + int count = 0; + if (tracker != null) { + Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); + if (vcs != null) { + for (VariantContext vc : vcs) { + vcfWriter.add(vc); + count++; + } + } + } + return count; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } + + @Override + public void onTraversalDone(Integer result) { + logger.info(result + " records processed."); + } +} diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index a3f80af1c..dcdef5aab 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -194,6 +194,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { */ private static final List gatkPackages = Arrays.asList( "org.broadinstitute.sting.gatk", + "org.broadinstitute.sting.pipeline", "org.broadinstitute.sting.analyzecovariates", "org.broadinstitute.sting.gatk.datasources.reads.utilities"); @@ -251,7 +252,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { */ private void writeFilter(String className, List argumentFields, Set> 
dependents) throws IOException { String content = getContent(TRAIT_TEMPLATE, "org.broadinstitute.sting.queue.function.CommandLineFunction", - className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents); + className, "", false, String.format(" + \" --read_filter %s\"", className), argumentFields, dependents); writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content); } diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java b/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java new file mode 100644 index 000000000..b52eed5cf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.R; + +import org.apache.commons.lang.StringUtils; + +import java.text.SimpleDateFormat; +import java.util.Collection; +import java.util.Date; + +public class RUtils { + /** + * Converts a collection of values to an R compatible list. A null list will return NA, + * otherwise the values will be escaped with single quotes and combined with c(). + * @param list Collection of values + * @return The R representation of the list + */ + public static String toStringList(Collection list) { + if (list == null) + return "NA"; + if (list.size() == 0) + return "c()"; + return "c('" + StringUtils.join(list, "','") + "')"; + } + + /** + * Converts a collection of values to an R compatible list. A null list will return NA, + * otherwise the values will be combined with c(). + * @param list Collection of values + * @return The R representation of the list + */ + public static String toNumberList(Collection list) { + return list == null ? "NA": "c(" + StringUtils.join(list, ",") + ")"; + } + + /** + * Converts a collection of values to an R compatible list. A null list will return NA, + * otherwise the date will be escaped with single quotes and combined with c(). + * @param list Collection of values + * @return The R representation of the list + */ + public static String toDateList(Collection list) { + return toDateList(list, "''yyyy-MM-dd''"); + } + + /** + * Converts a collection of values to an R compatible list formatted by pattern. 
+ * @param list Collection of values + * @param pattern format pattern string for each date + * @return The R representation of the list + */ + public static String toDateList(Collection list, String pattern) { + + if (list == null) + return "NA"; + SimpleDateFormat format = new SimpleDateFormat(pattern); + StringBuilder sb = new StringBuilder(); + sb.append("c("); + boolean first = true; + for (Date date : list) { + if (!first) sb.append(","); + sb.append(format.format(date)); + first = false; + } + sb.append(")"); + return sb.toString(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 68b220aab..360a855fa 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -31,14 +31,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.text.ListFileUtils; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; import java.io.FileNotFoundException; import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; /** @@ -74,10 +73,10 @@ public class SampleUtils { * Same as @link getSAMFileSamples but gets all of the samples * in the SAM files loaded by the engine * - * @param engine - * @return + * @param engine engine + * @return samples */ - public final static Set getSAMFileSamples(GenomeAnalysisEngine engine) { + public static Set getSAMFileSamples(GenomeAnalysisEngine engine) { return SampleUtils.getSAMFileSamples(engine.getSAMFileHeader()); } @@ -209,89 +208,24 @@ public class SampleUtils { * we try to read a file named E from disk, and if 
possible all lines from that file are expanded * into unique sample names. * - * @param sampleArgs - * @return + * @param sampleArgs args + * @return samples */ public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { if (sampleArgs != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // sample list set, and treat the entries as if they had been specified on the command line. - Set samplesFromFiles = new HashSet(); - for (String SAMPLE_EXPRESSION : sampleArgs) { - File sampleFile = new File(SAMPLE_EXPRESSION); - - try { - XReadLines reader = new XReadLines(sampleFile); - - List lines = reader.readLines(); - for (String line : lines) { - samplesFromFiles.add(line.trim()); - } - } catch (FileNotFoundException e) { - samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample - } - } - - return samplesFromFiles; + return ListFileUtils.unpackSet(sampleArgs); } return new HashSet(); } public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { - Set samples = new HashSet(); - - if (sampleExpressions != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // sample list set, and treat the entries as if they had been specified on the command line. - Set samplesFromFiles = new HashSet(); - for (String sampleExpression : sampleExpressions) { - File sampleFile = new File(sampleExpression); - - try { - XReadLines reader = new XReadLines(sampleFile); - - List lines = reader.readLines(); - for (String line : lines) { - samplesFromFiles.add(line); - } - } catch (FileNotFoundException e) { - // ignore exception - } - } - - sampleExpressions.addAll(samplesFromFiles); - - // Let's now assume that the values in sampleExpressions are literal sample names and not regular - // expressions. 
Extract those samples specifically so we don't make the mistake of selecting more - // than what the user really wants. - Set possibleSampleRegexs = new HashSet(); - for (String sampleExpression : sampleExpressions) { - if (!(new File(sampleExpression).exists())) { - if (vcfSamples.contains(sampleExpression)) { - samples.add(sampleExpression); - } else { - possibleSampleRegexs.add(sampleExpression); - } - } - } - - // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions - for (String sampleRegex : possibleSampleRegexs) { - Pattern p = Pattern.compile(sampleRegex); - - for (String vcfSample : vcfSamples) { - Matcher m = p.matcher(vcfSample); - if (m.find()) { - samples.add(vcfSample); - } - } - } + Set samples = ListFileUtils.unpackSet(vcfSamples); + if (sampleExpressions == null) { + return samples; } else { - samples.addAll(vcfSamples); + return ListFileUtils.includeMatching(samples, sampleExpressions, false); } - - return samples; } /** @@ -304,16 +238,7 @@ public class SampleUtils { // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions Set samples = new HashSet(); if (sampleExpressions != null) { - for (String expression : sampleExpressions) { - Pattern p = Pattern.compile(expression); - - for (String originalSample : originalSamples) { - Matcher m = p.matcher(originalSample); - if (m.find()) { - samples.add(originalSample); - } - } - } + samples.addAll(ListFileUtils.includeMatching(originalSamples, sampleExpressions, false)); } return samples; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 27bab8c41..50ff3a656 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -1,5 +1,28 @@ -package 
org.broadinstitute.sting.utils.codecs.vcf; +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.util.ParsingUtils; @@ -35,6 +58,11 @@ public class VCFHeader { // the header string indicator public static final String HEADER_INDICATOR = "#"; + public static final String SOURCE_KEY = "source"; + public static final String REFERENCE_KEY = "reference"; + public static final String CONTIG_KEY = "contig"; + public static final String INTERVALS_KEY = "intervals"; + // were the input samples sorted originally (or are we sorting them)? 
private boolean samplesWereAlreadySorted = true; @@ -42,6 +70,8 @@ public class VCFHeader { protected ArrayList sampleNamesInOrder = null; protected HashMap sampleNameToOffset = null; + private boolean writeEngineHeaders = true; + private boolean writeCommandLine = true; /** * create a VCF header, given a list of meta data and auxillary tags @@ -79,6 +109,7 @@ public class VCFHeader { * using this header (i.e., read by the VCFCodec) will have genotypes * occurring in the same order * + * @param genotypeSampleNamesInAppearenceOrder genotype sample names */ protected void buildVCFReaderMaps(List genotypeSampleNamesInAppearenceOrder) { @@ -144,10 +175,7 @@ public class VCFHeader { * @return a set of the header fields, in order */ public Set getHeaderFields() { - Set fields = new LinkedHashSet(); - for (HEADER_FIELDS field : HEADER_FIELDS.values()) - fields.add(field); - return fields; + return new LinkedHashSet(Arrays.asList(HEADER_FIELDS.values())); } /** @@ -217,7 +245,36 @@ public class VCFHeader { public VCFHeaderLine getOtherHeaderLine(String key) { return mOtherMetaData.get(key); } + + /** + * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. + * @return true if additional engine headers will be written to the VCF + */ + public boolean isWriteEngineHeaders() { + return writeEngineHeaders; + } + + /** + * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. + * @param writeEngineHeaders true if additional engine headers will be written to the VCF + */ + public void setWriteEngineHeaders(boolean writeEngineHeaders) { + this.writeEngineHeaders = writeEngineHeaders; + } + + /** + * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. 
+ * @return true if the command line will be written to the VCF + */ + public boolean isWriteCommandLine() { + return writeCommandLine; + } + + /** + * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. + * @param writeCommandLine true if the command line will be written to the VCF + */ + public void setWriteCommandLine(boolean writeCommandLine) { + this.writeCommandLine = writeCommandLine; + } } - - - diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java index c146bf4d4..a3bc7a75f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java @@ -34,9 +34,9 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.io.IOException; +import java.util.*; +import java.util.regex.Pattern; /** * A collection of convenience methods for working with list files. @@ -54,6 +54,7 @@ public class ListFileUtils { * LIST_FILE_COMMENT_START are ignored. * * @param samFiles The sam files, in string format. + * @param parser Parser * @return a flattened list of the bam files provided */ public static List unpackBAMFileList(final List samFiles, final ParsingEngine parser) { @@ -63,10 +64,8 @@ public class ListFileUtils { inputFileName = expandFileName(inputFileName); if (inputFileName.toLowerCase().endsWith(".list") ) { try { - for ( String fileName : new XReadLines(new File(inputFileName), true) ) { - if ( fileName.length() > 0 && ! 
fileName.startsWith(LIST_FILE_COMMENT_START) ) { - unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName))); - } + for ( String fileName : new XReadLines(new File(inputFileName), true, LIST_FILE_COMMENT_START) ) { + unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName))); } } catch( FileNotFoundException ex ) { @@ -91,9 +90,11 @@ public class ListFileUtils { /** * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. * @param RODBindings a text equivale + * @param parser Parser * @return a list of expanded, bound RODs. */ @Deprecated + @SuppressWarnings("unused") // TODO: Who is still using this? External walkers? public static Collection unpackRODBindingsOldStyle(final Collection RODBindings, final ParsingEngine parser) { // todo -- this is a strange home for this code. Move into ROD system Collection rodBindings = new ArrayList(); @@ -112,7 +113,7 @@ public class ListFileUtils { String name = positionalTags.get(0); String type = positionalTags.get(1); - RMDTriplet.RMDStorageType storageType = null; + RMDTriplet.RMDStorageType storageType; if(tags.getValue("storage") != null) storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,tags.getValue("storage")); else if(fileName.toLowerCase().endsWith("stdin")) @@ -129,9 +130,11 @@ public class ListFileUtils { /** * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. * @param RODBindings a text equivale + * @param parser Parser * @return a list of expanded, bound RODs. */ - public static Collection unpackRODBindings(final Collection RODBindings, final ParsingEngine parser) { + @SuppressWarnings("unchecked") + public static Collection unpackRODBindings(final Collection RODBindings, @SuppressWarnings("unused") final ParsingEngine parser) { // todo -- this is a strange home for this code. 
Move into ROD system Collection rodBindings = new ArrayList(); FeatureManager builderForValidation = new FeatureManager(); @@ -142,7 +145,7 @@ public class ListFileUtils { String name = rodBinding.getName(); String type = rodBinding.getTribbleType(); - RMDTriplet.RMDStorageType storageType = null; + RMDTriplet.RMDStorageType storageType; if(rodBinding.getTags().getValue("storage") != null) storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,rodBinding.getTags().getValue("storage")); else if(fileName.toLowerCase().endsWith("stdin")) @@ -184,4 +187,157 @@ public class ListFileUtils { return "/dev/stdin"; return argument; } + + /** + * Returns a new set of values, containing a final set of values expanded from values + *

+ * Each element E of values can either be a literal string or a file ending in .list. + * For each E ending in .list we try to read a file named E from disk, and if possible + * all lines from that file are expanded into unique values. + * + * @param values Original values + * @return entries from values or the files listed in values + */ + public static Set unpackSet(Collection values) { + if (values == null) + throw new NullPointerException("values cannot be null"); + Set unpackedValues = new LinkedHashSet(); + // Let's first go through the list and see if we were given any files. + // We'll add every entry in the file to our set, and treat the entries as + // if they had been specified on the command line. + for (String value : values) { + File file = new File(value); + if (value.toLowerCase().endsWith(".list") && file.exists()) { + try { + unpackedValues.addAll(new XReadLines(file, true, LIST_FILE_COMMENT_START).readLines()); + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + } else { + unpackedValues.add(value); + } + } + return unpackedValues; + } + + /** + * Returns a new set of values including only values listed by filters + *

+ * Each element E of values can either be a literal string or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique names. + *

+ * Filters may also be a file of filters. + * + * @param values Values or files with values + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values or the files listed in values, filtered by filters + */ + public static Set includeMatching(Collection values, Collection filters, boolean exactMatch) { + return includeMatching(values, IDENTITY_STRING_CONVERTER, filters, exactMatch); + } + + /** + * Converts a type T to a String representation. + * + * @param Type to convert to a String. + */ + public static interface StringConverter { + String convert(T value); + } + + /** + * Returns a new set of values including only values matching filters + *

+ * Filters may also be a file of filters. + *

+ * The converter should convert T to a unique String for each value in the set. + * + * @param values Values or files with values + * @param converter Converts values to strings + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values including only values matching filters + */ + public static Set includeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { + if (values == null) + throw new NullPointerException("values cannot be null"); + if (converter == null) + throw new NullPointerException("converter cannot be null"); + if (filters == null) + throw new NullPointerException("filters cannot be null"); + + Set unpackedFilters = unpackSet(filters); + Set filteredValues = new LinkedHashSet(); + Collection patterns = null; + if (!exactMatch) + patterns = compilePatterns(unpackedFilters); + for (T value : values) { + String converted = converter.convert(value); + if (unpackedFilters.contains(converted)) { + filteredValues.add(value); + } else if (!exactMatch) { + for (Pattern pattern : patterns) + if (pattern.matcher(converted).find()) + filteredValues.add(value); + } + } + return filteredValues; + } + + /** + * Returns a new set of values excluding any values matching filters. + *

+ * Filters may also be a file of filters. + *

+ * The converter should convert T to a unique String for each value in the set. + * + * @param values Values or files with values + * @param converter Converts values to strings + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values exluding any values matching filters + */ + public static Set excludeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { + if (values == null) + throw new NullPointerException("values cannot be null"); + if (converter == null) + throw new NullPointerException("converter cannot be null"); + if (filters == null) + throw new NullPointerException("filters cannot be null"); + + Set unpackedFilters = unpackSet(filters); + Set filteredValues = new LinkedHashSet(); + filteredValues.addAll(values); + Collection patterns = null; + if (!exactMatch) + patterns = compilePatterns(unpackedFilters); + for (T value : values) { + String converted = converter.convert(value); + if (unpackedFilters.contains(converted)) { + filteredValues.remove(value); + } else if (!exactMatch) { + for (Pattern pattern : patterns) + if (pattern.matcher(converted).find()) + filteredValues.remove(value); + } + } + return filteredValues; + } + + private static Collection compilePatterns(Collection filters) { + Collection patterns = new ArrayList(); + for (String filter: filters) { + patterns.add(Pattern.compile(filter)); + } + return patterns; + } + + protected static final StringConverter IDENTITY_STRING_CONVERTER = new StringConverter() { + @Override + public String convert(String value) { + return value; + } + }; } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java index 49e9ddf52..b7fc1bdab 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,15 +12,14 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.utils.text; @@ -48,75 +47,92 @@ import java.util.List; * For the love of god, please use this system for reading lines in a file. 
*/ public class XReadLines implements Iterator, Iterable { - private BufferedReader in; // The stream we're reading from - private String nextline = null; // Return value of next call to next() - private boolean trimWhitespace = true; + private final BufferedReader in; // The stream we're reading from + private String nextLine = null; // Return value of next call to next() + private final boolean trimWhitespace; + private final String commentPrefix; + + public XReadLines(final File filename) throws FileNotFoundException { + this(new FileReader(filename), true, null); + } + + public XReadLines(final File filename, final boolean trimWhitespace) throws FileNotFoundException { + this(new FileReader(filename), trimWhitespace, null); + } /** * Creates a new xReadLines object to read lines from filename * - * @param filename - * @throws FileNotFoundException + * @param filename file name + * @param trimWhitespace trim whitespace + * @param commentPrefix prefix for comments or null if no prefix is set + * @throws FileNotFoundException when the file is not found */ - public XReadLines(final File filename, final boolean trimWhitespace) throws FileNotFoundException { - this(new FileReader(filename), trimWhitespace); + public XReadLines(final File filename, final boolean trimWhitespace, final String commentPrefix) throws FileNotFoundException { + this(new FileReader(filename), trimWhitespace, commentPrefix); } - public XReadLines(final File filename) throws FileNotFoundException { - this(filename, true); + public XReadLines(final InputStream inputStream) throws FileNotFoundException { + this(new InputStreamReader(inputStream), true, null); } - /** - * Creates a new xReadLines object to read lines from fileReader - * - * @param fileReader - * @throws FileNotFoundException - */ - public XReadLines(final FileReader fileReader, final boolean trimWhitespace) throws FileNotFoundException { - this(new BufferedReader(fileReader), trimWhitespace); - } - - public XReadLines(final 
FileReader fileReader) throws FileNotFoundException { - this(fileReader, true); + public XReadLines(final InputStream inputStream, final boolean trimWhitespace) { + this(new InputStreamReader(inputStream), trimWhitespace, null); } /** * Creates a new xReadLines object to read lines from an input stream * - * @param inputStream + * @param inputStream input stream + * @param trimWhitespace trim whitespace + * @param commentPrefix prefix for comments or null if no prefix is set */ - public XReadLines(final InputStream inputStream, final boolean trimWhitespace) { - this(new BufferedReader(new InputStreamReader(inputStream)), trimWhitespace); - } - - public XReadLines(final InputStream inputStream) throws FileNotFoundException { - this(inputStream, true); + public XReadLines(final InputStream inputStream, final boolean trimWhitespace, final String commentPrefix) { + this(new InputStreamReader(inputStream), trimWhitespace, commentPrefix); } /** - * Creates a new xReadLines object to read lines from an bufferedReader + * Creates a new xReadLines object to read lines from a reader * - * @param reader + * @param reader reader + */ + public XReadLines(final Reader reader) { + this(reader, true, null); + } + + /** + * Creates a new xReadLines object to read lines from an reader + * + * @param reader reader + * @param trimWhitespace trim whitespace */ public XReadLines(final Reader reader, final boolean trimWhitespace) { + this(reader, trimWhitespace, null); + } + + /** + * Creates a new xReadLines object to read lines from an bufferedReader + * + * @param reader file name + * @param trimWhitespace trim whitespace + * @param commentPrefix prefix for comments or null if no prefix is set + */ + public XReadLines(final Reader reader, final boolean trimWhitespace, final String commentPrefix) { + this.in = (reader instanceof BufferedReader) ? 
(BufferedReader)reader : new BufferedReader(reader); + this.trimWhitespace = trimWhitespace; + this.commentPrefix = commentPrefix; try { - this.in = new BufferedReader(reader); - nextline = readNextLine(); - this.trimWhitespace = trimWhitespace; + this.nextLine = readNextLine(); } catch(IOException e) { throw new IllegalArgumentException(e); } } - public XReadLines(final Reader reader) { - this(reader, true); - } - /** * Reads all of the lines in the file, and returns them as a list of strings * - * @return + * @return all of the lines in the file. */ public List readLines() { List lines = new LinkedList(); @@ -128,38 +144,48 @@ public class XReadLines implements Iterator, Iterable { /** * I'm an iterator too... - * @return + * @return an iterator */ public Iterator iterator() { return this; } public boolean hasNext() { - return nextline != null; + return this.nextLine != null; } /** - * Actually reads the next line from the stream, not accessible publically - * @return + * Actually reads the next line from the stream, not accessible publicly + * @return the next line or null + * @throws IOException if an error occurs */ private String readNextLine() throws IOException { - String nextline = in.readLine(); // Read another line - if (nextline != null && trimWhitespace ) - nextline = nextline.trim(); - return nextline; + String nextLine; + while ((nextLine = this.in.readLine()) != null) { + if (this.trimWhitespace) { + nextLine = nextLine.trim(); + if (nextLine.length() == 0) + continue; + } + if (this.commentPrefix != null) + if (nextLine.startsWith(this.commentPrefix)) + continue; + break; + } + return nextLine; } /** - * Returns the next line (minus whitespace) - * @return + * Returns the next line (optionally minus whitespace) + * @return the next line */ public String next() { try { - String result = nextline; - nextline = readNextLine(); + String result = this.nextLine; + this.nextLine = readNextLine(); // If we haven't reached EOF yet - if (nextline == null) { 
+ if (this.nextLine == null) { in.close(); // And close on EOF } diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index ec0db12d3..5759204cf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -42,13 +42,13 @@ public class GATKReportUnitTest extends BaseTest { Assert.assertEquals(report.getTables().size(), 5); GATKReportTable countVariants = report.getTable("CountVariants"); - Object countVariantsPK = countVariants.getPrimaryKeyByData("dbsnp.eval.none.all"); + Object countVariantsPK = countVariants.getPrimaryKeyByData("CountVariants", "dbsnp", "eval", "none", "all"); Assert.assertEquals(countVariants.get(countVariantsPK, "nProcessedLoci"), "63025520"); Assert.assertEquals(countVariants.get(countVariantsPK, "nNoCalls"), "0"); Assert.assertEquals(countVariants.get(countVariantsPK, "heterozygosity"), 4.73e-06); GATKReportTable validationReport = report.getTable("ValidationReport"); - Object validationReportPK = countVariants.getPrimaryKeyByData("dbsnp.eval.none.novel"); + Object validationReportPK = countVariants.getPrimaryKeyByData("CountVariants", "dbsnp", "eval", "none", "novel"); Assert.assertEquals(validationReport.get(validationReportPK, "PPV"), Double.NaN); } @@ -79,6 +79,49 @@ public class GATKReportUnitTest extends BaseTest { Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'"); } + private GATKReportTable makeBasicTable() { + GATKReport report = GATKReport.newSimpleReport("TableName", "sample", "value"); + GATKReportTable table = report.getTable("TableName"); + report.addRow("foo.1", "hello"); + report.addRow("foo.2", "world"); + return table; + } + + @Test + public void testDottedSampleName() { + GATKReportTable table = makeBasicTable(); + Object pk; + + pk = 
table.getPrimaryKeyByData("foo.1"); + Assert.assertEquals(table.get(pk, "value"), "hello"); + + pk = table.getPrimaryKeyByData("foo.2"); + Assert.assertEquals(table.get(pk, "value"), "world"); + } + + @Test + public void testFindPrimaryKeyByData() { + GATKReportTable table = makeBasicTable(); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.1")); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.1", "hello")); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.2")); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.2", "world")); + Assert.assertNull(table.findPrimaryKeyByData("list", "longer", "than", "column", "count")); + Assert.assertNull(table.findPrimaryKeyByData("short")); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testEmptyFindPrimaryKeyByData() { + GATKReportTable table = makeBasicTable(); + table.findPrimaryKeyByData(); + } + + @Test(expectedExceptions = NullPointerException.class) + public void testNullFindPrimaryKeyByData() { + GATKReportTable table = makeBasicTable(); + table.findPrimaryKeyByData((Object[]) null); + } + @Test public void testSimpleGATKReport() { // Create a new simple GATK report named "TableName" with columns: Roger, is, and Awesome diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java new file mode 100644 index 000000000..23bf074e2 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * 
Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.R; + +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class RUtilsUnitTest { + @DataProvider(name = "stringLists") + public Object[][] getStringLists() { + return new Object[][] { + new Object[] { null, "NA" }, + new Object[] { Collections.EMPTY_LIST, "c()" }, + new Object[] { Arrays.asList("1", "2", "3"), "c('1','2','3')" } + }; + } + + @Test(dataProvider = "stringLists") + public void testToStringList(List actual, String expected) { + Assert.assertEquals(RUtils.toStringList(actual), expected); + } + + @DataProvider(name = "numberLists") + public Object[][] getNumberLists() { + return new Object[][] { + new Object[] { null, "NA" }, + new Object[] { Collections.EMPTY_LIST, "c()" }, + new Object[] { Arrays.asList(1, 2, 3), "c(1,2,3)" }, + new Object[] { Arrays.asList(1D, 2D, 3D), "c(1.0,2.0,3.0)" } + }; + } + + @Test(dataProvider = "numberLists") + public void testToNumberList(List actual, String expected) { + Assert.assertEquals(RUtils.toNumberList(actual), expected); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java index f0b1de6fe..f21b4bced 100644 --- a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java @@ -28,17 +28,14 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.ParsingEngine; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.testng.Assert; -import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.List; - +import java.util.*; /** * Tests selected functionality in the CommandLineExecutable class @@ -74,6 +71,76 @@ public class ListFileUtilsUnitTest extends BaseTest { performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); } + @Test + public void testUnpackSet() throws Exception { + Set expected = new HashSet(Arrays.asList("public/testdata/exampleBAM.bam")); + Set actual; + + actual = ListFileUtils.unpackSet(Arrays.asList("public/testdata/exampleBAM.bam")); + Assert.assertEquals(actual, expected); + + File tempListFile = createTempListFile("testUnpackSet", + "#", + "public/testdata/exampleBAM.bam", + "#public/testdata/foo.bam", + " # public/testdata/bar.bam" + ); + actual = ListFileUtils.unpackSet(Arrays.asList(tempListFile.getAbsolutePath())); + Assert.assertEquals(actual, expected); + } + + @DataProvider(name="includeMatchingTests") + public Object[][] getIncludeMatchingTests() { + return new Object[][] { + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), 
false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("a", "ab") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, asSet("a", "ab", "abc") } + }; + } + + @Test(dataProvider = "includeMatchingTests") + public void testIncludeMatching(Set values, Collection filters, boolean exactMatch, Set expected) { + Set actual = ListFileUtils.includeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); + Assert.assertEquals(actual, expected); + } + + @DataProvider(name="excludeMatchingTests") + public Object[][] getExcludeMatchingTests() { + return new Object[][] { + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, Collections.EMPTY_SET }, + new Object[] { 
asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, Collections.EMPTY_SET } + }; + } + + @Test(dataProvider = "excludeMatchingTests") + public void testExcludeMatching(Set values, Collection filters, boolean exactMatch, Set expected) { + Set actual = ListFileUtils.excludeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); + Assert.assertEquals(actual, expected); + } + + private static Set asSet(T... args){ + return new HashSet(Arrays.asList(args)); + } + private File createTempListFile( String tempFilePrefix, String... 
lines ) throws Exception { File tempListFile = File.createTempFile(tempFilePrefix, ".list"); tempListFile.deleteOnExit(); diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala new file mode 100644 index 000000000..89f2f55fb --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.queue.qscripts.examples + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ + +/** + * Example script that runs the UnifiedGenotyper with the BadMate read filter applied + */ +class ExampleReadFilter extends QScript { + @Input(doc="The reference file for the bam files.", shortName="R") + var referenceFile: File = _ + + @Input(doc="Bam file to genotype.", shortName="I") + var bamFile: File = _ + + def script() { + val genotyper = new UnifiedGenotyper with BadMate + genotyper.reference_sequence = referenceFile + genotyper.memoryLimit = 2 + genotyper.input_file :+= bamFile + add(genotyper) + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index 085e0b008..2f604a809 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -49,7 +49,6 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) { else IntervalUtils.parseIntervalArguments(parser, intervals) Collections.sort(parsedLocs) - Collections.unmodifiableList(parsedLocs) val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY) Collections.unmodifiableList(mergedLocs) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 70046c913..8ac711f25 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -32,6 +32,8 @@ import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor * Merges a vcf text file. 
*/ class VcfGatherFunction extends CombineVariants with GatherFunction { + this.assumeIdenticalSamples = true + this.suppressCommandLineHeader = true private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] @@ -43,7 +45,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { this.variant = this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) } this.out = this.originalOutput - this.assumeIdenticalSamples = true // NO_HEADER and sites_only from VCFWriterArgumentTypeDescriptor // are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index 22f4f6225..9d51b01a0 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -136,7 +136,7 @@ object PipelineTest extends BaseTest with Logging { println(" value (min,target,max) table key metric") for (validation <- evalSpec.validations) { val table = report.getTable(validation.table) - val key = table.getPrimaryKeyByData(validation.key) + val key = table.getPrimaryKeyByData(validation.table +: validation.key.split('.') : _*) val value = String.valueOf(table.get(key, validation.metric)) val inRange = if (value == null) false else validation.inRange(value) val flag = if (!inRange) "*" else " " diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala new file mode 100644 index 000000000..7e5e9a93e --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2012, The Broad Institute 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.pipeline.examples + +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class ExampleReadFilterPipelineTest { + @Test + def testExampleReadFilter() { + val spec = new PipelineTestSpec + spec.name = "examplereadfilter" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -I " + BaseTest.testDir + "exampleBAM.bam").mkString + PipelineTest.executeTest(spec) + } +} From 13c800417e5a8d7054315c70b09f2b1325a310d7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 17 Apr 2012 15:51:23 -0400 Subject: [PATCH 35/51] Handle NPE in UG indel code: deletions immediately preceding insertions were not handled well in the code. --- .../sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java index b9422b6e5..e64a4f42d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -146,6 +146,10 @@ public class ConsensusAlleleCounter { String indelString = p.getEventBases(); if ( p.isBeforeInsertion() ) { + // edge case: ignore a deletion immediately preceding an insertion as p.getEventBases() returns null [EB] + if ( indelString == null ) + continue; + boolean foundKey = false; // copy of hashmap into temp arrayList ArrayList> cList = new ArrayList>(); From cf705f6c626f7c2af8854703a5e59dd93d5700cc Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 17 Apr 2012 17:00:00 -0400 Subject: [PATCH 36/51] Adding read position rank sum test to the list of annotations that get produced with 
the HaplotypeCaller --- .../annotator/BaseQualityRankSumTest.java | 2 +- .../annotator/MappingQualityRankSumTest.java | 2 +- .../gatk/walkers/annotator/RankSumTest.java | 4 ++-- .../walkers/annotator/ReadPosRankSumTest.java | 21 ++++++++++++------- .../sting/utils/pileup/PileupElement.java | 2 +- .../sting/utils/sam/AlignmentUtils.java | 14 ++++++++----- 6 files changed, 27 insertions(+), 18 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 6eea12e2b..526f25797 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -30,7 +30,7 @@ public class BaseQualityRankSumTest extends RankSumTest { } } } - protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { // TODO -- implement me; how do we pull out the correct offset from the read? 
return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 520b0f232..749278ce7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -34,7 +34,7 @@ public class MappingQualityRankSumTest extends RankSumTest { } } - protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { + protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { final boolean matchesRef = ref.equals(alleleBin.getKey()); final boolean matchesAlt = alts.contains(alleleBin.getKey()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 80d248ac2..ad9600edf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -123,7 +123,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if ( context == null ) continue; - fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), context, refQuals, altQuals); + fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), vc.getStart(), context, refQuals, altQuals); } if ( refQuals.size() == 0 || altQuals.size() == 0 ) @@ -146,7 +146,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar return map; } - protected abstract void fillQualsFromPileup(final Allele ref, final List 
alts, final Map> stratifiedContext, final List refQuals, List altQuals); + protected abstract void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, List altQuals); protected abstract void fillQualsFromPileup(final byte ref, final List alts, final ReadBackedPileup pileup, final List refQuals, final List altQuals); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index e013f0e08..9ff8886cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.*; @@ -47,11 +48,7 @@ public class ReadPosRankSumTest extends RankSumTest { } } - protected void fillQualsFromPileup(final Allele ref, final List alts, final Map> stratifiedContext, final List refQuals, List altQuals) { - // TODO -- implement me; how do we pull out the correct offset from the read? 
- return; - -/* + protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { final boolean matchesRef = ref.equals(alleleBin.getKey()); final boolean matchesAlt = alts.contains(alleleBin.getKey()); @@ -59,13 +56,21 @@ public class ReadPosRankSumTest extends RankSumTest { continue; for ( final GATKSAMRecord read : alleleBin.getValue() ) { + final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getUnclippedStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true ); + if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) + continue; + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, false, 0, 0 ); + + final int numAlignedBases = AlignmentUtils.getNumAlignedBases( read ); + if (readPos > numAlignedBases / 2) + readPos = numAlignedBases - (readPos + 1); + if ( matchesRef ) - refQuals.add((double)read.getMappingQuality()); + refQuals.add((double) readPos); else - altQuals.add((double)read.getMappingQuality()); + altQuals.add((double) readPos); } } -*/ } protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 81ba00888..e5cd9f4d5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -73,7 +73,7 @@ public class PileupElement implements Comparable { } public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) { - 
this(read,offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, -1); + this(read, offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, -1); } public boolean isDeletion() { return isDeletion; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 3b2736418..e0fee66ef 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -381,15 +381,19 @@ public class AlignmentUtils { return alignment; } - public static int calcAlignmentByteArrayOffset(final Cigar cigar, PileupElement pileup, final int alignmentStart, final int refLocus) { - int pileupOffset = pileup.getOffset(); + public static int calcAlignmentByteArrayOffset(final Cigar cigar, final PileupElement pileupElement, final int alignmentStart, final int refLocus) { + return calcAlignmentByteArrayOffset( cigar, pileupElement.getOffset(), pileupElement.isInsertionAtBeginningOfRead(), pileupElement.isDeletion(), alignmentStart, refLocus ); + } + + public static int calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isInsertionAtBeginningOfRead, final boolean isDeletion, final int alignmentStart, final int refLocus) { + int pileupOffset = offset; // Special case for reads starting with insertion - if (pileup.isInsertionAtBeginningOfRead()) + if (isInsertionAtBeginningOfRead) return 0; // Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases - if (pileup.isDeletion()) { + if (isDeletion) { pileupOffset = refLocus - alignmentStart; final CigarElement ce = cigar.getCigarElement(0); if (ce.getOperator() == CigarOperator.S) { @@ -414,7 +418,7 @@ public class AlignmentUtils { break; case D: case N: - if 
(!pileup.isDeletion()) { + if (!isDeletion) { alignmentPos += elementLength; } else { if (pos + elementLength - 1 >= pileupOffset) { From f0c81b59b05fe7e5bfc292402c575edc94a50475 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 12 Apr 2012 13:52:32 -0400 Subject: [PATCH 37/51] Implementation of the new BQSR plotting infrastructure * removed low quality bases from the recalibration report. * refactored the Datum (Recal and Accuracy) class structure * created a new plotting csv table for optimized performance with the R script * added a datum object that carries the accuracy information (AccuracyDatum) for plotting * added mean reported quality score to all covariates * added QualityScore as a covariate for plotting purposes * added unit test to the key manager to operate with one required covariate and multiple optional covariates * integrated the plotting into BQSR (automatically generates the pdf with the recalibration tearsheet) --- .../gatk/walkers/bqsr/AccuracyDatum.java | 52 +++++++++ .../gatk/walkers/bqsr/BQSRKeyManager.java | 6 +- .../gatk/walkers/bqsr/ContextCovariate.java | 3 - .../gatk/walkers/bqsr/CovariateValues.java | 6 +- .../{RecalDatumOptimized.java => Datum.java} | 57 +++------- .../sting/gatk/walkers/bqsr/EventType.java | 4 +- .../gatk/walkers/bqsr/QuantizationInfo.java | 4 +- .../gatk/walkers/bqsr/ReadCovariates.java | 6 +- .../gatk/walkers/bqsr/RecalDataManager.java | 106 ++---------------- .../sting/gatk/walkers/bqsr/RecalDatum.java | 57 ++++------ .../bqsr/RecalibrationArgumentCollection.java | 4 +- .../walkers/bqsr/RecalibrationReport.java | 13 +-- .../recalibration/BaseRecalibration.java | 3 +- .../walkers/bqsr/BQSRKeyManagerUnitTest.java | 11 ++ 14 files changed, 135 insertions(+), 197 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java rename public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/{RecalDatumOptimized.java => Datum.java} (65%) diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java new file mode 100644 index 000000000..b66a81f34 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AccuracyDatum.java @@ -0,0 +1,52 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.LinkedList; +import java.util.List; + +/** + * Short one line description of the walker. + * + *

+ * A RecalDatum that additionally carries accuracy information for the BQSR
+ * plotting infrastructure: for each combined bin it accumulates the accuracy
+ * (empirical quality minus the originally reported quality) and the reported
+ * qualities themselves, and its toString() appends the mean reported quality
+ * and mean accuracy to the datum's CSV representation for the recalibration
+ * tearsheet R script.
+ * + * @author Mauricio Carneiro + * @since 4/17/12 + */ +public class AccuracyDatum extends RecalDatum { + private final List accuracy = new LinkedList(); + private final List reportedQualities = new LinkedList(); + + public AccuracyDatum(final RecalDatum recalDatum, final byte originalQuality) { + super(recalDatum); + accuracy.add(calculateAccuracy(recalDatum, originalQuality)); + reportedQualities.add(originalQuality); + } + + public void combine(final RecalDatum recalDatum, final byte originalQuality) { + this.combine(recalDatum); + accuracy.add(calculateAccuracy(recalDatum, originalQuality)); + reportedQualities.add(originalQuality); + } + + @Override + public String toString() { + return String.format("%s,%.2f,%.2f", super.toString(), MathUtils.average(reportedQualities), MathUtils.average(accuracy)); + } + + private static double calculateAccuracy(final RecalDatum recalDatum, final byte originalQuality) { + return recalDatum.getEmpiricalQuality() - originalQuality; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java index bcbda4b20..2b48e5871 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -52,7 +52,7 @@ public class BQSRKeyManager { for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management int nBits = required.numberOfBits(); // number of bits used by this covariate BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate - this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate + this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, mask, required)); // Create an object for this required covariate 
nRequiredBits += nBits; } @@ -184,7 +184,7 @@ public class BQSRKeyManager { * @return an object array with the values for each key */ public List keySetFrom(BitSet key) { - List objectKeys = new ArrayList(); + List objectKeys = new LinkedList(); for (RequiredCovariateInfo info : requiredCovariates) { BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface @@ -286,7 +286,7 @@ public class BQSRKeyManager { public final BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits ) public final Covariate covariate; // this allows reverse lookup of the Covariates in order - RequiredCovariateInfo(int bitsBefore, int nBits, BitSet mask, Covariate covariate) { + RequiredCovariateInfo(int bitsBefore, BitSet mask, Covariate covariate) { this.bitsBefore = bitsBefore; this.mask = mask; this.covariate = covariate; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index 69461ed0e..c7c281943 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -47,9 +47,6 @@ public class ContextCovariate implements StandardCovariate { private int insertionsContextSize; private int deletionsContextSize; - private final BitSet NO_CONTEXT_BITSET = BitSetUtils.bitSetFrom(-1L); -// protected final String NO_CONTEXT_VALUE = "N"; // protected so we can UNIT TEST it - private byte LOW_QUAL_TAIL; // Initialize any member variables using the command-line arguments passed to the walkers diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateValues.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateValues.java index 00d3b650c..ebf90ebfd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateValues.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateValues.java @@ -14,9 +14,9 @@ import java.util.BitSet; * @since 2/8/12 */ public class CovariateValues { - private BitSet[] mismatches; - private BitSet[] insertions; - private BitSet[] deletions; + private final BitSet[] mismatches; + private final BitSet[] insertions; + private final BitSet[] deletions; public CovariateValues(BitSet[] mismatch, BitSet[] insertion, BitSet[] deletion) { this.mismatches = mismatch; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java similarity index 65% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java index 39807283a..b3ea88d58 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.QualityUtils; -import java.util.List; - /* * Copyright (c) 2010 The Broad Institute * @@ -38,10 +36,13 @@ import java.util.List; * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. 
*/ -public class RecalDatumOptimized { +public class Datum { + + long numObservations; // number of bases seen in total + long numMismatches; // number of bases seen that didn't match the reference + + private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero - protected long numObservations; // number of bases seen in total - protected long numMismatches; // number of bases seen that didn't match the reference //--------------------------------------------------------------------------------------------------------------- // @@ -49,67 +50,43 @@ public class RecalDatumOptimized { // //--------------------------------------------------------------------------------------------------------------- - public RecalDatumOptimized() { + public Datum() { numObservations = 0L; numMismatches = 0L; } - public RecalDatumOptimized(final long _numObservations, final long _numMismatches) { - numObservations = _numObservations; - numMismatches = _numMismatches; - } - - public RecalDatumOptimized(final RecalDatumOptimized copy) { - this.numObservations = copy.numObservations; - this.numMismatches = copy.numMismatches; - } - //--------------------------------------------------------------------------------------------------------------- // // increment methods // //--------------------------------------------------------------------------------------------------------------- - public synchronized final void increment(final long incObservations, final long incMismatches) { + synchronized void increment(final long incObservations, final long incMismatches) { numObservations += incObservations; numMismatches += incMismatches; } - public synchronized final void increment(final RecalDatumOptimized other) { - increment(other.numObservations, other.numMismatches); - } - - public synchronized final void increment(final List data) { - for (RecalDatumOptimized other : data) { - this.increment(other); - } - } - 
//--------------------------------------------------------------------------------------------------------------- // // methods to derive empirical quality score // //--------------------------------------------------------------------------------------------------------------- - public final double empiricalQualDouble(final int smoothing, final double maxQual) { - final double doubleMismatches = (double) (numMismatches + smoothing); - final double doubleObservations = (double) (numObservations + smoothing); + double empiricalQualDouble() { + final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); + final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - return Math.min(empiricalQual, maxQual); + return Math.min(empiricalQual, (double) QualityUtils.MAX_QUAL_SCORE); } - public final byte empiricalQualByte(final int smoothing) { - final double doubleMismatches = (double) (numMismatches + smoothing); - final double doubleObservations = (double) (numObservations + smoothing); - return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40 - } - - public final byte empiricalQualByte() { - return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero + byte empiricalQualByte() { + final double doubleMismatches = (double) (numMismatches); + final double doubleObservations = (double) (numObservations); + return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40 } @Override - public final String toString() { + public String toString() { return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java index 4c53dcca5..6d004edb1 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java @@ -7,8 +7,8 @@ public enum EventType { BASE_INSERTION(1, "I"), BASE_DELETION(2, "D"); - public int index; - public String representation; + public final int index; + private final String representation; private EventType(int index, String representation) { this.index = index; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java index afe847583..9c91a1874 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java @@ -19,9 +19,9 @@ import java.util.Map; public class QuantizationInfo { private List quantizedQuals; private List empiricalQualCounts; - int quantizationLevels; + private int quantizationLevels; - public QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { + private QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { this.quantizedQuals = quantizedQuals; this.empiricalQualCounts = empiricalQualCounts; this.quantizationLevels = quantizationLevels; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java index f87986b47..fc4445b22 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java @@ -13,9 +13,9 @@ import java.util.BitSet; * @since 2/8/12 */ public class ReadCovariates { - private BitSet[][] mismatchesKeySet; - private BitSet[][] insertionsKeySet; - private BitSet[][] deletionsKeySet; + private final BitSet[][] mismatchesKeySet; + private final 
BitSet[][] insertionsKeySet; + private final BitSet[][] deletionsKeySet; private int nextCovariateIndex; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index ac80e2017..cedff0a80 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -221,7 +220,7 @@ public class RecalDataManager { logger.info(""); } - public static List generateReportTables(Map> keysAndTablesMap) { + private static List generateReportTables(Map> keysAndTablesMap) { List result = new LinkedList(); int tableIndex = 0; @@ -349,7 +348,7 @@ public class RecalDataManager { * @param read The SAMRecord to parse * @return whether or not this read should be skipped */ - public static boolean checkColorSpace(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { + public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { if (ReadUtils.isSOLiDRead(read)) { // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); @@ -378,99 +377,10 @@ public class RecalDataManager { throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); else - return false; // otherwise, just skip the read + return true; // otherwise, just skip the read } } - return true; - } - - /** - * Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases - * This method doesn't add the inconsistent tag to the read like parseColorSpace does - * - * @param read The SAMRecord to parse - * @param originalQualScores The array of original quality scores to modify during the correction - * @param solidRecalMode Which mode of solid recalibration to apply - * @param refBases The reference for this read - * @return A new array of quality scores that have been ref bias corrected - */ - public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) { - - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpace; - if (attr instanceof String) { - colorSpace = ((String) attr).getBytes(); - } - else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read - byte[] readBases = 
read.getReadBases(); - final byte[] colorImpliedBases = readBases.clone(); - byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases - if (read.getReadNegativeStrandFlag()) { - readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); - refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone()); - } - final int[] inconsistency = new int[readBases.length]; - byte prevBase = colorSpace[0]; // The sentinel - for (int iii = 0; iii < readBases.length; iii++) { - final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); - colorImpliedBases[iii] = thisBase; - inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1); - prevBase = readBases[iii]; - } - - // Now that we have the inconsistency array apply the desired correction to the inconsistent bases - if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0 - final boolean setBaseN = false; - originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } - else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) { - final boolean setBaseN = true; - originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } - else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases - solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead); - } - - } - else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - - return originalQualScores; - } - - public static boolean checkNoCallColorSpace(final GATKSAMRecord read) { - if (ReadUtils.isSOLiDRead(read)) { - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpace; - if (attr instanceof String) { - colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel - } - else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - for (byte color : colorSpace) { - if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { - return true; // There is a bad color in this SOLiD read and the user wants to skip over it - } - } - - } - else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - } - - return false; // There aren't any color no calls in this SOLiD read + return false; } /** @@ -625,16 +535,16 @@ public class RecalDataManager { * @param offset The offset in the read at which to check * @return Returns true if the base was inconsistent with the color space */ - public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) { + public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); if (attr != null) { final byte[] inconsistency = (byte[]) attr; // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! if (read.getReadNegativeStrandFlag()) { // Negative direction - return inconsistency[inconsistency.length - offset - 1] != (byte) 0; + return inconsistency[inconsistency.length - offset - 1] == (byte) 0; } else { // Forward direction - return inconsistency[offset] != (byte) 0; + return inconsistency[offset] == (byte) 0; } // This block of code is for if you want to check both the offset and the next base for color space inconsistency @@ -654,7 +564,7 @@ public class RecalDataManager { } else { // No inconsistency array, so nothing is inconsistent - return false; + return true; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index 0b66bb182..d232fde81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -33,12 +33,11 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * An individual piece of recalibration data. 
Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. */ -public class RecalDatum extends RecalDatumOptimized { +public class RecalDatum extends Datum { private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) - private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero //--------------------------------------------------------------------------------------------------------------- // @@ -50,7 +49,7 @@ public class RecalDatum extends RecalDatumOptimized { numObservations = 0L; numMismatches = 0L; estimatedQReported = 0.0; - empiricalQuality = 0.0; + empiricalQuality = -1.0; } public RecalDatum(final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality) { @@ -67,60 +66,52 @@ public class RecalDatum extends RecalDatumOptimized { this.empiricalQuality = copy.empiricalQuality; } - //--------------------------------------------------------------------------------------------------------------- - // - // increment methods - // - //--------------------------------------------------------------------------------------------------------------- - - public final void combine(final RecalDatum other) { + public void combine(final RecalDatum other) { final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); this.increment(other.numObservations, other.numMismatches); this.estimatedQReported = -10 * Math.log10(sumErrors / this.numObservations); + this.empiricalQuality = -1.0; // reset the empirical quality calculation so we never have a wrongly calculated empirical quality stored } - 
//--------------------------------------------------------------------------------------------------------------- - // - // methods to derive empirical quality score - // - //--------------------------------------------------------------------------------------------------------------- - - public final void calcCombinedEmpiricalQuality(final int maxQual) { - this.empiricalQuality = empiricalQualDouble(SMOOTHING_CONSTANT, maxQual); // cache the value so we don't call log over and over again + public final void calcCombinedEmpiricalQuality() { + this.empiricalQuality = empiricalQualDouble(); // cache the value so we don't call log over and over again } public final void calcEstimatedReportedQuality() { this.estimatedQReported = -10 * Math.log10(calcExpectedErrors() / numObservations); } - //--------------------------------------------------------------------------------------------------------------- - // - // misc. methods - // - //--------------------------------------------------------------------------------------------------------------- - public final double getEstimatedQReported() { return estimatedQReported; } public final double getEmpiricalQuality() { + if (empiricalQuality < 0) + calcCombinedEmpiricalQuality(); return empiricalQuality; } - private double calcExpectedErrors() { - return (double) this.numObservations * qualToErrorProb(estimatedQReported); - } - - private double qualToErrorProb(final double qual) { - return Math.pow(10.0, qual / -10.0); - } - /** * Makes a hard copy of the recal datum element * * @return a new recal datum object with the same contents of this datum. 
*/ - protected RecalDatum copy() { + public RecalDatum copy() { return new RecalDatum(numObservations, numMismatches, estimatedQReported, empiricalQuality); } + + @Override + public String toString() { + return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality())); + } + + + private double calcExpectedErrors() { + return (double) this.numObservations * qualToErrorProb(estimatedQReported); + } + + private double qualToErrorProb(final double qual) { + return Math.pow(10.0, qual / -10.0); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index 07cb8d7a8..4a695ecb6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -30,7 +30,7 @@ import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.Utils; -import java.io.PrintStream; +import java.io.File; import java.util.Collections; import java.util.List; @@ -62,7 +62,7 @@ public class RecalibrationArgumentCollection { */ @Gather(BQSRGatherer.class) @Output - public PrintStream RECAL_FILE; + public File RECAL_FILE; /** * List all implemented covariates. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java index c434cc96b..19c04361b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java @@ -18,13 +18,11 @@ import java.util.*; */ public class RecalibrationReport { private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) - private LinkedHashMap> keysAndTablesMap; // quick access reference to the read group table and its key manager - private ArrayList requestedCovariates = new ArrayList(); // list of all covariates to be used in this calculation + private final LinkedHashMap> keysAndTablesMap; // quick access reference to the read group table and its key manager + private final ArrayList requestedCovariates = new ArrayList(); // list of all covariates to be used in this calculation - GATKReportTable argumentTable; // keep the argument table untouched just for output purposes - RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter | todo -- this should be a new parameter, not necessarily coming from the original table parameter list - - private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? 
This is a hard check."; + private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes + private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter public RecalibrationReport(final File RECAL_FILE) { GATKReport report = new GATKReport(RECAL_FILE); @@ -53,6 +51,7 @@ public class RecalibrationReport { final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES) + final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check."; if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE); table = parseReadGroupTable(keyManager, reportTable); @@ -292,7 +291,7 @@ public class RecalibrationReport { public void calculateEmpiricalAndQuantizedQualities() { for (Map table : keysAndTablesMap.values()) for (RecalDatum datum : table.values()) - datum.calcCombinedEmpiricalQuality(QualityUtils.MAX_QUAL_SCORE); + datum.calcCombinedEmpiricalQuality(); quantizationInfo = new QuantizationInfo(keysAndTablesMap, RAC.QUANTIZING_LEVELS); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 3a5b07e58..2badca44c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -51,6 +51,7 @@ public class BaseRecalibration { * Constructor using a GATK Report file * * @param 
RECAL_FILE a GATK Report file containing the recalibration information + * @param quantizationLevels number of bins to quantize the quality scores */ public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) { RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); @@ -80,7 +81,7 @@ public class BaseRecalibration { for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read byte qualityScore = originalQuals[offset]; - if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) + if (qualityScore >= QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java index 636d4ffb8..286b08a2c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java @@ -53,6 +53,17 @@ public class BQSRKeyManagerUnitTest { createReadAndTest(covariates, nRequired); } + @Test(enabled = true) + public void testOneCovariateWithOptionalCovariates() { + final int nRequired = 1; + final ArrayList covariates = new ArrayList(4); + covariates.add(new ReadGroupCovariate()); + covariates.add(new QualityScoreCovariate()); + covariates.add(new CycleCovariate()); + covariates.add(new ContextCovariate()); + createReadAndTest(covariates, nRequired); + } + private void createReadAndTest(List covariates, int 
nRequired) { int readLength = 1000; GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(ReadUtils.createRandomReadBases(readLength, true), ReadUtils.createRandomReadQuals(readLength), readLength + "M"); From 46a212d8e96aad4a5e34af93af97bcd5d5095e21 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 12 Apr 2012 14:09:18 -0400 Subject: [PATCH 38/51] Added "simplify reads" option to PrintReads. --- .../sting/gatk/walkers/PrintReadsWalker.java | 20 ++++++++++++------- .../sting/utils/sam/GATKSAMRecord.java | 3 ++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 0702b08c1..cb2944d31 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -91,7 +90,7 @@ import java.util.TreeSet; */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class PrintReadsWalker extends ReadWalker { +public class PrintReadsWalker extends ReadWalker { @Output(doc="Write output to this BAM filename instead of STDOUT") SAMFileWriter out; @@ -129,6 +128,13 @@ public class PrintReadsWalker extends ReadWalker { @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. 
Can be specified multiple times.", required=false) public Set sampleNames = new TreeSet(); + /** + * Erase all extra attributes in the read but keep the read group information + */ + @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false) + public boolean simplifyReads = false; + + private TreeSet samplesToChoose = new TreeSet(); private boolean SAMPLES_SPECIFIED = false; @@ -162,7 +168,7 @@ public class PrintReadsWalker extends ReadWalker { * The reads filter function. * * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a SAMRecord + * @param read the read itself, as a GATKSAMRecord * @return true if the read passes the filter, false if it doesn't */ public boolean filter(ReferenceContext ref, GATKSAMRecord read) { @@ -208,11 +214,11 @@ public class PrintReadsWalker extends ReadWalker { * The reads map function. * * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a SAMRecord + * @param read the read itself, as a GATKSAMRecord * @return the read itself */ - public SAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { - return read; + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { + return simplifyReads ? 
read.simplify() : read; } /** @@ -232,7 +238,7 @@ public class PrintReadsWalker extends ReadWalker { * @param output the output source * @return the SAMFileWriter, so that the next reduce can emit to the same source */ - public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { + public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) { output.addAlignment(read); return output; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 51c3715f3..7d3477a7b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -335,10 +335,11 @@ public class GATKSAMRecord extends BAMRecord { /** * Clears all attributes except ReadGroup of the read. */ - public void simplify () { + public GATKSAMRecord simplify () { GATKSAMReadGroupRecord rg = getReadGroup(); this.clearAttributes(); setReadGroup(rg); + return this; } /** From ea793d8e27712804c26c91a128d7a0508eb5e277 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 17 Apr 2012 21:21:29 -0400 Subject: [PATCH 41/51] Khalid pressured me into adding an integration test that makes sure we don't fail on reads with adjacent I and D events. 
--- .../genotyper/UnifiedGenotyperIntegrationTest.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 015f11048..4d00f6113 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -66,6 +66,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test Multiple SNP alleles", spec); } + @Test + public void testBadRead() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH -I " + validationDataLocation + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, + Arrays.asList("7678827a2ee21870a41c09d28d26b996")); + executeTest("test bad read", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing compressed output From 6d03bce0d3a04eaa5ad328861d2707041e227132 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 17 Apr 2012 22:38:18 -0400 Subject: [PATCH 42/51] Important refactoring of the VQSR recal file format: we now use a VCF instead of a CSV file. The most important reason for this change is that we no longer need to read the entire recal file into memory up front in ApplyRecalibration. For 1000G calling this was prohibitive in terms of memory requirements. Now we go through the rod system and pull in just the records we need at a given position. As an added bonus, once BCF2 is live we can drastically cut down the sizes of these recal files (which can grow large for whole genome calling). 
--- .../ApplyRecalibration.java | 47 +++++++------------ .../VariantDataManager.java | 38 +++++++++++---- .../variantrecalibration/VariantDatum.java | 6 +-- .../VariantRecalibrator.java | 9 ++-- 4 files changed, 54 insertions(+), 46 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 6b36f4e1b..fd0997802 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -37,14 +37,11 @@ import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import java.io.File; -import java.io.FileNotFoundException; import java.util.*; /** @@ -98,9 +95,9 @@ public class ApplyRecalibration extends RodWalker { @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) public List> input; @Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true) - private File RECAL_FILE; + protected RodBinding recal; @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true) - private File TRANCHES_FILE; + protected File TRANCHES_FILE; ///////////////////////////// // Outputs @@ -123,8 +120,6 @@ 
public class ApplyRecalibration extends RodWalker { ///////////////////////////// final private List tranches = new ArrayList(); final private Set inputNames = new HashSet(); - final private NestedHashMap lodMap = new NestedHashMap(); - final private NestedHashMap annotationMap = new NestedHashMap(); final private Set ignoreInputFilterSet = new TreeSet(); //--------------------------------------------------------------------------------------------------------------- @@ -174,20 +169,6 @@ public class ApplyRecalibration extends RodWalker { final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); - - try { - logger.info("Reading in recalibration table..."); - for ( final String line : new XReadLines( RECAL_FILE ) ) { - final String[] vals = line.split(","); - lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys - annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(RECAL_FILE, e); - } catch ( Exception e ) { - throw new UserException.MalformedFile(RECAL_FILE, "Could not parse LOD and annotation information in input recal file. 
File is somehow malformed."); - } - } //--------------------------------------------------------------------------------------------------------------- @@ -202,21 +183,27 @@ public class ApplyRecalibration extends RodWalker { return 1; } - for( VariantContext vc : tracker.getValues(input, context.getLocation()) ) { + for( final VariantContext vc : tracker.getValues(input, context.getLocation()) ) { if( vc != null ) { - if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { - VariantContextBuilder builder = new VariantContextBuilder(vc); - String filterString = null; - final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); - final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); - if( lod == null ) { + if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { + + final VariantContext recalDatum = tracker.getFirstValue(recal, context.getLocation()); + if( recalDatum == null ) { throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); } + final double lod = recalDatum.getAttributeAsDouble(VariantRecalibrator.VQS_LOD_KEY, Double.NEGATIVE_INFINITY); + if( lod == Double.NEGATIVE_INFINITY ) { + throw new UserException("Encountered a malformed record in the input recal file. 
There is no lod for the record at: " + vc ); + } + + VariantContextBuilder builder = new VariantContextBuilder(vc); + String filterString = null; + // Annotate the new record with its VQSLOD and the worst performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); - builder.attribute(VariantRecalibrator.CULPRIT_KEY, worstAnnotation); + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); + builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); for( int i = tranches.size() - 1; i >= 0; i-- ) { final Tranche tranche = tranches.get(i); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index a957bfd85..6f82d0885 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -30,14 +30,16 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import 
java.util.*; /** * Created by IntelliJ IDEA. @@ -285,11 +287,31 @@ public class VariantDataManager { (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); } - public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { + public void writeOutRecalibrationTable( final VCFWriter recalWriter ) { + // we need to sort in coordinate order in order to produce a valid VCF + Collections.sort( data, new Comparator() { + public int compare(VariantDatum vd1, VariantDatum vd2) { + return vd1.loc.compareTo(vd2.loc); + }} ); + + // create dummy alleles to be used + final List alleles = new ArrayList(2); + alleles.add(Allele.create("N", true)); + alleles.add(Allele.create("", false)); + + final VCFHeader vcfHeader = new VCFHeader( null, Collections.emptySet() ); + recalWriter.writeHeader(vcfHeader); + + // to be used for the important INFO tags + final HashMap attributes = new HashMap(3); + for( final VariantDatum datum : data ) { - RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", - datum.contig, datum.start, datum.stop, datum.lod, - (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"))); + attributes.put(VCFConstants.END_KEY, datum.loc.getStop()); + attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); + attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? 
annotationKeys.get(datum.worstAnnotation) : "NULL")); + + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes); + recalWriter.add(builder.make()); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index eb9e98fcb..32350f0fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; +import org.broadinstitute.sting.utils.GenomeLoc; + /** * Created by IntelliJ IDEA. * User: rpoplin @@ -46,9 +48,7 @@ public class VariantDatum implements Comparable { public double originalQual; public double prior; public int consensusCount; - public String contig; - public int start; - public int stop; + public GenomeLoc loc; public int worstAnnotation; public MultivariateGaussian assignment; // used in K-means implementation diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 3cdcf4982..58bbec7d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; import 
org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.io.Resource; @@ -136,7 +137,7 @@ public class VariantRecalibrator extends RodWalker Date: Tue, 17 Apr 2012 23:17:28 -0400 Subject: [PATCH 43/51] Minor tweaks and updated integration tests MD5s --- .../ApplyRecalibration.java | 97 +++++++++++-------- .../VariantDataManager.java | 5 +- .../VariantRecalibrator.java | 17 +++- ...ntRecalibrationWalkersIntegrationTest.java | 4 +- 4 files changed, 69 insertions(+), 54 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index fd0997802..26f881063 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -109,7 +109,7 @@ public class ApplyRecalibration extends RodWalker { // Command Line Arguments ///////////////////////////// @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) - private double TS_FILTER_LEVEL = 99.0; + protected double TS_FILTER_LEVEL = 99.0; @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false) private String[] IGNORE_INPUT_FILTERS = null; @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) 
BOTH for recalibrating both SNPs and indels simultaneously.", required = false) @@ -183,58 +183,69 @@ public class ApplyRecalibration extends RodWalker { return 1; } - for( final VariantContext vc : tracker.getValues(input, context.getLocation()) ) { - if( vc != null ) { + final List VCs = tracker.getValues(input, context.getLocation()); + final List recals = tracker.getValues(recal, context.getLocation()); - if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { + for( final VariantContext vc : VCs ) { - final VariantContext recalDatum = tracker.getFirstValue(recal, context.getLocation()); - if( recalDatum == null ) { - throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); - } + if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { - final double lod = recalDatum.getAttributeAsDouble(VariantRecalibrator.VQS_LOD_KEY, Double.NEGATIVE_INFINITY); - if( lod == Double.NEGATIVE_INFINITY ) { - throw new UserException("Encountered a malformed record in the input recal file. 
There is no lod for the record at: " + vc ); - } - - VariantContextBuilder builder = new VariantContextBuilder(vc); - String filterString = null; - - // Annotate the new record with its VQSLOD and the worst performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); - builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); - - for( int i = tranches.size() - 1; i >= 0; i-- ) { - final Tranche tranche = tranches.get(i); - if( lod >= tranche.minVQSLod ) { - if( i == tranches.size() - 1 ) { - filterString = VCFConstants.PASSES_FILTERS_v4; - } else { - filterString = tranche.name; - } - break; - } - } - - if( filterString == null ) { - filterString = tranches.get(0).name+"+"; - } - - if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { - builder.filters(filterString); - } - - vcfWriter.add( builder.make() ); - } else { // valid VC but not compatible with this mode, so just emit the variant untouched - vcfWriter.add( vc ); + final VariantContext recalDatum = getMatchingRecalVC(vc, recals); + if( recalDatum == null ) { + throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); } + + final double lod = recalDatum.getAttributeAsDouble(VariantRecalibrator.VQS_LOD_KEY, Double.NEGATIVE_INFINITY); + if( lod == Double.NEGATIVE_INFINITY ) { + throw new UserException("Encountered a malformed record in the input recal file. 
There is no lod for the record at: " + vc ); + } + + VariantContextBuilder builder = new VariantContextBuilder(vc); + String filterString = null; + + // Annotate the new record with its VQSLOD and the worst performing annotation + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); + builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); + + for( int i = tranches.size() - 1; i >= 0; i-- ) { + final Tranche tranche = tranches.get(i); + if( lod >= tranche.minVQSLod ) { + if( i == tranches.size() - 1 ) { + filterString = VCFConstants.PASSES_FILTERS_v4; + } else { + filterString = tranche.name; + } + break; + } + } + + if( filterString == null ) { + filterString = tranches.get(0).name+"+"; + } + + if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { + builder.filters(filterString); + } + + vcfWriter.add( builder.make() ); + } else { // valid VC but not compatible with this mode, so just emit the variant untouched + vcfWriter.add( vc ); } } return 1; // This value isn't used for anything } + private static VariantContext getMatchingRecalVC(final VariantContext target, final List recalVCs) { + for( final VariantContext recalVC : recalVCs ) { + if ( target.getEnd() == recalVC.getEnd() ) { + return recalVC; + } + } + + return null; + } + //--------------------------------------------------------------------------------------------------------------- // // reduce diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 6f82d0885..e2d1692d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -299,9 +299,6 @@ public class VariantDataManager { alleles.add(Allele.create("N", true)); 
alleles.add(Allele.create("", false)); - final VCFHeader vcfHeader = new VCFHeader( null, Collections.emptySet() ); - recalWriter.writeHeader(vcfHeader); - // to be used for the important INFO tags final HashMap attributes = new HashMap(3); @@ -310,7 +307,7 @@ public class VariantDataManager { attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); - VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes); + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStart(), alleles).attributes(attributes); recalWriter.add(builder.make()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 58bbec7d9..f86908dbe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -37,7 +37,8 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.io.Resource; @@ -137,9 +138,11 @@ public class VariantRecalibrator 
extends RodWalkeremptySet() ); + recalWriter = new StandardVCFWriter(recalFile, getMasterSequenceDictionary(), false); + recalWriter.writeHeader(vcfHeader); } //--------------------------------------------------------------------------------------------------------------- diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index c81891ac6..91a06bd42 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -27,7 +27,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf", "0ddd1e0e483d2eaf56004615cea23ec7", // tranches - "58780f63182e139fdbe17f6c18b5b774", // recal file + "f8e21a1987960b950db1f0d98be45352", // recal file "f67d844b6252a55452cf4167b77530b1"); // cut VCF @DataProvider(name = "VRTest") @@ -74,7 +74,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf", "6d7ee4cb651c8b666e4a4523363caaff", // tranches - "4759b111a5aa53975d46e0f22c7983bf", // recal file + "ee5b408c8434a594496118875690c438", // recal file "5d7e07d8813db96ba3f3dfe4737f83d1"); // cut VCF @DataProvider(name = "VRIndelTest") From 4448a3ea76fae50d9efcb4b27a59833ab44674ce Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 17 Apr 2012 23:54:10 -0400 Subject: [PATCH 45/51] Final tweaks. Added an integration test to cover the case of SNPs and indels that start at the same position. 
--- .../variantrecalibration/ApplyRecalibration.java | 12 +++++++++--- ...riantRecalibrationWalkersIntegrationTest.java | 16 ++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 26f881063..898401e1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -195,16 +195,22 @@ public class ApplyRecalibration extends RodWalker { throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); } - final double lod = recalDatum.getAttributeAsDouble(VariantRecalibrator.VQS_LOD_KEY, Double.NEGATIVE_INFINITY); - if( lod == Double.NEGATIVE_INFINITY ) { + final String lodString = recalDatum.getAttributeAsString(VariantRecalibrator.VQS_LOD_KEY, null); + if( lodString == null ) { throw new UserException("Encountered a malformed record in the input recal file. There is no lod for the record at: " + vc ); } + final double lod; + try { + lod = Double.valueOf(lodString); + } catch (NumberFormatException e) { + throw new UserException("Encountered a malformed record in the input recal file. 
The lod is unreadable for the record at: " + vc ); + } VariantContextBuilder builder = new VariantContextBuilder(vc); String filterString = null; // Annotate the new record with its VQSLOD and the worst performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lodString); // use the String representation so that we don't lose precision on output builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); for( int i = tranches.size() - 1; i >= 0; i-- ) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 91a06bd42..11e093a6c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -118,5 +118,21 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { Arrays.asList(params.cutVCFMD5)); executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); } + + @Test + public void testApplyRecalibrationSnpAndIndelTogether() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:1000100-1000500" + + " -mode BOTH" + + " -NO_HEADER" + + " -input " + validationDataLocation + "VQSR.mixedTest.input" + + " -o %s" + + " -tranchesFile " + validationDataLocation + "VQSR.mixedTest.tranches" + + " -recalFile " + validationDataLocation + "VQSR.mixedTest.recal", + Arrays.asList("08060b7f5c9cf3bb1692b50c58fd5a4b")); + executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); + } } From 8a844566268c55b159dbfa7016405ff08437ea3c Mon Sep 17 
00:00:00 2001 From: Ryan Poplin Date: Wed, 18 Apr 2012 11:24:04 -0400 Subject: [PATCH 46/51] Following Eric's awesome update to change the VQSR recal file into a VCF file, the ApplyRecalibration step is now scatter/gather-able and tree reducible. --- .../walkers/variantrecalibration/ApplyRecalibration.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 898401e1b..5b1d69f14 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -83,8 +84,8 @@ import java.util.*; * */ -@PartitionBy(PartitionType.NONE) -public class ApplyRecalibration extends RodWalker { +@PartitionBy(PartitionType.LOCUS) +public class ApplyRecalibration extends RodWalker implements TreeReducible { ///////////////////////////// // Inputs @@ -266,6 +267,10 @@ public class ApplyRecalibration extends RodWalker { return 1; // This value isn't used for anything } + public Integer treeReduce( final Integer lhs, final Integer rhs ) { + return 1; // This value isn't used for anything + } + public void onTraversalDone( final Integer reduceSum ) { } } From 392f1903f72938159927b49a463ddf4ba242b394 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: 
Wed, 18 Apr 2012 12:57:37 -0400 Subject: [PATCH 48/51] Handling some of the NumberFormatExceptions seen via Tableau that are really user errors. --- .../sting/utils/codecs/refseq/RefSeqCodec.java | 2 ++ .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index efcd3ecf0..cb392f29c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -73,6 +73,8 @@ public class RefSeqCodec implements ReferenceDependentFeatureCodec alleles) { if ( index.equals(VCFConstants.EMPTY_ALLELE) ) return Allele.NO_CALL; - int i = Integer.valueOf(index); + final int i; + try { + i = Integer.valueOf(index); + } catch ( NumberFormatException e ) { + throw new TribbleException.InternalCodecException("The following invalid GT allele index was encountered in the file: " + index); + } if ( i >= alleles.size() ) throw new TribbleException.InternalCodecException("The allele with index " + index + " is not defined in the REF/ALT columns in the record"); return alleles.get(i); From d3c84e7b1fe1bf819caa0be4e2f5471d5b01bae1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 18 Apr 2012 13:09:23 -0400 Subject: [PATCH 49/51] This should be a User Error since it's provided from the DoC command-line arguments --- .../sting/gatk/walkers/coverage/DepthOfCoverageStats.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java index 5ad213903..5345f9f48 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageStats.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.coverage; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.HashMap; import java.util.Map; @@ -45,8 +46,7 @@ public class DepthOfCoverageStats { public static int[] calculateBinEndpoints(int lower, int upper, int bins) { if ( bins > upper - lower || lower < 1 ) { - throw new IllegalArgumentException("Illegal argument to calculateBinEndpoints; "+ - "lower bound must be at least 1, and number of bins may not exceed stop - start"); + throw new UserException.BadInput("the start must be at least 1 and the number of bins may not exceed stop - start"); } int[] binLeftEndpoints = new int[bins+1]; From dcc4871468dfa3ef29b1eeb65bb93fa259249ba0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 18 Apr 2012 15:02:26 -0400 Subject: [PATCH 51/51] minor misc optimizations to PairHMM --- .../org/broadinstitute/sting/utils/MathUtils.java | 4 ++++ .../sting/utils/MathUtilsUnitTest.java | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 5e3160452..29d47cf3c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -106,6 +106,10 @@ public class MathUtils { return approxSum; } + public static double approximateLog10SumLog10(double a, double b, double c) { + return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); + } + public static double approximateLog10SumLog10(double small, double big) { // make sure small is really the smaller value if (small > big) { diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java 
index 5327d4cf2..04b0199d8 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -271,6 +271,19 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0, -2.5), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5, -1.1), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1, 0.5), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2, 1.3), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2, 18.1), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3); + 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2, 26.6), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1, -45.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); } @Test