From 9e5a31b5958cfb78e4da4e75f4d6b9bf06f7151d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 23 Feb 2013 17:32:19 -0500 Subject: [PATCH 001/226] Brought all of ReduceReads to fastutils -- Added unit tests to ReduceReads name compression -- Updated reduce reads walker for unit testing GSATDG-83 --- ivy.xml | 2 + .../reducereads/CompressionStash.java | 8 +- .../reducereads/HeaderElement.java | 10 +- .../reducereads/MultiSampleCompressor.java | 23 ++-- .../compression/reducereads/ReduceReads.java | 54 +++++---- .../reducereads/SingleSampleCompressor.java | 21 ++-- .../reducereads/SlidingWindow.java | 72 +++++++----- .../reducereads/SyntheticRead.java | 16 +-- .../reducereads/ReduceReadsUnitTest.java | 111 ++++++++++++++++++ .../reducereads/SlidingWindowUnitTest.java | 11 +- .../reducereads/SyntheticReadUnitTest.java | 40 +++---- 11 files changed, 244 insertions(+), 124 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java diff --git a/ivy.xml b/ivy.xml index 4bd6ad7b8..ed13af1c2 100644 --- a/ivy.xml +++ b/ivy.xml @@ -41,6 +41,8 @@ + + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java index bd7bdfe89..22ea78521 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java @@ -46,10 +46,12 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet; +import it.unimi.dsi.fastutil.objects.ObjectSortedSet; import org.broadinstitute.sting.utils.*; import java.util.Collection; -import java.util.TreeSet; + /** * A stash of regions that must be kept uncompressed in all samples @@ -61,7 
+63,7 @@ import java.util.TreeSet; * Date: 10/15/12 * Time: 4:08 PM */ -public class CompressionStash extends TreeSet { +public class CompressionStash extends ObjectAVLTreeSet { public CompressionStash() { super(); } @@ -75,7 +77,7 @@ public class CompressionStash extends TreeSet { */ @Override public boolean add(final FinishedGenomeLoc insertLoc) { - TreeSet removedLocs = new TreeSet(); + ObjectSortedSet removedLocs = new ObjectAVLTreeSet(); for (FinishedGenomeLoc existingLoc : this) { if (existingLoc.isPast(insertLoc)) { break; // if we're past the loc we're done looking for overlaps. diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 83efaa254..1cd9c1bc0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -46,10 +46,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import it.unimi.dsi.fastutil.ints.IntArrayList; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.LinkedList; /** * The element that describes the header of the sliding window. 
@@ -64,7 +64,7 @@ public class HeaderElement { private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right private int nSoftClippedBases; // How many bases in this site came from soft clipped bases private int location; // Genome location of this site (the sliding window knows which contig we're at - private LinkedList mappingQuality; // keeps the mapping quality of each read that contributed to this element (site) + private IntArrayList mappingQuality; // keeps the mapping quality of each read that contributed to this element (site) public int getLocation() { return location; @@ -85,7 +85,7 @@ public class HeaderElement { * @param location the reference location for the new element */ public HeaderElement(final int location) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new LinkedList()); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new IntArrayList()); } /** @@ -95,7 +95,7 @@ public class HeaderElement { * @param location the reference location for the new element */ public HeaderElement(final int location, final int insertionsToTheRight) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location, new LinkedList()); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location, new IntArrayList()); } /** @@ -109,7 +109,7 @@ public class HeaderElement { * @param mappingQuality the list of mapping quality values of all reads that contributed to this * HeaderElement */ - public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, LinkedList mappingQuality) { + public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, IntArrayList mappingQuality) { this.consensusBaseCounts = consensusBaseCounts; 
this.filteredBaseCounts = filteredBaseCounts; this.insertionsToTheRight = insertionsToTheRight; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index d45efeb65..2f377bee8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import it.unimi.dsi.fastutil.objects.*; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.SampleUtils; @@ -54,10 +55,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; /* * Copyright (c) 2009 The Broad Institute @@ -91,7 +88,7 @@ import java.util.TreeSet; public class MultiSampleCompressor { protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class); - protected Map compressorsPerSample = new HashMap(); + protected Object2ObjectMap compressorsPerSample = new Object2ObjectOpenHashMap(); public MultiSampleCompressor(SAMFileHeader header, final int contextSize, @@ -109,13 +106,13 @@ public class MultiSampleCompressor { } } - public Set addAlignment(GATKSAMRecord read) { + public ObjectSet addAlignment(GATKSAMRecord read) { String sampleName = read.getReadGroup().getSample(); SingleSampleCompressor compressor = compressorsPerSample.get(sampleName); if ( compressor == null ) throw new ReviewedStingException("No compressor for sample " + sampleName); - Pair, 
CompressionStash> readsAndStash = compressor.addAlignment(read); - Set reads = readsAndStash.getFirst(); + Pair, CompressionStash> readsAndStash = compressor.addAlignment(read); + ObjectSet reads = readsAndStash.getFirst(); CompressionStash regions = readsAndStash.getSecond(); reads.addAll(closeVariantRegionsInAllSamples(regions)); @@ -123,17 +120,17 @@ public class MultiSampleCompressor { return reads; } - public Set close() { - Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + public ObjectSet close() { + ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); for ( SingleSampleCompressor sample : compressorsPerSample.values() ) { - Pair, CompressionStash> readsAndStash = sample.close(); + Pair, CompressionStash> readsAndStash = sample.close(); reads = readsAndStash.getFirst(); } return reads; } - private Set closeVariantRegionsInAllSamples(CompressionStash regions) { - Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + private ObjectSet closeVariantRegionsInAllSamples(CompressionStash regions) { + ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); if (!regions.isEmpty()) { for (SingleSampleCompressor sample : compressorsPerSample.values()) { reads.addAll(sample.closeVariantRegions(regions)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 8e45f6db1..7f39452c4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -46,6 +46,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; +import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet; +import 
it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.fastutil.objects.ObjectSortedSet; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMProgramRecord; @@ -71,7 +75,6 @@ import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import java.util.*; /** * Reduces the BAM file using read based compression that keeps only essential information for variant calling @@ -107,7 +110,7 @@ import java.util.*; @PartitionBy(PartitionType.CONTIG) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=40) -public class ReduceReads extends ReadWalker, ReduceReadsStash> { +public class ReduceReads extends ReadWalker, ReduceReadsStash> { @Output private StingSAMFileWriter out = null; @@ -240,10 +243,10 @@ public class ReduceReads extends ReadWalker, ReduceRea int nCompressedReads = 0; - HashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number). + Object2LongOpenHashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number). Long nextReadNumber = 1L; // The next number to use for the compressed read name. 
- SortedSet intervalList; + ObjectSortedSet intervalList; // IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag @@ -257,8 +260,8 @@ public class ReduceReads extends ReadWalker, ReduceRea public void initialize() { super.initialize(); GenomeAnalysisEngine toolkit = getToolkit(); - readNameHash = new HashMap(); // prepare the read name hash to keep track of what reads have had their read names compressed - intervalList = new TreeSet(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode + readNameHash = new Object2LongOpenHashMap(100000); // prepare the read name hash to keep track of what reads have had their read names compressed + intervalList = new ObjectAVLTreeSet(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode if (toolkit.getIntervals() != null) intervalList.addAll(toolkit.getIntervals()); @@ -295,8 +298,8 @@ public class ReduceReads extends ReadWalker, ReduceRea * @return a linked list with all the reads produced by the clipping operations */ @Override - public LinkedList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - LinkedList mappedReads; + public ObjectArrayList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + ObjectArrayList mappedReads; if (!debugRead.isEmpty() && read.getReadName().contains(debugRead)) System.out.println("Found debug read!"); @@ -325,18 +328,18 @@ public class ReduceReads extends ReadWalker, ReduceRea if (HARD_CLIP_TO_INTERVAL) mappedReads = hardClipReadToInterval(read); // Hard clip the remainder of the read to the desired interval else { - mappedReads = new LinkedList(); + mappedReads = new ObjectArrayList(); mappedReads.add(read); } } else { 
- mappedReads = new LinkedList(); + mappedReads = new ObjectArrayList(); if (!read.isEmpty()) mappedReads.add(read); } if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) { - LinkedList tempList = new LinkedList(); + ObjectArrayList tempList = new ObjectArrayList(); for (GATKSAMRecord mRead : mappedReads) { GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual); if (!clippedRead.isEmpty()) @@ -375,7 +378,7 @@ public class ReduceReads extends ReadWalker, ReduceRea * @param stash the stash that keeps the reads in order for processing * @return the stash with all reads that have not been processed yet */ - public ReduceReadsStash reduce(LinkedList mappedReads, ReduceReadsStash stash) { + public ReduceReadsStash reduce(ObjectArrayList mappedReads, ReduceReadsStash stash) { if (debugLevel == 1) stash.print(); @@ -387,7 +390,7 @@ public class ReduceReads extends ReadWalker, ReduceRea throw new ReviewedStingException("Empty read sent to reduce, this should never happen! " + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd()); if (originalRead) { - List readsReady = new LinkedList(); + ObjectArrayList readsReady = new ObjectArrayList(); readsReady.addAll(stash.getAllReadsBefore(read)); readsReady.add(read); @@ -433,8 +436,8 @@ public class ReduceReads extends ReadWalker, ReduceRea * @param read the read to be hard clipped to the interval. 
* @return a shallow copy of the read hard clipped to the interval */ - private LinkedList hardClipReadToInterval(GATKSAMRecord read) { - LinkedList clippedReads = new LinkedList(); + private ObjectArrayList hardClipReadToInterval(GATKSAMRecord read) { + ObjectArrayList clippedReads = new ObjectArrayList(); GenomeLoc intervalOverlapped = null; // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list) @@ -588,7 +591,7 @@ public class ReduceReads extends ReadWalker, ReduceRea System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd()); if (!DONT_COMPRESS_READ_NAMES) - compressReadName(read); + nextReadNumber = compressReadName(readNameHash, read, nextReadNumber); writerToUse.addAlignment(read); } @@ -625,19 +628,20 @@ public class ReduceReads extends ReadWalker, ReduceRea * * @param read any read */ - private void compressReadName(GATKSAMRecord read) { - String name = read.getReadName(); + protected static long compressReadName(Object2LongOpenHashMap hash, GATKSAMRecord read, long nextReadNumber) { + final String name = read.getReadName(); + long result = nextReadNumber; String compressedName = read.isReducedRead() ? "C" : ""; - final Long readNumber = readNameHash.get(name); + final Long readNumber = hash.get(name); if (readNumber != null) { compressedName += readNumber.toString(); } else { - readNameHash.put(name, nextReadNumber); - compressedName += nextReadNumber.toString(); - nextReadNumber++; + hash.put(name, nextReadNumber); + compressedName += "" + nextReadNumber; + result++; } - read.setReadName(compressedName); + return result; } /** @@ -649,8 +653,8 @@ public class ReduceReads extends ReadWalker, ReduceRea * @param read the read * @return Returns true if the read is the original read that went through map(). 
*/ - private boolean isOriginalRead(LinkedList list, GATKSAMRecord read) { - return isWholeGenome() || list.getFirst().equals(read); + private boolean isOriginalRead(ObjectArrayList list, GATKSAMRecord read) { + return isWholeGenome() || list.get(0).equals(read); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index b4de1f0cb..42db83c04 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -46,14 +46,11 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import it.unimi.dsi.fastutil.objects.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.Collections; -import java.util.Set; -import java.util.TreeSet; - /** * * @author carneiro, depristo @@ -72,7 +69,7 @@ public class SingleSampleCompressor { private SlidingWindow slidingWindow; private int slidingWindowCounter; - public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new TreeSet(), new CompressionStash()); + public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new ObjectAVLTreeSet(), new CompressionStash()); public SingleSampleCompressor(final int contextSize, final int downsampleCoverage, @@ -93,8 +90,8 @@ public class SingleSampleCompressor { this.allowPolyploidReduction = allowPolyploidReduction; } - public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) { - Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) { + ObjectSet 
reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); CompressionStash stash = new CompressionStash(); int readOriginalStart = read.getUnclippedStart(); @@ -104,7 +101,7 @@ public class SingleSampleCompressor { (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window // close the current sliding window - Pair, CompressionStash> readsAndStash = slidingWindow.close(); + Pair, CompressionStash> readsAndStash = slidingWindow.close(); reads = readsAndStash.getFirst(); stash = readsAndStash.getSecond(); slidingWindow = null; // so we create a new one on the next if @@ -116,15 +113,15 @@ public class SingleSampleCompressor { } stash.addAll(slidingWindow.addRead(read)); - return new Pair, CompressionStash>(reads, stash); + return new Pair, CompressionStash>(reads, stash); } - public Pair, CompressionStash> close() { + public Pair, CompressionStash> close() { return (slidingWindow != null) ? slidingWindow.close() : emptyPair; } - public Set closeVariantRegions(CompressionStash regions) { - return slidingWindow == null ? Collections.emptySet() : slidingWindow.closeVariantRegions(regions); + public ObjectSet closeVariantRegions(CompressionStash regions) { + return slidingWindow == null ? 
ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 680489042..7124b4772 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -48,6 +48,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import it.unimi.dsi.fastutil.bytes.Byte2IntArrayMap; +import it.unimi.dsi.fastutil.bytes.Byte2IntMap; +import it.unimi.dsi.fastutil.bytes.Byte2IntOpenHashMap; +import it.unimi.dsi.fastutil.objects.*; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; @@ -62,7 +66,11 @@ import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import java.util.*; +import java.util.Comparator; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.ListIterator; + /** * Created by IntelliJ IDEA. 
@@ -73,7 +81,7 @@ import java.util.*; public class SlidingWindow { // Sliding Window data - final private TreeSet readsInWindow; + final private ObjectAVLTreeSet readsInWindow; final private LinkedList windowHeader; protected int contextSize; // the largest context size (between mismatches and indels) protected String contig; @@ -144,7 +152,7 @@ public class SlidingWindow { this.windowHeader = new LinkedList(); windowHeader.addFirst(new HeaderElement(startLocation)); - this.readsInWindow = new TreeSet(); + this.readsInWindow = new ObjectAVLTreeSet(); } public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, boolean allowPolyploidReduction) { @@ -157,7 +165,7 @@ public class SlidingWindow { this.MIN_MAPPING_QUALITY = minMappingQuality; this.windowHeader = new LinkedList(); - this.readsInWindow = new TreeSet(new Comparator() { + this.readsInWindow = new ObjectAVLTreeSet(new Comparator() { @Override public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { final int difference = read1.getSoftEnd() - read2.getSoftEnd(); @@ -287,7 +295,7 @@ public class SlidingWindow { } while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { - readsInWindow.pollFirst(); + readsInWindow.remove(readsInWindow.first()); } return regions; @@ -401,8 +409,8 @@ public class SlidingWindow { */ @Requires({"start >= 0 && (end >= start || end == 0)"}) @Ensures("result != null") - protected List addToSyntheticReads(LinkedList header, int start, int end, boolean isNegativeStrand) { - LinkedList reads = new LinkedList(); + protected ObjectArrayList addToSyntheticReads(LinkedList header, int start, int end, boolean 
isNegativeStrand) { + ObjectArrayList reads = new ObjectArrayList(); if (start < end) { ListIterator headerElementIterator = header.listIterator(start); @@ -454,9 +462,9 @@ public class SlidingWindow { * @param type the synthetic reads you want to close * @return a possibly null list of GATKSAMRecords generated by finalizing the synthetic reads */ - private List finalizeAndAdd(ConsensusType type) { + private ObjectArrayList finalizeAndAdd(ConsensusType type) { GATKSAMRecord read = null; - List list = new LinkedList(); + ObjectArrayList list = new ObjectArrayList(); switch (type) { case CONSENSUS: @@ -556,8 +564,8 @@ public class SlidingWindow { */ @Requires({"start >= 0 && (end >= start || end == 0)"}) @Ensures("result != null") - private List addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) { - List result = new ArrayList(0); + private ObjectArrayList addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) { + ObjectArrayList result = new ObjectArrayList(); if (filteredDataConsensus == null) filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand); @@ -640,8 +648,8 @@ public class SlidingWindow { */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected List compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { - List allReads = new LinkedList(); + protected ObjectList compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { + ObjectList allReads = new ObjectArrayList(); // Try to compress into a polyploid consensus int nVariantPositions = 0; @@ -685,7 +693,7 @@ public class SlidingWindow { final int refStart = windowHeader.get(start).getLocation(); 
final int refStop = windowHeader.get(stop).getLocation(); - LinkedList toRemove = new LinkedList(); + ObjectList toRemove = new ObjectArrayList(); for (GATKSAMRecord read : readsInWindow) { if (read.getSoftStart() <= refStop) { if (read.getAlignmentEnd() >= refStart) { @@ -710,24 +718,24 @@ public class SlidingWindow { */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected List closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { - List allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition); + protected ObjectList closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { + ObjectList allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition); - List result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads; + ObjectList result = (downsampleCoverage > 0) ? 
downsampleVariantRegion(allReads) : allReads; result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); return result; // finalized reads will be downsampled if necessary } - public Set closeVariantRegions(CompressionStash regions) { - TreeSet allReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + public ObjectSet closeVariantRegions(CompressionStash regions) { + ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); if (!regions.isEmpty()) { int lastStop = -1; int windowHeaderStart = getStartLocation(windowHeader); for (GenomeLoc region : regions) { - if (((FinishedGenomeLoc)region).isFinished() && region.getContig() == contig && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) { + if (((FinishedGenomeLoc)region).isFinished() && region.getContig().equals(contig) && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) { int start = region.getStart() - windowHeaderStart; int stop = region.getStop() - windowHeaderStart; @@ -759,7 +767,7 @@ public class SlidingWindow { */ @Requires({"allReads != null"}) @Ensures("result != null") - protected List downsampleVariantRegion(final List allReads) { + protected ObjectList downsampleVariantRegion(final ObjectList allReads) { int nReads = allReads.size(); if (nReads == 0) return allReads; @@ -769,7 +777,7 @@ public class SlidingWindow { ReservoirDownsampler downsampler = new ReservoirDownsampler(downsampleCoverage); downsampler.submit(allReads); - return downsampler.consumeFinalizedItems(); + return new ObjectArrayList(downsampler.consumeFinalizedItems()); } @@ -781,9 +789,9 @@ public class SlidingWindow { * @return A non-null set/list of all reads generated */ @Ensures("result != null") - public Pair, CompressionStash> close() { + public Pair, CompressionStash> close() { // mark variant regions - Set finalizedReads = 
new TreeSet(new AlignmentStartWithNoTiesComparator()); + ObjectSet finalizedReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); CompressionStash regions = new CompressionStash(); boolean forceCloseUnfinishedRegions = true; @@ -798,7 +806,7 @@ public class SlidingWindow { } } - return new Pair, CompressionStash>(finalizedReads, regions); + return new Pair, CompressionStash>(finalizedReads, regions); } /** @@ -847,16 +855,16 @@ public class SlidingWindow { */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - private List createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) { + private ObjectList createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) { // we will create two (positive strand, negative strand) headers for each contig - List> headersPosStrand = new ArrayList>(); - List> headersNegStrand = new ArrayList>(); - List hetReads = new LinkedList(); - Map haplotypeHeaderMap = new HashMap(2); + ObjectList> headersPosStrand = new ObjectArrayList>(); + ObjectList> headersNegStrand = new ObjectArrayList>(); + ObjectList hetReads = new ObjectArrayList(); + Byte2IntMap haplotypeHeaderMap = new Byte2IntArrayMap(2); int currentHaplotype = 0; int refStart = windowHeader.get(start).getLocation(); int refStop = windowHeader.get(stop).getLocation(); - List toRemove = new LinkedList(); + ObjectList toRemove = new ObjectArrayList(); for (GATKSAMRecord read : readsInWindow) { int haplotype; @@ -1031,7 +1039,7 @@ public class SlidingWindow { } } - private void removeReadsFromWindow (List readsToRemove) { + private void removeReadsFromWindow (ObjectList readsToRemove) { for (GATKSAMRecord read : readsToRemove) { readsInWindow.remove(read); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java index 
631e099a9..72fd52ebe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -47,6 +47,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import com.google.java.contract.Requires; +import it.unimi.dsi.fastutil.bytes.ByteArrayList; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; @@ -57,10 +59,8 @@ import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.ArrayList; import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; + /** * Running Consensus is a read that is compressed as a sliding window travels over the reads @@ -123,7 +123,7 @@ public class SyntheticRead { } - private final List basesCountsQuals; + private final ObjectArrayList basesCountsQuals; private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus private String readTag; @@ -151,7 +151,7 @@ public class SyntheticRead { */ public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) { final int initialCapacity = 10000; - basesCountsQuals = new ArrayList(initialCapacity); + basesCountsQuals = new ObjectArrayList(initialCapacity); mappingQuality = 0.0; this.readTag = readTag; @@ -165,8 +165,8 @@ public class SyntheticRead { this.isNegativeStrand = isNegativeRead; } - public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List deletionQuals, double mappingQuality, String readTag, SAMFileHeader 
header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { - basesCountsQuals = new ArrayList(bases.size()); + public SyntheticRead(ObjectArrayList bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { + basesCountsQuals = new ObjectArrayList(bases.size()); for (int i = 0; i < bases.size(); ++i) { basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i))); } @@ -316,7 +316,7 @@ public class SyntheticRead { * @return the cigar string for the synthetic read */ private Cigar buildCigar() { - LinkedList cigarElements = new LinkedList(); + ObjectArrayList cigarElements = new ObjectArrayList(); CigarOperator cigarOperator = null; int length = 0; for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java new file mode 100644 index 000000000..b9399bb1b --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java @@ -0,0 +1,111 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Random; + + +public class ReduceReadsUnitTest extends BaseTest { + + Random random = new Random(987743); + Object2LongOpenHashMap hash = new Object2LongOpenHashMap(); + long nextNumber = 0L; + + /** + * Combinatorial unit test data provider example. 
+ * + * Creates data for testMyData test function, containing two arguments, start and size at each value + * + * @return Object[][] for testng DataProvider + */ + @DataProvider(name = "ReadNameProvider") + public Object[][] readNameProvider() { + final int readNameLength = 4; + final int nReads = 100000; + final int charVariety = 20; + ObjectArrayList tests = new ObjectArrayList(); + ObjectOpenHashSet truthSet = new ObjectOpenHashSet(); + byte[] bytes = new byte[readNameLength]; + for ( int i = 0; i basicReads = new ArrayList(20); + private final ObjectList basicReads = new ObjectArrayList(20); private IndexedFastaSequenceFile seq; private SAMFileHeader header; @@ -364,7 +367,7 @@ public class SlidingWindowUnitTest extends BaseTest { SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(); + Pair, CompressionStash> result = slidingWindow.close(); Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads); @@ -403,7 +406,7 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(dataProvider = "Downsampling", enabled = true) public void testDownsamplingTest(DSTest test) { final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, test.dcov, ReduceReads.DownsampleStrategy.Normal, false, false); - final List result = slidingWindow.downsampleVariantRegion(basicReads); + final ObjectList result = slidingWindow.downsampleVariantRegion(basicReads); Assert.assertEquals(result.size(), Math.min(test.dcov, basicReads.size())); } @@ -453,7 +456,7 @@ public class SlidingWindowUnitTest extends BaseTest { final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 
0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); - final Pair, CompressionStash> result = slidingWindow.close(); + final Pair, CompressionStash> result = slidingWindow.close(); Assert.assertEquals(result.getFirst().size(), 1); final GATKSAMRecord read = result.getFirst().iterator().next(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java index c94130d18..1ed28dec2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import it.unimi.dsi.fastutil.bytes.ByteArrayList; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; @@ -54,9 +56,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.Test; -import java.util.Arrays; -import java.util.Random; - public class SyntheticReadUnitTest extends BaseTest { final SAMFileHeader artificialSAMHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1); final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic"); @@ -66,35 +65,32 @@ public class SyntheticReadUnitTest extends BaseTest { final int artificialRefStart = 1; final double artificialMappingQuality = 60; - final Random random = new Random(8854875); - - @Test public void testBaseCounts() { BaseIndex [] bases = new BaseIndex[] 
{BaseIndex.A,BaseIndex.A,BaseIndex.A,BaseIndex.A}; - Byte[] quals = new Byte[] {20, 20, 20, 20 }; + byte[] quals = new byte[] {20, 20, 20, 20 }; TestRead [] testReads = new TestRead [] { - new TestRead(bases, quals, new Byte[] {100, 100, 100, 101}, new byte [] {100, 0, 0, 1}), - new TestRead(bases, quals, new Byte[] {1, 100, 100, 0}, new byte [] {1, 99, 99, -1}), - new TestRead(bases, quals, new Byte[] {127, 100, 0, 1}, new byte [] {127, -27, -127, -126}), - new TestRead(bases, quals, new Byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})}; + new TestRead(bases, quals, new byte[] {100, 100, 100, 101}, new byte [] {100, 0, 0, 1}), + new TestRead(bases, quals, new byte[] {1, 100, 100, 0}, new byte [] {1, 99, 99, -1}), + new TestRead(bases, quals, new byte[] {127, 100, 0, 1}, new byte [] {127, -27, -127, -126}), + new TestRead(bases, quals, new byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})}; for (TestRead testRead : testReads) { - SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false); + SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false); Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts()); } } private class TestRead { BaseIndex[] bases; - 
Byte[] quals; - Byte[] insQuals; - Byte[] delQuals; - Byte[] counts; - byte [] expectedCounts; + byte[] quals; + byte[] insQuals; + byte[] delQuals; + byte[] counts; + byte[] expectedCounts; - private TestRead(BaseIndex[] bases, Byte[] quals, Byte[] counts, byte[] expectedCounts) { + private TestRead(BaseIndex[] bases, byte[] quals, byte[] counts, byte[] expectedCounts) { this.bases = bases; this.quals = quals; this.insQuals = quals; @@ -107,19 +103,19 @@ private class TestRead { return bases; } - public Byte[] getQuals() { + public byte[] getQuals() { return quals; } - public Byte[] getInsQuals() { + public byte[] getInsQuals() { return insQuals; } - public Byte[] getDelQuals() { + public byte[] getDelQuals() { return delQuals; } - public Byte[] getCounts() { + public byte[] getCounts() { return counts; } From 0ff3343282b1f48bddf8ffaf0ff8e8993c395c01 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 25 Feb 2013 13:33:47 -0500 Subject: [PATCH 002/226] Addressing Eric's comments -- added @param docs to the new variables -- made all variables final -- switched to string builder instead of String for performance. GSATDG-83 --- .../compression/reducereads/ReduceReads.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 7f39452c4..e89158412 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -626,21 +626,27 @@ public class ReduceReads extends ReadWalker, Redu * Compresses the read name using the readNameHash if we have already compressed * this read name before. 
* - * @param read any read + * @param hash the hash table containing the read name to compressed read name map + * @param read any read + * @param nextReadNumber the number to use in the compressed read name in case this is a new read name + * @return the next number to use in the compressed read name */ - protected static long compressReadName(Object2LongOpenHashMap hash, GATKSAMRecord read, long nextReadNumber) { + protected static long compressReadName(final Object2LongOpenHashMap hash, final GATKSAMRecord read, final long nextReadNumber) { final String name = read.getReadName(); + final StringBuilder compressedName = new StringBuilder(); long result = nextReadNumber; - String compressedName = read.isReducedRead() ? "C" : ""; + if (read.isReducedRead()) { + compressedName.append("C"); + } final Long readNumber = hash.get(name); if (readNumber != null) { - compressedName += readNumber.toString(); + compressedName.append(readNumber); } else { hash.put(name, nextReadNumber); - compressedName += "" + nextReadNumber; + compressedName.append(nextReadNumber); result++; } - read.setReadName(compressedName); + read.setReadName(compressedName.toString()); return result; } From 7519484a386d2defc0a0c143e76dceac299ff4c3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 25 Feb 2013 12:24:35 -0500 Subject: [PATCH 003/226] Refactored PairHMM.initialize to first take haplotype max length and then the read max length so that it is consistent with other PairHMM methods. 
--- .../LikelihoodCalculationEngine.java | 2 +- .../indels/PairHMMIndelErrorModel.java | 2 +- .../utils/pairhmm/LoglessCachingPairHMM.java | 4 ++-- .../sting/utils/pairhmm/PairHMMUnitTest.java | 24 +++++++++---------- .../sting/utils/pairhmm/Log10PairHMM.java | 4 ++-- .../sting/utils/pairhmm/PairHMM.java | 4 ++-- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index c3e7276a6..aeeb95c87 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -109,7 +109,7 @@ public class LikelihoodCalculationEngine { Y_METRIC_LENGTH += 2; // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases - pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); + pairHMM.initialize(Y_METRIC_LENGTH, X_METRIC_LENGTH); // for each sample's reads for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index f5f4b9aeb..041089c62 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -385,7 +385,7 @@ public class PairHMMIndelErrorModel { if (previousHaplotypeSeen == null) { //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); + pairHMM.initialize(Y_METRIC_LENGTH, X_METRIC_LENGTH); } int startIndexInHaplotype = 0; diff 
--git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java index 6f8bec94f..24d6e1220 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java @@ -78,8 +78,8 @@ public class LoglessCachingPairHMM extends PairHMM { * {@inheritDoc} */ @Override - public void initialize( final int readMaxLength, final int haplotypeMaxLength) { - super.initialize(readMaxLength, haplotypeMaxLength); + public void initialize( final int haplotypeMaxLength, final int readMaxLength) { + super.initialize(haplotypeMaxLength, readMaxLength); constantMatrix = new double[X_METRIC_MAX_LENGTH][6]; distanceMatrix = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 9de562aa5..64819c245 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -136,7 +136,7 @@ public class PairHMMUnitTest extends BaseTest { } public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) { - pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length); + pairHMM.initialize(refBasesWithContext.length, readBasesWithContext.length); return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( refBasesWithContext, readBasesWithContext, qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), @@ -262,7 +262,7 @@ public class PairHMMUnitTest extends BaseTest { double expectedLogL = cfg.expectedLogL(hmm); // compare to our theoretical expectation with appropriate tolerance - 
Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm); + Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm + (hmm instanceof Log10PairHMM ? " (" + ((Log10PairHMM)hmm).isDoingExactLog10Calculations() + ")" : "")); // compare to the exact reference implementation with appropriate tolerance Assert.assertEquals(actualLogL, exactLogL, cfg.getTolerance(hmm), "Failed with hmm " + hmm); Assert.assertTrue(MathUtils.goodLog10Probability(actualLogL), "Bad log10 likelihood " + actualLogL); @@ -303,7 +303,7 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - originalHMM.initialize(mread.length, haplotype1.length); + originalHMM.initialize(haplotype1.length, mread.length); double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, @@ -335,7 +335,7 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? 
(byte)'T' : (byte)'C'); - originalHMM.initialize(mread.length, haplotype1.length); + originalHMM.initialize(haplotype1.length, mread.length); double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, @@ -372,7 +372,7 @@ public class PairHMMUnitTest extends BaseTest { byte insQual = 37; byte delQual = 37; byte gcp = 10; - hmm.initialize(readBases.length, refBases.length); + hmm.initialize(refBases.length, readBases.length); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), @@ -389,7 +389,7 @@ public class PairHMMUnitTest extends BaseTest { byte insQual = 100; byte delQual = 100; byte gcp = 100; - hmm.initialize(readBases.length, refBases.length); + hmm.initialize(refBases.length, readBases.length); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), @@ -429,7 +429,7 @@ public class PairHMMUnitTest extends BaseTest { byte insQual = 40; byte delQual = 40; byte gcp = 10; - hmm.initialize(readBases.length, refBases.length); + hmm.initialize(refBases.length, readBases.length); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), @@ -447,7 +447,7 @@ public class PairHMMUnitTest extends BaseTest { byte delQual = 40; byte gcp = 10; - exactHMM.initialize(readBases.length, refBases.length); + exactHMM.initialize(refBases.length, readBases.length); double d = exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), @@ -455,7 +455,7 @@ public class PairHMMUnitTest extends BaseTest { Utils.dupBytes(gcp, readBases.length), 0, true); //exactHMM.dumpMatrices(); - loglessHMM.initialize(readBases.length, 
refBases.length); + loglessHMM.initialize(refBases.length, readBases.length); double logless = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), @@ -489,7 +489,7 @@ public class PairHMMUnitTest extends BaseTest { final int maxHaplotypeLength = refBases.length + nExtraMaxSize; final int maxReadLength = readBases.length + nExtraMaxSize; - hmm.initialize(maxReadLength, maxHaplotypeLength); + hmm.initialize(maxHaplotypeLength, maxReadLength); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, quals, insQual, @@ -535,7 +535,7 @@ public class PairHMMUnitTest extends BaseTest { final int maxHaplotypeLength = prefix.length() + root1.length(); // the initialization occurs once, at the start of the evalution of reads - hmm.initialize(maxReadLength, maxHaplotypeLength); + hmm.initialize(maxHaplotypeLength, maxReadLength); for ( int prefixStart = prefix.length(); prefixStart >= 0; prefixStart-- ) { final String myPrefix = prefix.substring(prefixStart, prefix.length()); @@ -633,7 +633,7 @@ public class PairHMMUnitTest extends BaseTest { byte[] refBases = "AAAT".getBytes(); byte[] baseQuals = Utils.dupBytes((byte)30, readBases.length); - hmm.initialize(2, 3); + hmm.initialize(3, 2); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, baseQuals, baseQuals, baseQuals, baseQuals, 0, true); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java index c9d364aac..62793bc54 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -64,8 +64,8 @@ public class Log10PairHMM extends PairHMM { * {@inheritDoc} */ @Override - public void initialize( final int readMaxLength, final int haplotypeMaxLength) { - 
super.initialize(readMaxLength, haplotypeMaxLength); + public void initialize( final int haplotypeMaxLength, final int readMaxLength) { + super.initialize(haplotypeMaxLength, readMaxLength); for( int iii=0; iii < X_METRIC_MAX_LENGTH; iii++ ) { Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index f898faaf3..e590d1df8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -61,10 +61,10 @@ public abstract class PairHMM { /** * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths - * @param readMaxLength the max length of reads we want to use with this PairHMM * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM + * @param readMaxLength the max length of reads we want to use with this PairHMM */ - public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + public void initialize( final int haplotypeMaxLength, final int readMaxLength ) { if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); From 396b7e093307a21008b80645ee504f8b2d7d600b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 25 Feb 2013 14:58:17 -0500 Subject: [PATCH 004/226] Fixed the intermittent PairHMM unit test failure. 
The issue here is that the OptimizedLikelihoodTestProvider uses the same basic underlying class as the BasicLikelihoodTestProvider and we were using the BasicTestProvider functionality to pull out tests of that class; so if the optimized tests were run first we were unintentionally running those same tests again with the basic ones (but expecting different results). --- .../sting/utils/pairhmm/PairHMMUnitTest.java | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 64819c245..c94674c98 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -82,11 +82,12 @@ public class PairHMMUnitTest extends BaseTest { // // -------------------------------------------------------------------------------- - private class BasicLikelihoodTestProvider extends TestDataProvider { + private class BasicLikelihoodTestProvider { final String ref, read; final byte[] refBasesWithContext, readBasesWithContext; final int baseQual, insQual, delQual, gcp; final int expectedQual; + final boolean left, right; final static String CONTEXT = "ACGTAATGACGATTGCA"; final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; @@ -96,7 +97,6 @@ public class PairHMMUnitTest extends BaseTest { } public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { - super(BasicLikelihoodTestProvider.class, String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); 
this.baseQual = baseQual; this.delQual = delQual; this.insQual = insQual; @@ -104,11 +104,18 @@ public class PairHMMUnitTest extends BaseTest { this.read = read; this.ref = ref; this.expectedQual = expectedQual; + this.left = left; + this.right = right; refBasesWithContext = asBytes(ref, left, right); readBasesWithContext = asBytes(read, false, false); } + @Override + public String toString() { + return String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual); + } + public double expectedLogL(final PairHMM hmm) { return (expectedQual / -10.0) + 0.03 + hmm.getNPotentialXStartsLikelihoodPenaltyLog10(refBasesWithContext.length, readBasesWithContext.length); @@ -178,6 +185,8 @@ public class PairHMMUnitTest extends BaseTest { final List gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10); final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2); + final List tests = new ArrayList(); + for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { for ( final int gcp : gcps ) { @@ -188,7 +197,7 @@ public class PairHMMUnitTest extends BaseTest { final String ref = new String(new byte[]{refBase}); final String read = new String(new byte[]{readBase}); final int expected = refBase == readBase ? 0 : baseQual; - new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp)}); } } @@ -204,10 +213,10 @@ public class PairHMMUnitTest extends BaseTest { final String ref = insertionP ? small : big; final String read = insertionP ? 
big : small; - new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); - new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); - new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp)}); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false)}); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true)}); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true)}); } } } @@ -215,7 +224,7 @@ public class PairHMMUnitTest extends BaseTest { } } - return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); + return tests.toArray(new Object[][]{}); } @DataProvider(name = "OptimizedLikelihoodTestProvider") @@ -227,6 +236,8 @@ public class PairHMMUnitTest extends BaseTest { final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); final List sizes = EXTENSIVE_TESTING ? 
Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2); + final List tests = new ArrayList(); + for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { for ( final int gcp : gcps ) { @@ -243,14 +254,14 @@ public class PairHMMUnitTest extends BaseTest { for ( final boolean leftFlank : Arrays.asList(true, false) ) for ( final boolean rightFlank : Arrays.asList(true, false) ) - new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, leftFlank, rightFlank); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, leftFlank, rightFlank)}); } } } } } - return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); + return tests.toArray(new Object[][]{}); } @Test(enabled = !DEBUG, dataProvider = "BasicLikelihoodTestProvider") @@ -262,7 +273,7 @@ public class PairHMMUnitTest extends BaseTest { double expectedLogL = cfg.expectedLogL(hmm); // compare to our theoretical expectation with appropriate tolerance - Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm + (hmm instanceof Log10PairHMM ? " (" + ((Log10PairHMM)hmm).isDoingExactLog10Calculations() + ")" : "")); + Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm); // compare to the exact reference implementation with appropriate tolerance Assert.assertEquals(actualLogL, exactLogL, cfg.getTolerance(hmm), "Failed with hmm " + hmm); Assert.assertTrue(MathUtils.goodLog10Probability(actualLogL), "Bad log10 likelihood " + actualLogL); From c8368ae2a512051b7a99a3aa9b4537521ed7df57 Mon Sep 17 00:00:00 2001 From: Alec Wysoker Date: Tue, 26 Feb 2013 16:23:12 -0500 Subject: [PATCH 009/226] Eliminate 7-element arrays in BaseCounts and BaseAndQualsCount and replace with in-line primitive attributes. This is ugly but reduces heap overhead, and changes are localized. 
When used in conjunction with Mauricio's FastUtil changes it saves and additional 9% or so of execution time. --- .../reducereads/BaseAndQualsCounts.java | 75 +++++++-- .../compression/reducereads/BaseCounts.java | 151 ++++++++++++------ 2 files changed, 160 insertions(+), 66 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java index 7f8b0dded..c7b990a88 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -53,39 +53,82 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; * @since 6/15/12 */ public class BaseAndQualsCounts extends BaseCounts { - private final long[] sumInsertionQuals; - private final long[] sumDeletionQuals; - public BaseAndQualsCounts() { - super(); - this.sumInsertionQuals = new long[BaseIndex.values().length]; - this.sumDeletionQuals = new long[BaseIndex.values().length]; - // Java primitive arrays comes zero-filled, so no need to do it explicitly. 
- } + private long sumInsertionQual_A = 0; + private long sumDeletionQual_A = 0; + private long sumInsertionQual_C = 0; + private long sumDeletionQual_C = 0; + private long sumInsertionQual_G = 0; + private long sumDeletionQual_G = 0; + private long sumInsertionQual_T = 0; + private long sumDeletionQual_T = 0; + private long sumInsertionQual_D = 0; + private long sumDeletionQual_D = 0; + private long sumInsertionQual_I = 0; + private long sumDeletionQual_I = 0; + private long sumInsertionQual_N = 0; + private long sumDeletionQual_N = 0; + public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { final BaseIndex i = BaseIndex.byteToBase(base); super.incr(i, baseQual); - sumInsertionQuals[i.index] += insQual; - sumDeletionQuals[i.index] += delQual; + switch (i) { + case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break; + case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break; + case G: sumInsertionQual_G += insQual; sumDeletionQual_G += delQual; break; + case T: sumInsertionQual_T += insQual; sumDeletionQual_T += delQual; break; + case D: sumInsertionQual_D += insQual; sumDeletionQual_D += delQual; break; + case I: sumInsertionQual_I += insQual; sumDeletionQual_I += delQual; break; + case N: sumInsertionQual_N += insQual; sumDeletionQual_N += delQual; break; + } } public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { final BaseIndex i = BaseIndex.byteToBase(base); super.decr(i, baseQual); - sumInsertionQuals[i.index] -= insQual; - sumDeletionQuals[i.index] -= delQual; + switch (i) { + case A: sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break; + case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break; + case G: sumInsertionQual_G -= insQual; sumDeletionQual_G -= delQual; break; + case T: sumInsertionQual_T -= insQual; sumDeletionQual_T -= delQual; break; + case D: sumInsertionQual_D -= insQual; sumDeletionQual_D 
-= delQual; break; + case I: sumInsertionQual_I -= insQual; sumDeletionQual_I -= delQual; break; + case N: sumInsertionQual_N -= insQual; sumDeletionQual_N -= delQual; break; + } } public byte averageInsertionQualsOfBase(final BaseIndex base) { - return getGenericAverageQualOfBase(base, sumInsertionQuals); + return (byte) (getInsertionQual(base) / countOfBase(base)); } public byte averageDeletionQualsOfBase(final BaseIndex base) { - return getGenericAverageQualOfBase(base, sumDeletionQuals); + return (byte) (getDeletionQual(base) / countOfBase(base)); } - private byte getGenericAverageQualOfBase(final BaseIndex base, final long[] sumQuals) { - return (byte) (sumQuals[base.index] / countOfBase(base)); + private long getInsertionQual(final BaseIndex base) { + switch (base) { + case A: return sumInsertionQual_A; + case C: return sumInsertionQual_C; + case G: return sumInsertionQual_G; + case T: return sumInsertionQual_T; + case D: return sumInsertionQual_D; + case I: return sumInsertionQual_I; + case N: return sumInsertionQual_N; + default: throw new IllegalArgumentException(base.name()); + } + } + + private long getDeletionQual(final BaseIndex base) { + switch (base) { + case A: return sumDeletionQual_A; + case C: return sumDeletionQual_C; + case G: return sumDeletionQual_G; + case T: return sumDeletionQual_T; + case D: return sumDeletionQual_D; + case I: return sumDeletionQual_I; + case N: return sumDeletionQual_N; + default: throw new IllegalArgumentException(base.name()); + } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 399cbd2a5..17ce3c90d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -62,70 +62,107 @@ import 
com.google.java.contract.Requires; public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N; public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); - private final int[] counts; // keeps track of the base counts - private final long[] sumQuals; // keeps track of the quals of each base + + private int count_A = 0; // keeps track of the base counts + private int sumQual_A = 0; // keeps track of the quals of each base + private int count_C = 0; + private int sumQual_C = 0; + private int count_G = 0; + private int sumQual_G = 0; + private int count_T = 0; + private int sumQual_T = 0; + private int count_D = 0; + private int sumQual_D = 0; + private int count_I = 0; + private int sumQual_I = 0; + private int count_N = 0; + private int sumQual_N = 0; private int totalCount = 0; // keeps track of total count since this is requested so often - public BaseCounts() { - counts = new int[BaseIndex.values().length]; - sumQuals = new long[BaseIndex.values().length]; - // Java primitive arrays comes zero-filled, so no need to do it explicitly. 
- } public static BaseCounts createWithCounts(int[] countsACGT) { BaseCounts baseCounts = new BaseCounts(); - baseCounts.counts[BaseIndex.A.index] = countsACGT[0]; - baseCounts.counts[BaseIndex.C.index] = countsACGT[1]; - baseCounts.counts[BaseIndex.G.index] = countsACGT[2]; - baseCounts.counts[BaseIndex.T.index] = countsACGT[3]; + baseCounts.count_A = countsACGT[0]; + baseCounts.count_C = countsACGT[1]; + baseCounts.count_G = countsACGT[2]; + baseCounts.count_T = countsACGT[3]; baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3]; return baseCounts; } @Requires("other != null") public void add(final BaseCounts other) { - for (final BaseIndex i : BaseIndex.values()) { - final int otherCount = other.counts[i.index]; - counts[i.index] += otherCount; - totalCount += otherCount; - } + this.count_A += other.count_A; + this.count_C += other.count_C; + this.count_G += other.count_G; + this.count_T += other.count_T; + this.count_D += other.count_D; + this.count_I += other.count_I; + this.count_N += other.count_N; + this.totalCount += other.totalCount; } @Requires("other != null") public void sub(final BaseCounts other) { - for (final BaseIndex i : BaseIndex.values()) { - final int otherCount = other.counts[i.index]; - counts[i.index] -= otherCount; - totalCount -= otherCount; - } + this.count_A -= other.count_A; + this.count_C -= other.count_C; + this.count_G -= other.count_G; + this.count_T -= other.count_T; + this.count_D -= other.count_D; + this.count_I -= other.count_I; + this.count_N -= other.count_N; + this.totalCount -= other.totalCount; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") public void incr(final byte base) { - final BaseIndex i = BaseIndex.byteToBase(base); - counts[i.index]++; - totalCount++; + add(BaseIndex.byteToBase(base), 1); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") public void incr(final BaseIndex base, final byte qual) { - 
counts[base.index]++; - totalCount++; - sumQuals[base.index] += qual; + switch (base) { + case A: ++count_A; sumQual_A += qual; break; + case C: ++count_C; sumQual_C += qual; break; + case G: ++count_G; sumQual_G += qual; break; + case T: ++count_T; sumQual_T += qual; break; + case D: ++count_D; sumQual_D += qual; break; + case I: ++count_I; sumQual_I += qual; break; + case N: ++count_N; sumQual_N += qual; break; + } + ++totalCount; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") public void decr(final byte base) { - final BaseIndex i = BaseIndex.byteToBase(base); - counts[i.index]--; - totalCount--; + add(BaseIndex.byteToBase(base), -1); + } + + private void add(final BaseIndex base, int amount) { + switch(base) { + case A: count_A += amount; break; + case C: count_C += amount; break; + case G: count_G += amount; break; + case T: count_T += amount; break; + case D: count_D += amount; break; + case I: count_I += amount; break; + case N: count_N += amount; break; + } + totalCount += amount; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") public void decr(final BaseIndex base, final byte qual) { - counts[base.index]--; - totalCount--; - sumQuals[base.index] -= qual; + switch (base) { + case A: --count_A; sumQual_A -= qual; break; + case C: --count_C; sumQual_C -= qual; break; + case G: --count_G; sumQual_G -= qual; break; + case T: --count_T; sumQual_T -= qual; break; + case D: --count_D; sumQual_D -= qual; break; + case I: --count_I; sumQual_I -= qual; break; + case N: --count_N; sumQual_N -= qual; break; + } + --totalCount; } @Ensures("result >= 0") @@ -135,7 +172,16 @@ import com.google.java.contract.Requires; @Ensures("result >= 0") public long getSumQuals(final BaseIndex base) { - return sumQuals[base.index]; + switch (base) { + case A: return sumQual_A; + case C: return sumQual_C; + case G: return sumQual_G; + case T: return sumQual_T; + case D: return sumQual_D; + case I: return 
sumQual_I; + case N: return sumQual_N; + default: throw new IllegalArgumentException(base.name()); + } } @Ensures("result >= 0") @@ -155,12 +201,21 @@ import com.google.java.contract.Requires; @Ensures("result >= 0") public int countOfBase(final BaseIndex base) { - return counts[base.index]; + switch (base) { + case A: return count_A; + case C: return count_C; + case G: return count_G; + case T: return count_T; + case D: return count_D; + case I: return count_I; + case N: return count_N; + default: throw new IllegalArgumentException(base.name()); + } } @Ensures("result >= 0") public long sumQualsOfBase(final BaseIndex base) { - return sumQuals[base.index]; + return getSumQuals(base); } @Ensures("result >= 0") @@ -193,14 +248,14 @@ import com.google.java.contract.Requires; */ @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportion(final BaseIndex baseIndex) { - return (totalCount == 0) ? 0.0 : (double)counts[baseIndex.index] / (double)totalCount; + return (totalCount == 0) ? 
0.0 : (double)countOfBase(baseIndex) / (double)totalCount; } @Ensures("result != null") public String toString() { StringBuilder b = new StringBuilder(); for (final BaseIndex i : BaseIndex.values()) { - b.append(i.toString()).append("=").append(counts[i.index]).append(","); + b.append(i.toString()).append("=").append(countOfBase(i)).append(","); } return b.toString(); } @@ -213,7 +268,7 @@ import com.google.java.contract.Requires; public BaseIndex baseIndexWithMostCounts() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; for (final BaseIndex i : BaseIndex.values()) { - if (counts[i.index] > counts[maxI.index]) + if (countOfBase(i) > countOfBase(maxI)) maxI = i; } return maxI; @@ -223,7 +278,7 @@ import com.google.java.contract.Requires; public BaseIndex baseIndexWithMostCountsWithoutIndels() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; for (final BaseIndex i : BaseIndex.values()) { - if (i.isNucleotide() && counts[i.index] > counts[maxI.index]) + if (i.isNucleotide() && countOfBase(i) > countOfBase(maxI)) maxI = i; } return maxI; @@ -237,25 +292,25 @@ import com.google.java.contract.Requires; public BaseIndex baseIndexWithMostProbability() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; for (final BaseIndex i : BaseIndex.values()) { - if (sumQuals[i.index] > sumQuals[maxI.index]) + if (getSumQuals(i) > getSumQuals(maxI)) maxI = i; } - return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCounts()); + return (getSumQuals(maxI) > 0L ? maxI : baseIndexWithMostCounts()); } @Ensures("result != null") public BaseIndex baseIndexWithMostProbabilityWithoutIndels() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; for (final BaseIndex i : BaseIndex.values()) { - if (i.isNucleotide() && sumQuals[i.index] > sumQuals[maxI.index]) + if (i.isNucleotide() && getSumQuals(i) > getSumQuals(maxI)) maxI = i; } - return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCountsWithoutIndels()); + return (getSumQuals(maxI) > 0L ? 
maxI : baseIndexWithMostCountsWithoutIndels()); } @Ensures("result >=0") public int totalCountWithoutIndels() { - return totalCount - counts[BaseIndex.D.index] - counts[BaseIndex.I.index]; + return totalCount - countOfBase(BaseIndex.D) - countOfBase(BaseIndex.I); } /** @@ -268,10 +323,6 @@ import com.google.java.contract.Requires; @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportionWithoutIndels(final BaseIndex base) { final int total = totalCountWithoutIndels(); - return (total == 0) ? 0.0 : (double)counts[base.index] / (double)total; - } - - public int[] countsArray() { - return counts.clone(); + return (total == 0) ? 0.0 : (double)countOfBase(base) / (double)total; } } From 69b81735359f0d9b72603cb3151c2ef6267d4b57 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 27 Feb 2013 14:01:09 -0500 Subject: [PATCH 010/226] Replace uses of NestedHashMap with NestedIntegerArray. * Removed from codebase NestedHashMap since it is unused and untested. * Integration tests change because the BQSR CSV is now sorted automatically. 
* Resolves GSA-732 --- .../sting/utils/recalibration/RecalUtils.java | 70 +++++++--- .../walkers/bqsr/BQSRIntegrationTest.java | 2 +- .../utils/collections/NestedHashMap.java | 132 ------------------ 3 files changed, 50 insertions(+), 154 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 6d98803c9..ce2869e94 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.utils.recalibration; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.report.GATKReport; @@ -59,7 +61,6 @@ import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -423,7 +424,7 @@ public class RecalUtils { private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { - final NestedHashMap deltaTable = new NestedHashMap(); + final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, requestedCovariates.length); // add the quality score table to 
the delta table final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); @@ -470,24 +471,57 @@ public class RecalUtils { covariateNameMap.put(covariate, parseCovariateName(covariate)); // print each data line - for (final NestedHashMap.Leaf leaf : deltaTable.getAllLeaves()) { + for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) { final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap); - final RecalDatum deltaDatum = (RecalDatum)leaf.value; + final RecalDatum deltaDatum = leaf.value; deltaTableFile.print(Utils.join(",", deltaKeys)); deltaTableFile.print("," + deltaDatum.stringForCSV()); deltaTableFile.println("," + recalibrationMode); } } - protected static List generateValuesFromKeys(final List keys, final Covariate[] covariates, final Map covariateNameMap) { + /* + * Return an initialized nested integer array with appropriate dimensions for use with the delta tables + * + * @param recalibrationTables the recal tables + * @param numCovariates the total number of covariates being used + * @return a non-null nested integer array + */ + @Requires("recalibrationTables != null && numCovariates > 0") + @Ensures("result != null") + private static NestedIntegerArray createDeltaTable(final RecalibrationTables recalibrationTables, final int numCovariates) { + + final int[] dimensionsForDeltaTable = new int[4]; + + // initialize the dimensions with those of the qual table to start with + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + final int[] dimensionsOfQualTable = qualTable.getDimensions(); + dimensionsForDeltaTable[0] = dimensionsOfQualTable[0]; // num read groups + dimensionsForDeltaTable[1] = numCovariates + 1; // num covariates + dimensionsForDeltaTable[2] = dimensionsOfQualTable[1]; + dimensionsForDeltaTable[3] = dimensionsOfQualTable[2]; + + // now, update the dimensions based on the optional covariate tables as needed + for ( int i = 
RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < numCovariates; i++ ) { + final NestedIntegerArray covTable = recalibrationTables.getTable(i); + final int[] dimensionsOfCovTable = covTable.getDimensions(); + dimensionsForDeltaTable[2] = Math.max(dimensionsForDeltaTable[2], dimensionsOfCovTable[2]); + dimensionsForDeltaTable[3] = Math.max(dimensionsForDeltaTable[3], dimensionsOfCovTable[3]); + } + + return new NestedIntegerArray(dimensionsForDeltaTable); + } + + protected static List generateValuesFromKeys(final int[] keys, final Covariate[] covariates, final Map covariateNameMap) { final List values = new ArrayList(4); - values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey((Integer)keys.get(0))); - final int covariateIndex = (Integer)keys.get(1); + values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey(keys[0])); + + final int covariateIndex = keys[1]; + final int covariateKey = keys[2]; final Covariate covariate = covariateIndex == covariates.length ? 
covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal()] : covariates[covariateIndex]; - final int covariateKey = (Integer)keys.get(2); values.add(covariate.formatKey(covariateKey)); values.add(covariateNameMap.get(covariate)); - values.add(EventType.eventFrom((Integer)keys.get(3)).prettyPrint()); + values.add(EventType.eventFrom(keys[3]).prettyPrint()); return values; } @@ -501,20 +535,14 @@ public class RecalUtils { * @param deltaKey the key to the table * @param recalDatum the recal datum to combine with the accuracyDatum element in the table */ - private static void addToDeltaTable(final NestedHashMap deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { - Object[] wrappedKey = wrapKeys(deltaKey); - final RecalDatum deltaDatum = (RecalDatum)deltaTable.get(wrappedKey); // check if we already have a RecalDatum for this key + private static void addToDeltaTable(final NestedIntegerArray deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { + final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key if (deltaDatum == null) - deltaTable.put(new RecalDatum(recalDatum), wrappedKey); // if we don't have a key yet, create a new one with the same values as the curent datum + // if we don't have a key yet, create a new one with the same values as the current datum + deltaTable.put(new RecalDatum(recalDatum), deltaKey); else - deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one. 
- } - - private static Object[] wrapKeys(final int[] keys) { - final Object[] wrappedKeys = new Object[keys.length]; - for (int i = 0; i < keys.length; i++) - wrappedKeys[i] = keys[i]; - return wrappedKeys; + // if we do have a datum, combine it with this one + deltaDatum.combine(recalDatum); } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 8a40b44e6..2149091af 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -151,7 +151,7 @@ public class BQSRIntegrationTest extends WalkerTest { " -sortAllCols" + " --plot_pdf_file /dev/null" + " --intermediate_csv_file %s", - Arrays.asList("dd6e0e1e3f53f8ae0c8f5de21ded6ee9")); + Arrays.asList("90ad19143024684e3c4410dc8fd2bd9d")); executeTest("testBQSR-CSVfile", spec); } diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java deleted file mode 100644 index 9f330f226..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java +++ /dev/null @@ -1,132 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.collections; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Dec 29, 2009 - */ - -public class NestedHashMap { - - public final Map data = new HashMap(); - - public Object get( final Object... keys ) { - Map map = this.data; - final int nestedMaps = keys.length - 1; - for( int iii = 0; iii < nestedMaps; iii++ ) { - map = (Map) map.get(keys[iii]); - if( map == null ) { return null; } - } - return map.get(keys[nestedMaps]); - } - - public synchronized void put( final Object value, final Object... keys ) { // WARNING! value comes before the keys! - this.put(value, false, keys ); - } - - public synchronized Object put( final Object value, boolean keepOldBindingIfPresent, final Object... keys ) { - Map map = this.data; - final int keysLength = keys.length; - for( int iii = 0; iii < keysLength; iii++ ) { - if( iii == keysLength - 1 ) { - if ( keepOldBindingIfPresent && map.containsKey(keys[iii]) ) { - // this code test is for parallel protection when you call put() multiple times in different threads - // to initialize the map. 
It returns the already bound key[iii] -> value - return map.get(keys[iii]); - } else { - // we are a new binding, put it in the map - map.put(keys[iii], value); - return value; - } - } else { - Map tmp = (Map) map.get(keys[iii]); - if( tmp == null ) { - tmp = new HashMap(); - map.put(keys[iii], tmp); - } - map = tmp; - } - } - - return value; // todo -- should never reach this point - } - - public List getAllValues() { - final List result = new ArrayList(); - fillAllValues(data, result); - return result; - } - - private void fillAllValues(final Map map, final List result) { - for ( Object value : map.values() ) { - if ( value == null ) - continue; - if ( value instanceof Map ) - fillAllValues((Map)value, result); - else - result.add(value); - } - } - - public static class Leaf { - public final List keys; - public final Object value; - - public Leaf(final List keys, final Object value) { - this.keys = keys; - this.value = value; - } - } - - public List getAllLeaves() { - final List result = new ArrayList(); - final List path = new ArrayList(); - fillAllLeaves(data, path, result); - return result; - } - - private void fillAllLeaves(final Map map, final List path, final List result) { - for ( final Object key : map.keySet() ) { - final Object value = map.get(key); - if ( value == null ) - continue; - final List newPath = new ArrayList(path); - newPath.add(key); - if ( value instanceof Map ) { - fillAllLeaves((Map) value, newPath, result); - } else { - result.add(new Leaf(newPath, value)); - } - } - } -} From d2904cb636296fdea96ea8b201064d28d698c9aa Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 27 Feb 2013 14:55:49 -0500 Subject: [PATCH 011/226] Update docs for RTC. 
--- .../sting/gatk/walkers/indels/RealignerTargetCreator.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index dea17cd02..1ee04e317 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -91,9 +91,12 @@ import java.util.TreeSet; *
  • Running the realigner over those intervals (see the IndelRealigner tool)
  • * *

    - * An important note: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. + * Important note 1: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. *

    - * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * Important note 2: when multiple potential indels are found by the tool in the same general region, the tool will choose the most likely + * one for realignment to the exclusion of the others. This is a known limitation of the tool. + *

    + * Important note 3: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. * *

    Input

    From 12fc198b806d5076b0a883740101b9a9d8eae096 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 27 Feb 2013 16:02:56 -0500 Subject: [PATCH 012/226] Added better error message for BAMs with bad read groups. * Split the cases into reads that don't have a RG at all vs. those with a RG that's not defined in the header. * Added integration tests to make sure that the correct error is thrown. * Resolved GSA-407. --- .../gatk/filters/MalformedReadFilter.java | 12 +++-- .../sting/utils/exceptions/UserException.java | 10 +++- .../filters/BadReadGroupsIntegrationTest.java | 52 +++++++++++++++++++ 3 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java index 0f2353ce5..366e927dc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.filters; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.SAMTagUtil; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -59,9 +60,14 @@ public class MalformedReadFilter extends ReadFilter { !checkCigarDisagreesWithAlignment(read); } - private static boolean checkHasReadGroup(SAMRecord read) { - if ( read.getReadGroup() == null ) - throw new UserException.ReadMissingReadGroup(read); + private static boolean checkHasReadGroup(final SAMRecord read) { + if ( read.getReadGroup() == null ) { + // there are 2 possibilities: either the RG tag is missing or it is not defined in the header + 
final String rgID = (String)read.getAttribute(SAMTagUtil.getSingleton().RG); + if ( rgID == null ) + throw new UserException.ReadMissingReadGroup(read); + throw new UserException.ReadHasUndefinedReadGroup(read, rgID); + } return true; } diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 5c67c899c..0c01539d4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -276,8 +276,14 @@ public class UserException extends ReviewedStingException { } public static class ReadMissingReadGroup extends MalformedBAM { - public ReadMissingReadGroup(SAMRecord read) { - super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); + public ReadMissingReadGroup(final SAMRecord read) { + super(read, String.format("Read %s is missing the read group (RG) tag, which is required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); + } + } + + public static class ReadHasUndefinedReadGroup extends MalformedBAM { + public ReadHasUndefinedReadGroup(final SAMRecord read, final String rgID) { + super(read, String.format("Read %s uses a read group (%s) that is not defined in the BAM header, which is not valid. 
Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName(), rgID)); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java new file mode 100644 index 000000000..12d875a4d --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/filters/BadReadGroupsIntegrationTest.java @@ -0,0 +1,52 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.filters; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + + +public class BadReadGroupsIntegrationTest extends WalkerTest { + + @Test + public void testMissingReadGroup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T PrintReads -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", + 0, + UserException.ReadMissingReadGroup.class); + executeTest("test Missing Read Group", spec); + } + + @Test + public void testUndefinedReadGroup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T PrintReads -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", + 0, + UserException.ReadHasUndefinedReadGroup.class); + executeTest("test Undefined Read Group", spec); + } +} From 4095a9ef32eda00be7a2af9a9d9f0e856c3746fe Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Feb 2013 15:38:17 -0500 Subject: [PATCH 013/226] Bugfixes for AssessNA12878 -- Refactor initialization routine into BadSitesWriter. This now adds the GQ and DP genotype header lines which are necessarily if the input VCF doesn't have proper headers -- GATKVariantContextUtils subset to biallelics now tolerates samples with bad GL values for multi-allelics, where it just removes the PLs and issues a warning. 
--- .../sting/utils/variant/GATKVariantContextUtils.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 3a5ddb7a0..37bd798cf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -405,6 +405,7 @@ public class GATKVariantContextUtils { // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); final int numNewAltAlleles = allelesToUse.size() - 1; // which PLs should be carried forward? @@ -444,6 +445,9 @@ public class GATKVariantContextUtils { double[] newLikelihoods; if ( likelihoodIndexesToUse == null ) { newLikelihoods = originalLikelihoods; + } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { + logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); + newLikelihoods = null; } else { newLikelihoods = new double[likelihoodIndexesToUse.size()]; int newIndex = 0; @@ -455,13 +459,13 @@ public class GATKVariantContextUtils { } // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { + if ( newLikelihoods != null && MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); } else { final GenotypeBuilder gb = new GenotypeBuilder(g); - if ( numNewAltAlleles == 0 ) + if ( newLikelihoods == null || numNewAltAlleles == 0 ) gb.noPL(); else 
gb.PL(newLikelihoods); From e6ac94fd75f6d1b174a66b722d863b323c75cf3e Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 28 Feb 2013 16:39:43 -0500 Subject: [PATCH 016/226] Experimental script to run tests using class-level parallelism on the farm -script to dispatch one farm job per test class and monitor jobs until completion -new ant target to run tests without doing ANY compilation or extra steps at all allows multiple instances of the test suite to share the same working directory --- build.xml | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/build.xml b/build.xml index bb02c1ff1..2555227dc 100644 --- a/build.xml +++ b/build.xml @@ -1104,7 +1104,7 @@ - + @@ -1114,7 +1114,7 @@ - + @@ -1244,7 +1244,7 @@ listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter"> - + @@ -1287,7 +1287,7 @@ - + @@ -1442,4 +1442,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + From ebd540412474e91ac4b153045bf08a2949e22fa2 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 28 Feb 2013 13:46:49 -0500 Subject: [PATCH 020/226] Fixed the add functionality of GenomeLocSortedSet. * Fixed GenomeLocSortedSet.add() to ensure that overlapping intervals are detected and an exception is thrown. * Fixed GenomeLocSortedSet.addRegion() by merging it with the add() method; it now produces sorted inputs in all cases. * Cleaned up duplicated code throughout the engine to create a list of intervals over all contigs. * Added more unit tests for add functionality of GLSS. * Resolves GSA-775. 
--- .../sting/gatk/GenomeAnalysisEngine.java | 4 +- .../gatk/datasources/reads/BAMScheduler.java | 18 +-- .../datasources/reads/IntervalSharder.java | 5 +- .../gatk/datasources/reads/SAMDataSource.java | 8 +- .../reads/utilities/FindLargeShards.java | 13 +- .../sting/utils/GenomeLocSortedSet.java | 142 ++++++++++-------- .../reads/SAMDataSourceUnitTest.java | 2 +- .../utils/GenomeLocSortedSetUnitTest.java | 36 ++++- 8 files changed, 130 insertions(+), 98 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index ba25ac957..85c94cc92 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -558,7 +558,7 @@ public class GenomeAnalysisEngine { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); else return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); } @@ -566,7 +566,7 @@ public class GenomeAnalysisEngine { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); else return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 8d7cfbaa7..adb668ff9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKChunk; -import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -53,14 +52,15 @@ public class BAMScheduler implements Iterator { private PeekableIterator locusIterator; private GenomeLoc currentLocus; - public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) { - BAMScheduler scheduler = new BAMScheduler(dataSource); - GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser); - for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) { - // Match only on sequence name; trust startup validation to make sure all the sequences match. 
- if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null) - intervals.add(parser.createOverEntireContig(sequence.getSequenceName())); - } + /* + * Creates BAMScheduler using contigs from the given BAM data source. + * + * @param dataSource BAM source + * @return non-null BAM scheduler + */ + public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) { + final BAMScheduler scheduler = new BAMScheduler(dataSource); + final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary()); scheduler.populateFilteredIntervalList(intervals); return scheduler; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index f7ca7593f..048ce17f5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -51,8 +50,8 @@ public class IntervalSharder implements Iterator { return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser); } - public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) { - return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser); + public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + return new 
IntervalSharder(BAMScheduler.createOverMappedReads(dataSource),parser); } public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index d52e55d6d..1223dd2af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -1060,10 +1060,12 @@ public class SAMDataSource { /** * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any * read that has been assigned - * @return + * + * @param shardBalancer shard balancer object + * @return non-null initialized version of the shard balancer */ - public Iterable createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) { - shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser); + public Iterable createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser); return shardBalancer; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java index 14bec213e..66463e576 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java @@ -26,12 +26,10 @@ package org.broadinstitute.sting.gatk.datasources.reads.utilities; import net.sf.picard.reference.IndexedFastaSequenceFile; -import 
net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; @@ -98,14 +96,11 @@ public class FindLargeShards extends CommandLineProgram { SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); // intervals - GenomeLocSortedSet intervalSortedSet = null; - if(intervals != null) + final GenomeLocSortedSet intervalSortedSet; + if ( intervals != null ) intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL); - else { - intervalSortedSet = new GenomeLocSortedSet(genomeLocParser); - for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences()) - intervalSortedSet.add(genomeLocParser.createGenomeLoc(entry.getSequenceName(),1,entry.getSequenceLength())); - } + else + intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary()); logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 5adef5cdf..28cdaaf56 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -266,80 +266,96 @@ public class GenomeLocSortedSet extends AbstractSet { } /** - * add a genomeLoc to the collection, simply inserting in 
order into the set + * Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set. + * Throws an exception if the loc overlaps another loc already in the set. * - * TODO -- this may break the contract of the GenomeLocSortedSet if e overlaps or - * TODO -- other locations already in the set. This code should check to see if - * TODO -- e is overlapping with its nearby elements and merge them or alternatively - * TODO -- throw an exception + * @param loc the GenomeLoc to add * - * @param e the GenomeLoc to add - * - * @return true + * @return true if the loc was added or false otherwise (if the loc was null) */ - public boolean add(GenomeLoc e) { - // assuming that the intervals coming arrive in order saves us a fair amount of time (and it's most likely true) - if (mArray.size() > 0 && e.isPast(mArray.get(mArray.size() - 1))) { - mArray.add(e); - return true; - } else { - final int loc = Collections.binarySearch(mArray,e); - if (loc >= 0) { - throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); - } else { - mArray.add((loc+1) * -1,e); - return true; - } - } + public boolean add(final GenomeLoc loc) { + return add(loc, false); } /** * Adds a GenomeLoc to the collection, merging it if it overlaps another region. - * If it's not overlapping then we add it in sorted order. + * If it's not overlapping then we insert it at the correct sorted position into the set. * - * TODO TODO TODO -- this function is buggy and will not properly create a sorted - * TODO TODO TODO -- genome loc is addRegion is called sequentially where the second - * TODO TODO TODO -- loc added is actually before the first. So when creating - * TODO TODO TODO -- sets make sure to sort the input locations first! 
+ * @param loc the GenomeLoc to add * - * @param e the GenomeLoc to add to the collection - * - * @return true, if the GenomeLoc could be added to the collection + * @return true if the loc was added or false otherwise (if the loc was null) */ - public boolean addRegion(GenomeLoc e) { - if (e == null) { - return false; - } - // have we added it to the collection? - boolean haveAdded = false; + public boolean addRegion(final GenomeLoc loc) { + return add(loc, true); + } - /** - * check if the specified element overlaps any current locations, if so - * we should merge the two. - */ - for (GenomeLoc g : mArray) { - if (g.contiguousP(e)) { - GenomeLoc c = g.merge(e); - mArray.set(mArray.indexOf(g), c); - haveAdded = true; - } else if ((g.getContigIndex() == e.getContigIndex()) && - (e.getStart() < g.getStart()) && !haveAdded) { - mArray.add(mArray.indexOf(g), e); - return true; - } else if (haveAdded && ((e.getContigIndex() > e.getContigIndex()) || - (g.getContigIndex() == e.getContigIndex() && e.getStart() > g.getStart()))) { - return true; - } + /** + * Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set. 
+ * + * @param loc the GenomeLoc to add + * @param mergeIfIntervalOverlaps if true we merge the interval if it overlaps another one already in the set, otherwise we throw an exception + * + * @return true if the loc was added or false otherwise (if the loc was null or an exact duplicate) + */ + public boolean add(final GenomeLoc loc, final boolean mergeIfIntervalOverlaps) { + if ( loc == null ) + return false; + + // if we have no other intervals yet or if the new loc is past the last one in the list (which is usually the + // case because locs are generally added in order) then be extra efficient and just add the loc to the end + if ( mArray.size() == 0 || loc.isPast(mArray.get(mArray.size() - 1)) ) { + return mArray.add(loc); } - /** we're at the end and we haven't found locations that should fall after it, - * so we'll put it at the end - */ - if (!haveAdded) { - mArray.add(e); + + // find where in the list the new loc belongs + final int binarySearchIndex = Collections.binarySearch(mArray,loc); + + // if it already exists in the list, return or throw an exception as needed + if ( binarySearchIndex >= 0 ) { + if ( mergeIfIntervalOverlaps ) + return false; + throw new IllegalArgumentException("GenomeLocSortedSet already contains the GenomeLoc " + loc); } + + // if it overlaps a loc already in the list merge or throw an exception as needed + final int insertionIndex = -1 * (binarySearchIndex + 1); + if ( ! 
mergeOverlappingIntervalsFromAdd(loc, insertionIndex, !mergeIfIntervalOverlaps) ) { + // it does not overlap any current intervals, so add it to the set + mArray.add(insertionIndex, loc); + } + return true; } + /* + * If the provided GenomeLoc overlaps another already in the set, merge them (or throw an exception if requested) + * + * @param loc the GenomeLoc to add + * @param insertionIndex the index in the sorted set to add the new loc + * @param throwExceptionIfOverlapping if true we throw an exception if there's overlap, otherwise we merge them + * + * @return true if the loc was added or false otherwise + */ + private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final boolean throwExceptionIfOverlapping) { + // try merging with the previous index + if ( insertionIndex != 0 && loc.overlapsP(mArray.get(insertionIndex - 1)) ) { + if ( throwExceptionIfOverlapping ) + throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex - 1).toString(), loc.toString())); + mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc)); + return true; + } + + // try merging with the following index + if ( insertionIndex < mArray.size() && loc.overlapsP(mArray.get(insertionIndex)) ) { + if ( throwExceptionIfOverlapping ) + throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex).toString(), loc.toString())); + mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc)); + return true; + } + + return false; + } + public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) { LinkedList good = new LinkedList(); Stack toProcess = new Stack(); @@ -401,11 +417,11 @@ public class GenomeLocSortedSet extends AbstractSet { * * @return the GenomeLocSet of all references sequences as GenomeLoc's */ - public 
static GenomeLocSortedSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) { - GenomeLocParser parser = new GenomeLocParser(dict); - GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser); - for (SAMSequenceRecord record : dict.getSequences()) { - returnSortedSet.add(parser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength())); + public static GenomeLocSortedSet createSetFromSequenceDictionary(final SAMSequenceDictionary dict) { + final GenomeLocParser parser = new GenomeLocParser(dict); + final GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser); + for ( final SAMSequenceRecord sequence : dict.getSequences() ) { + returnSortedSet.add(parser.createOverEntireContig(sequence.getSequenceName())); } return returnSortedSet; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 23720e60d..8d33aa8b6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -111,7 +111,7 @@ public class SAMDataSourceUnitTest extends BaseTest { new ArrayList(), false); - Iterable strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer()); + Iterable strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer()); int count = 0; try { diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java index df41dc642..443cf2771 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java @@ -117,11 +117,31 @@ public class GenomeLocSortedSetUnitTest extends BaseTest 
{ GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); mSortedSet.addRegion(f); assertTrue(mSortedSet.size() == 1); - } + @Test + public void addRegionsOutOfOrder() { + final String contigTwoName = header.getSequenceDictionary().getSequence(2).getSequenceName(); + assertTrue(mSortedSet.size() == 0); + GenomeLoc g = genomeLocParser.createGenomeLoc(contigTwoName, 1, 50); + mSortedSet.add(g); + GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); + mSortedSet.addRegion(f); + assertTrue(mSortedSet.size() == 2); + assertTrue(mSortedSet.toList().get(0).getContig().equals(contigOneName)); + assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName)); + } - @Test(expectedExceptions=ReviewedStingException.class) + @Test(expectedExceptions = IllegalArgumentException.class) + public void addThrowsException() { + assertTrue(mSortedSet.size() == 0); + GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 1, 50); + mSortedSet.add(g); + GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); + mSortedSet.add(f); + } + + @Test(expectedExceptions=IllegalArgumentException.class) public void testAddDuplicate() { assertTrue(mSortedSet.size() == 0); GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 0, 0); @@ -141,9 +161,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(mSortedSet.size() == 1); Iterator iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); - assertTrue(loc.getStart() == 0); - assertTrue(loc.getStop() == 100); - assertTrue(loc.getContigIndex() == 1); + assertEquals(loc.getStart(), 0); + assertEquals(loc.getStop(), 100); + assertEquals(loc.getContigIndex(), 1); } @Test @@ -192,9 +212,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(mSortedSet.size() == 1); Iterator iter = mSortedSet.iterator(); GenomeLoc loc = iter.next(); - assertTrue(loc.getStart() == 0); - assertTrue(loc.getStop() == 100); - assertTrue(loc.getContigIndex() == 1); + 
assertEquals(loc.getStart(), 0); + assertEquals(loc.getStop(), 100); + assertEquals(loc.getContigIndex(), 1); } @Test From c5c99c83394ea925e0bee9ecb63af89580e27366 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 1 Mar 2013 13:06:58 -0500 Subject: [PATCH 021/226] Split long-running integration test classes into multiple classes This is to facilitate the current experiment with class-level test suite parallelism. It's our hope that with these changes, we can get the runtime of the integration test suite down to 20 minutes or so. -UnifiedGenotyper tests: these divided nicely into logical categories that also happened to distribute the runtime fairly evenly -UnifiedGenotyperPloidy: these had to be divided arbitrarily into two classes in order to halve the runtime -HaplotypeCaller: turns out that the tests for complex and symbolic variants make up half the runtime here, so merely moving these into a separate class was sufficient -BiasedDownsampling: most of these tests use excessively large intervals that likely can't be reduced without defeating the goals of the tests. 
I'm disabling these tests for now until they can either be redesigned to use smaller intervals around the variants of interest, or refactored into unit tests (creating a JIRA for Yossi for this task) --- .../BiasedDownsamplingIntegrationTest.java | 58 +-- ...perGeneralPloidySuite1IntegrationTest.java | 84 +++++ ...perGeneralPloidySuite2IntegrationTest.java | 72 ++++ ...edGenotyperGeneralPloidyTestExecutor.java} | 73 +--- ...dGenotyperIndelCallingIntegrationTest.java | 197 ++++++++++ .../UnifiedGenotyperIntegrationTest.java | 340 +++--------------- ...GenotyperNormalCallingIntegrationTest.java | 126 +++++++ ...dGenotyperReducedReadsIntegrationTest.java | 87 +++++ ...lexAndSymbolicVariantsIntegrationTest.java | 98 +++++ .../HaplotypeCallerIntegrationTest.java | 51 +-- 10 files changed, 761 insertions(+), 425 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java rename protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/{UnifiedGenotyperGeneralPloidyIntegrationTest.java => UnifiedGenotyperGeneralPloidyTestExecutor.java} (76%) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java index 3f2ace800..77c9f96c9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java @@ -67,7 +67,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - @Test + @Test(enabled = false) public void testContaminationDownsamplingFlat() { WalkerTestSpec spec = new WalkerTestSpec( baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contamination 0.20", 1, @@ -75,7 +75,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { executeTest("test contamination_percentage_to_filter 0.20", spec); } - @Test + @Test(enabled = false) public void testContaminationDownsamplingFlatAndPerSample() { WalkerTestSpec spec = new WalkerTestSpec( baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_per_sample_file " + ArtificalBAMLocation + "NA12878.NA19240.contam.txt --contamination_fraction_to_filter 0.10", 1, @@ -83,7 +83,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { executeTest("test contamination_percentage_to_filter per-sample and .20 overall", spec); } - @Test + @Test(enabled = false) public void testContaminationDownsamplingPerSampleOnly() { WalkerTestSpec spec = new WalkerTestSpec( baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contaminationFile " + ArtificalBAMLocation + "NA19240.contam.txt", 1, @@ -98,7 +98,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { // // 
-------------------------------------------------------------------------------------------------------------- - @Test + @Test(enabled = false) private void testDefaultContamination() { final String bam1 = "NA11918.with.1.NA12842.reduced.bam"; final String bam2 = "NA12842.with.1.NA11918.reduced.bam"; @@ -116,47 +116,47 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " downsampling " + downsampling.toString(), spec); } - @Test + @Test(enabled = false) public void testFlatContaminationCase1() { testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "e2e5a8dd313f8d7e382e7d49dfac59a2"); } - @Test + @Test(enabled = false) public void testFlatContaminationCase2() { testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "549737002f98775fea8f46e7ea174dde"); } - @Test + @Test(enabled = false) public void testFlatContaminationCase3() { testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "529d82c2a33fcc303a5dc55de2d56979"); } - @Test + @Test(enabled = false) public void testFlatContaminationCase4() { testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.1, "b5689972fbb7d230a372ee5f0da1c6d7"); } - @Test + @Test(enabled = false) public void testFlatContaminationCase5() { testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.2, "9dceee2e921b53fbc1ce137a7e0b7b74"); } - @Test + @Test(enabled = false) public void testFlatContaminationCase6() { testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.3, "d6a74061033503af80dcaea065bfa075"); } - @Test + @Test(enabled = false) public void testFlatContaminationCase7() { testFlatContamination("NA11918.with.2.NA12842.reduced.bam", 
"NA12842.with.1.NA11918.reduced.bam", 0.1, "7d1b5efab58a1b8f9d99fcf5af82f15a"); } - @Test + @Test(enabled = false) public void testFlatContaminationCase8() { testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "a7f8d5c79626aff59d7f426f79d8816e"); } - @Test + @Test(enabled = false) public void testFlatContaminationCase9() { testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.3, "fcf482398b7c908e3e2d1e4d5da6377b"); } @@ -168,42 +168,42 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { executeTest("test contamination on Artificial Contamination (per-sample) on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); } - @Test + @Test(enabled = false) public void testPerSampleContaminationCase1() { testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "e00278527a294833259e9e411728e395"); } - @Test + @Test(enabled = false) public void testPerSampleContaminationCase2() { testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", "a443e793f0b0e2ffce1b751634d706e2"); } - @Test + @Test(enabled = false) public void testPerSampleContaminationCase3() { testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "e11d83a7815ce757afbcf7689568cb25"); } - @Test + @Test(enabled = false) public void testPerSampleContaminationCase4() { testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "615042eeeffe042bd1c86279d34f80b6"); } - @Test + @Test(enabled = false) public void testPerSampleContaminationCase5() { testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", 
"NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "9bc99fc79ca34744bf26cb19ee4ef44d"); } - @Test + @Test(enabled = false) public void testPerSampleContaminationCase6() { testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", "143626fe5fce765d6c997a64f058a813"); } - @Test + @Test(enabled = false) public void testPerSampleContaminationCase7() { testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "f2593674cef894eda4e0be9cf3158f57"); } - @Test + @Test(enabled = false) public void testPerSampleContaminationCase8() { testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "fb7ce0740767ae3896b3e552026da1e4"); } @@ -227,17 +227,17 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { // verify that inputing a file with an effectively flat contamination level is equivalent to handing in a flat contamination level - @Test + @Test(enabled = false) public void testPerSampleEqualsFlatContaminationCase1() { testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0, ""); } - @Test + @Test(enabled = false) public void testPerSampleEqualsFlatContaminationCase2() { testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15, ""); } - @Test + @Test(enabled = false) public void testPerSampleEqualsFlatContaminationCase3() { testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3, ""); } @@ -250,7 +250,7 @@ public class 
BiasedDownsamplingIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- - @Test + @Test(enabled = false) public void testHCContaminationDownsamplingFlat() { final String baseCommand = "-T HaplotypeCaller -R " + b36KGReference + " --no_cmdline_in_header --dbsnp " + b36dbSNP129; WalkerTestSpec spec = new WalkerTestSpec( @@ -260,7 +260,7 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { } // HaplotypeCaller can only (currently) use flat contamination reduction, not per-sample. Until that is implemented, this test - @Test + @Test(enabled = false) public void testHCCannotProcessPerSampleContamination() { final String baseCommand = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:3,000,000-5,000,000"; final String bam1 = "NA11918.with.1.NA12842.reduced.bam"; @@ -281,17 +281,17 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { executeTest("HC test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " downsampling " + downsampling.toString(), spec); } - @Test + @Test(enabled = false) public void testHCFlatContaminationCase1() { testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "c3e695381d8627e3922d8c642b66c3ce"); } - @Test + @Test(enabled = false) public void testHCFlatContaminationCase2() { testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "002d2b45336d88d7c04e19f9f26e29d9"); } - @Test + @Test(enabled = false) public void testHCFlatContaminationCase3() { testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "1809a33ac112d1a3bd7a071c566794dd"); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java new file mode 100644 index 000000000..ef9f483ff --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -0,0 +1,84 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.*; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 4/5/12 + * Time: 11:28 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTest { + + private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); + + @Test(enabled = true) + public void testSNP_ACS_Pools() { + executor.PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES", "LSV_SNP_ACS", "SNP", "df0e67c975ef74d593f1c704daab1705"); + } + + @Test(enabled = true) + public void testBOTH_GGA_Pools() { + executor.PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "71f16e19b7d52e8edee46f4121e59f54"); + } + + @Test(enabled = true) + public void testINDEL_GGA_Pools() { + executor.PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_INDEL_GGA", "INDEL", "3f7d763c654f1d708323f369ea4a099b"); + } + + @Test(enabled = true) + public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "5812da66811887d834d0379a33e655c0"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java new file mode 100644 index 000000000..dc9220b7e --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -0,0 +1,72 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.*; + +public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest { + + private final UnifiedGenotyperGeneralPloidyTestExecutor executor = new UnifiedGenotyperGeneralPloidyTestExecutor(); + + @Test(enabled = true) + public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","3a321896c4b8b6457973c76c486da4d4"); + } + + @Test(enabled = true) + public void testMT_SNP_DISCOVERY_sp4() { + executor.PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b"); + } + + @Test(enabled = true) + public void testMT_SNP_GGA_sp10() { + executor.PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8"); + } +} diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java similarity index 76% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java index 6a381e0cf..53d32832b 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyTestExecutor.java @@ -47,90 +47,47 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; import java.util.Arrays; -/** - * Created by IntelliJ IDEA. - * User: delangel - * Date: 4/5/12 - * Time: 11:28 AM - * To change this template use File | Settings | File Templates. 
- */ -public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { +public class UnifiedGenotyperGeneralPloidyTestExecutor extends WalkerTest { final static String REF = b37KGReference; - final String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list"; - final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam"; - final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf"; - final String REFSAMPLE_NAME = "NA12878"; - final String MTINTERVALS = "MT:1-1000"; - final String LSVINTERVALS = "20:40,500,000-41,000,000"; - final String LSVINTERVALS_SHORT = "20:40,500,000-40,501,000"; - final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf"; - final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf"; - final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf"; + final static String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list"; + final static String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam"; + final static String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf"; + final static String REFSAMPLE_NAME = "NA12878"; + final static String MTINTERVALS = "MT:1-1000"; + final static String LSVINTERVALS = "20:40,500,000-41,000,000"; + final static String LSVINTERVALS_SHORT = "20:40,500,000-40,501,000"; + final static String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf"; + final static String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf"; + final static String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf"; - private void PC_MT_Test(String bam, 
String args, String name, String md5) { + public void PC_MT_Test(String bam, String args, String name, String md5) { final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -ignoreLane ", REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } - private void PC_LSV_Test(String args, String name, String model, String md5) { + public void PC_LSV_Test(String args, String name, String model, String md5) { final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane ", REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } - private void PC_LSV_Test_short(String args, String name, String model, String md5) { + public void PC_LSV_Test_short(String args, String name, String model, String md5) { final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane ", REF, LSV_BAM, LSVINTERVALS_SHORT, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } - private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) { + public void PC_LSV_Test_NoRef(String args, String name, String model, String md5) { final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane", REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s"; 
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } - - @Test(enabled = true) - public void testSNP_ACS_Pools() { - PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","df0e67c975ef74d593f1c704daab1705"); - } - - @Test(enabled = true) - public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","71f16e19b7d52e8edee46f4121e59f54"); - } - - @Test(enabled = true) - public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","3f7d763c654f1d708323f369ea4a099b"); - } - - @Test(enabled = true) - public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","3a321896c4b8b6457973c76c486da4d4"); - } - - @Test(enabled = true) - public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","5812da66811887d834d0379a33e655c0"); - } - - @Test(enabled = true) - public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b"); - } - - @Test(enabled = true) - public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8"); - } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java new file mode 100644 index 000000000..670666fe2 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -0,0 +1,197 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { + + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing indel caller + // + // -------------------------------------------------------------------------------------------------------------- + // Basic indel testing with SLX data + @Test + public void testSimpleIndels() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + + " -o %s" + + " -L 1:10,000,000-10,500,000", + 1, + Arrays.asList("1cb469b9cc8e6c70430021540bf1af8b")); + + executeTest(String.format("test indel caller in SLX"), spec); + } + + // Basic indel testing with SLX data + @Test + public void testIndelsWithLowMinAlleleCnt() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + + " -o %s" + + " -minIndelCnt 1" + + " -L 1:10,000,000-10,100,000", + 1, + Arrays.asList("c7e59f9ab718df4c604626a0f51af606")); + + executeTest(String.format("test indel caller in SLX with low min allele count"), spec); + } + + @Test + public void testMultiTechnologyIndels() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + + " -I " + 
validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + + " -o %s" + + " -L 1:10,000,000-10,500,000", + 1, + Arrays.asList("4bebbe4ed4a7554285a3b4bb7311101c")); + + executeTest(String.format("test indel calling, multiple technologies"), spec); + } + + @Test + public void testWithIndelAllelesPassedIn1() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, + Arrays.asList("86880ec78755ae91cb5bb34a0631a32c")); + executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); + } + + @Test + public void testWithIndelAllelesPassedIn2() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, + Arrays.asList("2584d5e3ade1b548f1fe9cdcafbe1b28")); + executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); + } + + @Test(timeOut = 20*1000*60) // this guy can take a long time because it's two steps, so give it 12 minutes + public void testMultiSampleIndels1() { + // since we're going to test the MD5s with GGA only do one here + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, + Arrays.asList("")); + List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); + + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + 
validationDataLocation + + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, + Arrays.asList("08b3a85be00c8f6a4fefd3c671463ecf")); + executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); + } + + @Test + public void testGGAwithNoEvidenceInReads() { + final String vcf = "small.indel.test.vcf"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, + Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); + executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); + } + + @Test + public void testBaseIndelQualityScores() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndelsb37 + + " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam" + + " -o %s" + + " -L 20:10,000,000-10,100,000", + 1, + Arrays.asList("8a7966e4b67334bca6083670c5a16b67")); + + executeTest(String.format("test UG with base indel quality scores"), spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing MinIndelFraction + // + // -------------------------------------------------------------------------------------------------------------- + + final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; + + @Test + public void testMinIndelFraction0() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.0", 1, + Arrays.asList("556c214366e82e4682e753ce93307a4e")); + executeTest("test 
minIndelFraction 0.0", spec); + } + + @Test + public void testMinIndelFraction25() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.25", 1, + Arrays.asList("1df02b805d9dfbd532fa3632875a989d")); + executeTest("test minIndelFraction 0.25", spec); + } + + @Test + public void testMinIndelFraction100() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 1", 1, + Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); + executeTest("test minIndelFraction 1.0", spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 4342b8bfc..ca965a042 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -51,10 +51,8 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; -import java.io.File; import java.util.Arrays; import java.util.Collections; -import java.util.List; // ********************************************************************************** // // Note that this class also serves as an integration test for the VariantAnnotator! 
// @@ -63,128 +61,8 @@ import java.util.List; public class UnifiedGenotyperIntegrationTest extends WalkerTest { private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; - // -------------------------------------------------------------------------------------------------------------- - // - // testing normal calling - // - // -------------------------------------------------------------------------------------------------------------- - @Test - public void testMultiSamplePilot1() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("2f15ef1ead56d875a3f1d53772f52b3a")); - executeTest("test MultiSample Pilot1", spec); - } - - @Test - public void testWithAllelesPassedIn1() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("5b31b811072a4df04524e13604015f9b")); - executeTest("test MultiSample Pilot2 with alleles passed in", spec1); - } - - @Test - public void testWithAllelesPassedIn2() { - WalkerTest.WalkerTestSpec spec2 = new 
WalkerTest.WalkerTestSpec( - baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("d9992e55381afb43742cc9b30fcd7538")); - executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); - } - - @Test - public void testSingleSamplePilot2() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("33ab66c2f062cfa1f7fcc077165f778c")); - executeTest("test SingleSample Pilot2", spec); - } - - @Test - public void testMultipleSNPAlleles() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("9fac00485419878749b03706ae6b852f")); - executeTest("test Multiple SNP alleles", spec); - } - - @Test - public void testBadRead() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, - Arrays.asList("d915535c1458733f09f82670092fcab6")); - executeTest("test bad read", spec); - } - - @Test - public void testReverseTrim() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("eb9604b77a7d6baab60c81ac3db5e47b")); - executeTest("test reverse trim", spec); - } - - @Test - 
public void testMismatchedPLs() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("de2c5707c1805d17d70acaecd36b7372")); - executeTest("test mismatched PLs", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing compressed output - // - // -------------------------------------------------------------------------------------------------------------- - - private final static String COMPRESSED_OUTPUT_MD5 = "d5a7326fdcf6d441b73c381912ad3a2a"; - - @Test - public void testCompressedOutput() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("gz"), Arrays.asList(COMPRESSED_OUTPUT_MD5)); - executeTest("test compressed output", spec); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing parallelization - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testParallelization() { - - // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - - String md5 = "d408b4661b820ed86272415b8ea08780"; - - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, - Arrays.asList(md5)); - executeTest("test parallelization (single thread)", spec1); - - GenomeAnalysisEngine.resetRandomGenerator(); - - WalkerTest.WalkerTestSpec spec2 = new 
WalkerTest.WalkerTestSpec( - baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, - Arrays.asList(md5)); - executeTest("test parallelization (2 threads)", spec2); - - GenomeAnalysisEngine.resetRandomGenerator(); - - WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, - Arrays.asList(md5)); - executeTest("test parallelization (4 threads)", spec3); - } - // -------------------------------------------------------------------------------------------------------------- // // testing parameters @@ -283,6 +161,54 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest(String.format("test heterozyosity[%s]", arg), spec); } + // -------------------------------------------------------------------------------------------------------------- + // + // testing compressed output + // + // -------------------------------------------------------------------------------------------------------------- + + private final static String COMPRESSED_OUTPUT_MD5 = "d5a7326fdcf6d441b73c381912ad3a2a"; + + @Test + public void testCompressedOutput() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, + Arrays.asList("gz"), Arrays.asList(COMPRESSED_OUTPUT_MD5)); + executeTest("test compressed output", spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing parallelization + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public 
void testParallelization() { + + // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations + + String md5 = "d408b4661b820ed86272415b8ea08780"; + + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, + Arrays.asList(md5)); + executeTest("test parallelization (single thread)", spec1); + + GenomeAnalysisEngine.resetRandomGenerator(); + + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, + Arrays.asList(md5)); + executeTest("test parallelization (2 threads)", spec2); + + GenomeAnalysisEngine.resetRandomGenerator(); + + WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( + baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, + Arrays.asList(md5)); + executeTest("test parallelization (4 threads)", spec3); + } // -------------------------------------------------------------------------------------------------------------- // @@ -321,110 +247,6 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest(String.format("test calling with BAQ"), spec); } - // -------------------------------------------------------------------------------------------------------------- - // - // testing indel caller - // - // -------------------------------------------------------------------------------------------------------------- - // Basic indel testing with SLX data - @Test - public void testSimpleIndels() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - 
baseCommandIndels + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + - " -o %s" + - " -L 1:10,000,000-10,500,000", - 1, - Arrays.asList("1cb469b9cc8e6c70430021540bf1af8b")); - - executeTest(String.format("test indel caller in SLX"), spec); - } - - // Basic indel testing with SLX data - @Test - public void testIndelsWithLowMinAlleleCnt() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam" + - " -o %s" + - " -minIndelCnt 1" + - " -L 1:10,000,000-10,100,000", - 1, - Arrays.asList("c7e59f9ab718df4c604626a0f51af606")); - - executeTest(String.format("test indel caller in SLX with low min allele count"), spec); - } - - @Test - public void testMultiTechnologyIndels() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,500,000", - 1, - Arrays.asList("4bebbe4ed4a7554285a3b4bb7311101c")); - - executeTest(String.format("test indel calling, multiple technologies"), spec); - } - - @Test - public void testWithIndelAllelesPassedIn1() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + - "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("86880ec78755ae91cb5bb34a0631a32c")); - executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); - } - - @Test - public void testWithIndelAllelesPassedIn2() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " - + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + - 
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("2584d5e3ade1b548f1fe9cdcafbe1b28")); - executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); - } - - @Test(timeOut = 20*1000*60) // this guy can take a long time because it's two steps, so give it 12 minutes - public void testMultiSampleIndels1() { - // since we're going to test the MD5s with GGA only do one here - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("")); - List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); - - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + - "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("08b3a85be00c8f6a4fefd3c671463ecf")); - executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); - } - - @Test - public void testGGAwithNoEvidenceInReads() { - final String vcf = "small.indel.test.vcf"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + - "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); - executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); - } - - @Test - public void testBaseIndelQualityScores() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommandIndelsb37 + - " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam" + - " -o %s" + - " -L 
20:10,000,000-10,100,000", - 1, - Arrays.asList("8a7966e4b67334bca6083670c5a16b67")); - - executeTest(String.format("test UG with base indel quality scores"), spec); - } - // -------------------------------------------------------------------------------------------------------------- // // testing SnpEff @@ -441,39 +263,6 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } - // -------------------------------------------------------------------------------------------------------------- - // - // testing MinIndelFraction - // - // -------------------------------------------------------------------------------------------------------------- - - final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation - + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; - - @Test - public void testMinIndelFraction0() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("556c214366e82e4682e753ce93307a4e")); - executeTest("test minIndelFraction 0.0", spec); - } - - @Test - public void testMinIndelFraction25() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("1df02b805d9dfbd532fa3632875a989d")); - executeTest("test minIndelFraction 0.25", spec); - } - - @Test - public void testMinIndelFraction100() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - assessMinIndelFraction + " -minIndelFrac 1", 1, - Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); - executeTest("test minIndelFraction 1.0", spec); - } - // -------------------------------------------------------------------------------------------------------------- // // testing Ns in CIGAR @@ -487,37 +276,4 @@ public class 
UnifiedGenotyperIntegrationTest extends WalkerTest { Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6")); executeTest("test calling on reads with Ns in CIGAR", spec); } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing reduced reads - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testReducedBam() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304")); - executeTest("test calling on a ReducedRead BAM", spec); - } - - @Test - public void testReducedBamSNPs() { - testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f"); - } - - @Test - public void testReducedBamINDELs() { - testReducedCalling("INDEL", "acde5694a74f867256a54a26cbebbf21"); - } - - - private void testReducedCalling(final String model, final String md5) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1, - Arrays.asList(md5)); - executeTest("test calling on a ReducedRead BAM with " + model, spec); - } - } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java new file mode 100644 index 000000000..49083e45b --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -0,0 +1,126 @@ +/* +* By downloading the 
PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ + + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing normal calling + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testMultiSamplePilot1() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, + Arrays.asList("2f15ef1ead56d875a3f1d53772f52b3a")); + executeTest("test MultiSample Pilot1", spec); + } + + @Test + public void testWithAllelesPassedIn1() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + 
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, + Arrays.asList("5b31b811072a4df04524e13604015f9b")); + executeTest("test MultiSample Pilot2 with alleles passed in", spec1); + } + + @Test + public void testWithAllelesPassedIn2() { + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, + Arrays.asList("d9992e55381afb43742cc9b30fcd7538")); + executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); + } + + @Test + public void testSingleSamplePilot2() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, + Arrays.asList("33ab66c2f062cfa1f7fcc077165f778c")); + executeTest("test SingleSample Pilot2", spec); + } + + @Test + public void testMultipleSNPAlleles() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, + Arrays.asList("9fac00485419878749b03706ae6b852f")); + executeTest("test Multiple SNP alleles", spec); + } + + @Test + public void testBadRead() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, + Arrays.asList("d915535c1458733f09f82670092fcab6")); + executeTest("test bad read", spec); + } + + @Test + public void testReverseTrim() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T 
UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("eb9604b77a7d6baab60c81ac3db5e47b")); + executeTest("test reverse trim", spec); + } + + @Test + public void testMismatchedPLs() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, + Arrays.asList("de2c5707c1805d17d70acaecd36b7372")); + executeTest("test mismatched PLs", spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java new file mode 100644 index 000000000..d65020dcc --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -0,0 +1,87 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { + + // -------------------------------------------------------------------------------------------------------------- + // + // testing reduced reads + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testReducedBam() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304")); + executeTest("test calling on a ReducedRead BAM", spec); + } + + @Test + public void testReducedBamSNPs() { + testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f"); + } + + @Test + public void testReducedBamINDELs() { + testReducedCalling("INDEL", "acde5694a74f867256a54a26cbebbf21"); + } + + + private void testReducedCalling(final String model, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1, + Arrays.asList(md5)); + executeTest("test calling on a ReducedRead BAM with " + model, spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java new file mode 100644 index 000000000..3e57663f8 --- /dev/null +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -0,0 +1,98 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.*; + +import java.util.Arrays; + +public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { + + private void HCTestComplexVariants(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; + final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerMultiSampleComplex() { + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a960722c1ae2b6f774d3443a7e5ac27d"); + } + + private void 
HCTestSymbolicVariants(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; + final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); + } + + // TODO -- need a better symbolic allele test + @Test + public void testHaplotypeCallerSingleSampleSymbolic() { + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "56f2ef9acc6c0d267cf2b7a447d87fb7"); + } + + private void HCTestComplexGGA(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerMultiSampleGGAComplex() { + HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", + "417174e043dbb8b86cc3871da9b50536"); + } + + @Test + public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { + HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", + "f2df7a8f53ce449e4a8e8f8496e7c745"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 856ef58a1..4988fbe77 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -54,11 +54,11 @@ import java.util.Collections; public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String REF = b37KGReference; - final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; - final String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; - final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; - final String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; - final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; + final static String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; + final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; + final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; + final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; private void HCTest(String bam, String args, String md5) { final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; @@ -87,47 +87,6 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { "283524b3e3397634d4cf0dc2b8723002"); } - private void HCTestComplexGGA(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; - final WalkerTestSpec spec 
= new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerMultiSampleGGAComplex() { - HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "417174e043dbb8b86cc3871da9b50536"); - } - - @Test - public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { - HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "f2df7a8f53ce449e4a8e8f8496e7c745"); - } - - private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; - final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); - } - - @Test - public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a960722c1ae2b6f774d3443a7e5ac27d"); - } - - private void HCTestSymbolicVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; - final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); - } - - // TODO -- need a better symbolic allele test - @Test - public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "56f2ef9acc6c0d267cf2b7a447d87fb7"); - } - private void HCTestIndelQualityScores(String bam, String args, String md5) { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 
--no_cmdline_in_header -o %s -minPruning 2"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); From a0be74c2ef145ca784691cf7bdc33ae260c23cf7 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 1 Mar 2013 15:33:59 -0500 Subject: [PATCH 023/226] Ant target to package a GATK jar with private included Needed before we can start emitting full unstable jars from Bamboo for our internal use. --- build.xml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/build.xml b/build.xml index 2555227dc..03f3232f2 100644 --- a/build.xml +++ b/build.xml @@ -865,14 +865,18 @@ - - + + + + + + @@ -921,12 +925,17 @@ + + - + + + + From 42d3919ca4c5fc5f05b700897937a87c3ac8017f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Feb 2013 15:22:43 -0500 Subject: [PATCH 026/226] Expanded functionality for writing BAMs from HaplotypeCaller -- The new code includes a new mode to write out a BAM containing reads realigned to the called haplotypes from the HC, which can be easily visualized in IGV. -- Previous functionality maintained, with bug fixes -- Haplotype BAM writing code now lives in utils -- Created a base class that includes most of the functionality of writing reads realigned to haplotypes onto haplotypes. -- Created two subclasses, one that writes all haplotypes (previous functionality) and a CalledHaplotypeBAMWriter that will only write reads aligned to the actually called haplotypes -- Extended PerReadAlleleLikelihoodMap.getMostLikelyAllele to optionally restrict set of alleles to consider best -- Massive increase in unit tests in AlignmentUtils, along with several new powerful functions for manipulating cigars -- Fix bug in SWPairwiseAlignment that produces cigar elements with 0 size, and are now fixed with consolidateCigar in AlignmentUtils -- HaplotypeCaller now tracks the called haplotypes in the GenotypingEngine, and returns this information to the HC for use in visualization. 
-- Added extensive docs to HaplotypeCaller on how to use this capability -- BUGFIX -- don't modify the read bases in GATKSAMRecord in LikelihoodCalculationEngine in the HC -- Cleaned up SWPairwiseAlignment. Refactored out the big main and supplementary static methods. Added a unit test with a bug TODO to fix what seems to be an edge case bug in SW -- Integration test to make sure we can actually write a BAM for each mode. This test only ensures that the code runs and doesn't exception out. It doesn't actually enforce any MD5s -- HaplotypeBAMWriter also left aligns indels in the reads, as SW can return a random placement of a read against the haplotype. Calls leftAlign to make the alignments more clear, with unit test of real read to cover this case -- Writes out haplotypes for both all haplotype and called haplotype mode -- Haplotype writers now get the active region call, regardless of whether an actual call was made. Only emitting called haplotypes is moved down to CalledHaplotypeBAMWriter --- .../haplotypecaller/GenotypingEngine.java | 63 ++- .../haplotypecaller/HaplotypeCaller.java | 175 ++----- .../LikelihoodCalculationEngine.java | 3 +- .../HaplotypeCallerModesIntegrationTest.java | 85 ++++ .../utils/SWPairwiseAlignmentUnitTest.java | 94 ++++ .../broadinstitute/sting/utils/Haplotype.java | 16 + .../sting/utils/SWPairwiseAlignment.java | 446 ++---------------- .../sting/utils/SWPairwiseAlignmentMain.java | 222 +++++++++ .../org/broadinstitute/sting/utils/Utils.java | 11 + .../genotyper/PerReadAlleleLikelihoodMap.java | 21 +- .../AllHaplotypeBAMWriter.java | 80 ++++ .../CalledHaplotypeBAMWriter.java | 87 ++++ .../HaplotypeBAMWriter.java | 282 +++++++++++ .../sting/utils/sam/AlignmentUtils.java | 391 ++++++++++++++- .../sting/utils/HaplotypeUnitTest.java | 20 + .../sting/utils/UtilsUnitTest.java | 8 + .../HaplotypeBAMWriterUnitTest.java | 287 +++++++++++ .../utils/sam/AlignmentUtilsUnitTest.java | 354 ++++++++++++-- 18 files changed, 2050 insertions(+), 595 
deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index bef0cd96c..ae181aa69 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -79,6 +79,39 @@ public class GenotypingEngine { noCall.add(Allele.NO_CALL); } + /** + * Carries the result of a call to #assignGenotypeLikelihoods + */ + public static class CalledHaplotypes { + private final List calls; + private final Set calledHaplotypes; + + protected CalledHaplotypes(final List calls, final Set calledHaplotypes) { + if ( calls == null ) throw new IllegalArgumentException("calls cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( Utils.xor(calls.isEmpty(), calledHaplotypes.isEmpty()) ) + throw new IllegalArgumentException("Calls and calledHaplotypes should both be empty or both not but got calls=" + calls + " calledHaplotypes=" + calledHaplotypes); 
+ this.calls = calls; + this.calledHaplotypes = calledHaplotypes; + } + + /** + * Get the list of calls made at this location + * @return a non-null (but potentially empty) list of calls + */ + public List getCalls() { + return calls; + } + + /** + * Get the set of haplotypes that we actually called (i.e., underlying one of the VCs in getCalls(). + * @return a non-null set of haplotypes + */ + public Set getCalledHaplotypes() { + return calledHaplotypes; + } + } + /** * Main entry point of class - given a particular set of haplotypes, samples and reference context, compute * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling @@ -93,21 +126,21 @@ public class GenotypingEngine { * @param activeRegionWindow Active window * @param genomeLocParser GenomeLocParser * @param activeAllelesToGenotype Alleles to genotype - * @return List of VC's with genotyped events + * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes */ @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) @Ensures("result != null") // TODO - can this be refactored? this is hard to follow! 
- public List assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, - final List haplotypes, - final List samples, - final Map haplotypeReadMap, - final Map> perSampleFilteredReadList, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final List activeAllelesToGenotype ) { + public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final List samples, + final Map haplotypeReadMap, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { // sanity check input arguments if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); @@ -157,6 +190,8 @@ public class GenotypingEngine { } } + final Set calledHaplotypes = new HashSet(); + // Walk along each position in the key set and create each event to be outputted for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region @@ -239,6 +274,10 @@ public class GenotypingEngine { final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call); + // maintain the set of all called haplotypes + for ( final Allele calledAllele : call.getAlleles() ) + calledHaplotypes.addAll(alleleMapper.get(calledAllele)); + if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! 
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); } @@ -247,7 +286,7 @@ public class GenotypingEngine { } } } - return returnCalls; + return new CalledHaplotypes(returnCalls, calledHaplotypes); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 64c762e97..003b8197f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -72,22 +71,23 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; 
import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; import java.io.FileNotFoundException; import java.io.PrintStream; @@ -146,15 +146,39 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected PrintStream graphWriter = null; /** - * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. Note that the output here - * does not include uninformative reads so that not every input read is emitted to the bam. + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. + * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. + * + * Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to + * use in specific areas where you want to better understand why the HC is making specific calls. + * + * The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches + * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended + * to allow good coloring of reads in IGV. Simply go to Color Alignments By > Tag and enter HC to more + * easily see which reads go with these haplotypes. + * + * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire + * active region, coming from read HC and a special read group. 
+ * + * Note that only reads that are actually informative about the haplotypes are emitted. By informative we mean + * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to + * its next best haplotype. + * + * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, + * and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV + * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen + * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png + * */ - @Hidden - @Output(fullName="bamOutput", shortName="bam", doc="File to which assembled haplotypes should be written", required = false) + @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false) protected StingSAMFileWriter bamWriter = null; - private SAMFileHeader bamHeader = null; - private long uniqueNameCounter = 1; - private final static String readGroupId = "ArtificialHaplotype"; + private HaplotypeBAMWriter haplotypeBAMWriter; + + /** + * The type of BAM output we want to see. + */ + @Output(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) + public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. 
@@ -354,7 +378,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); if ( bamWriter != null ) - setupBamWriter(); + haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); } //--------------------------------------------------------------------------------------------------------------- @@ -497,39 +521,25 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final List bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation ) : haplotypes ); - for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoods( UG_engine, - bestHaplotypes, - samplesList, - stratifiedReadMap, - perSampleFilteredReadList, - fullReferenceWithPadding, - paddedReferenceLoc, - activeRegion.getLocation(), - getToolkit().getGenomeLocParser(), - activeAllelesToGenotype ) ) { + final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, + bestHaplotypes, + samplesList, + stratifiedReadMap, + perSampleFilteredReadList, + fullReferenceWithPadding, + paddedReferenceLoc, + activeRegion.getLocation(), + getToolkit().getGenomeLocParser(), + activeAllelesToGenotype ); + + for( final VariantContext call : calledHaplotypes.getCalls() ) { // TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker. 
// annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call); vcfWriter.add( call ); } if ( bamWriter != null ) { - // write the haplotypes to the bam - for ( Haplotype haplotype : haplotypes ) - writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); - - // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently - final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); - for ( final Haplotype haplotype : haplotypes ) - alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); - - // next, output the interesting reads for each sample aligned against the appropriate haplotype - for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { - for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); - if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); - } - } + haplotypeBAMWriter.writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, bestHaplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); } if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } @@ -624,92 +634,5 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return returnMap; } - private void setupBamWriter() { - // prepare the bam header - bamHeader = new SAMFileHeader(); - bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); - bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - // include the original read groups plus a new artificial one for the haplotypes - final List readGroups = new ArrayList(getToolkit().getSAMFileHeader().getReadGroups()); - 
final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); - rg.setSample("HC"); - rg.setSequencingCenter("BI"); - readGroups.add(rg); - bamHeader.setReadGroups(readGroups); - - bamWriter.setPresorted(false); - bamWriter.writeHeader(bamHeader); - } - - private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { - final GATKSAMRecord record = new GATKSAMRecord(bamHeader); - record.setReadBases(haplotype.getBases()); - record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); - record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); - record.setCigar(haplotype.getCigar()); - record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); - record.setReadName("HC" + uniqueNameCounter++); - record.setReadUnmappedFlag(false); - record.setReferenceIndex(paddedRefLoc.getContigIndex()); - record.setAttribute(SAMTag.RG.toString(), readGroupId); - record.setFlags(16); - bamWriter.addAlignment(record); - } - - private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype, final int referenceStart) { - - final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), read.getReadBases(), 5.0, -10.0, -22.0, -1.2); - final int readStartOnHaplotype = swPairwiseAlignment.getAlignmentStart2wrt1(); - final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; - read.setAlignmentStart(readStartOnReference); - - final Cigar cigar = generateReadCigarFromHaplotype(read, readStartOnHaplotype, haplotype.getCigar()); - read.setCigar(cigar); - - bamWriter.addAlignment(read); - } - - private Cigar generateReadCigarFromHaplotype(final GATKSAMRecord read, final int readStartOnHaplotype, final Cigar haplotypeCigar) { - - int currentReadPos = 0; - int currentHapPos = 0; - final List readCigarElements = new ArrayList(); - - for ( final CigarElement cigarElement : 
haplotypeCigar.getCigarElements() ) { - - if ( cigarElement.getOperator() == CigarOperator.D ) { - if ( currentReadPos > 0 ) - readCigarElements.add(cigarElement); - } else if ( cigarElement.getOperator() == CigarOperator.M || cigarElement.getOperator() == CigarOperator.I ) { - - final int elementLength = cigarElement.getLength(); - final int nextReadPos = currentReadPos + elementLength; - final int nextHapPos = currentHapPos + elementLength; - - // do we want this element? - if ( currentReadPos > 0 ) { - // do we want the entire element? - if ( nextReadPos < read.getReadLength() ) { - readCigarElements.add(cigarElement); - currentReadPos = nextReadPos; - } - // otherwise, we can finish up and return the cigar - else { - readCigarElements.add(new CigarElement(read.getReadLength() - currentReadPos, cigarElement.getOperator())); - return new Cigar(readCigarElements); - } - } - // do we want part of the element to start? - else if ( currentReadPos == 0 && nextHapPos > readStartOnHaplotype ) { - currentReadPos = Math.min(nextHapPos - readStartOnHaplotype, read.getReadLength()); - readCigarElements.add(new CigarElement(currentReadPos, cigarElement.getOperator())); - } - - currentHapPos = nextHapPos; - } - } - - return new Cigar(readCigarElements); - } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index aeeb95c87..a7d85b969 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -133,7 +133,8 @@ public class LikelihoodCalculationEngine { final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this 
from the data? Haplotype previousHaplotypeSeen = null; - final byte[] readQuals = read.getBaseQualities(); + // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read + final byte[] readQuals = read.getBaseQualities().clone(); final byte[] readInsQuals = read.getBaseInsertionQualities(); final byte[] readDelQuals = read.getBaseDeletionQualities(); for( int kkk = 0; kkk < readQuals.length; kkk++ ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java new file mode 100644 index 000000000..27b429353 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java @@ -0,0 +1,85 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; + +public class HaplotypeCallerModesIntegrationTest extends WalkerTest { + // -------------------------------------------------------------------------------------------------------------- + // + // testing that writing a BAM works + // + // I don't really care about the MD5s, so I'm just not providing them here, so they don't have to be + // updated. These tests are basically ensuring that the code doesn't just randomly blow up. + // + // TODO -- what i'd really like to ensure here isn't the MD5 but that the BAMs can be read by the GATK or IGV + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestBamWriterCalledHaplotypes() { + HCTestBamWriter(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, ""); // current MD5 = 9a2b6157f14b44b872a77f4e75c56023 + } + + @Test + public void HCTestBamWriterAllHaplotypes() { + HCTestBamWriter(HaplotypeBAMWriter.Type.ALL_POSSIBLE_HAPLOTYPES, ""); // current MD5 = 06d885d82be81b8eef13bbfcd8041189 + } + + public void HCTestBamWriter(final HaplotypeBAMWriter.Type type, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o /dev/null " + + "-bamout %s -L 20:10,000,000-10,010,000 -bamWriterType " + type, 1, + Arrays.asList(md5)); + executeTest("HC writing bams with mode " + type, spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java new file mode 100644 index 
000000000..6d3c310b7 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java @@ -0,0 +1,94 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +public class SWPairwiseAlignmentUnitTest extends BaseTest { + @DataProvider(name = "ComplexReadAlignedToRef") + public Object[][] makeComplexReadAlignedToRef() { + List tests = new ArrayList(); + + final String ref1 = "ACTGACTGACTG"; + tests.add(new Object[]{"AAAGGACTGACTG", ref1, 1, "12M"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ComplexReadAlignedToRef", enabled = true) + public void testReadAlignedToRefComplexAlignment(final String reference, final String read, final int expectedStart, final String expectedCigar) { + final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes()); + Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart); + Assert.assertEquals(sw.getCigar().toString(), expectedCigar); + } + + // TODO + // TODO + // 
TODO this example demonstrates some kind of failure mode of SW that results in the read not being aligned + // TODO to the reference at all. It has something to do with the specific parameters provided to the + // TODO SW code. With the default parameters the result is the one expected. With the specified parameters + // TODO the code fails + // TODO + // TODO + @Test(enabled = false) + public void testOddNoAlignment() { + final String reference = "AAAGACTACTG"; + final String read = "AACGGACACTG"; + final int expectedStart = 0; + final String expectedCigar = "11M"; + final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes(), 5.0, -10.0, -22.0, -1.2); + sw.printAlignment(reference.getBytes(), read.getBytes()); + Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart); + Assert.assertEquals(sw.getCigar().toString(), expectedCigar); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index cce6abbee..415cb73ac 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -27,9 +27,12 @@ package org.broadinstitute.sting.utils; import com.google.java.contract.Requires; import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -121,6 +124,19 @@ public class Haplotype extends Allele { return cigar; } + /** + * Get the haplotype cigar extended by padSize M at the tail, consolidated into a clean 
cigar + * + * @param padSize how many additional Ms should be appended to the end of this cigar. Must be >= 0 + * @return a newly allocated Cigar that consolidate(getCigar + padSize + M) + */ + public Cigar getConsolidatedPaddedCigar(final int padSize) { + if ( padSize < 0 ) throw new IllegalArgumentException("padSize must be >= 0 but got " + padSize); + final Cigar extendedHaplotypeCigar = new Cigar(getCigar().getCigarElements()); + if ( padSize > 0 ) extendedHaplotypeCigar.add(new CigarElement(padSize, CigarOperator.M)); + return AlignmentUtils.consolidateCigar(extendedHaplotypeCigar); + } + public void setCigar( final Cigar cigar ) { this.cigar = cigar; } diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java index 7bd937af9..e501cf40a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java @@ -1,28 +1,28 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + package org.broadinstitute.sting.utils; import net.sf.samtools.Cigar; @@ -30,17 +30,23 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.util.*; /** - * Created by IntelliJ IDEA. 
+ * Pairwise discrete smith-waterman alignment + * + * ************************************************************************ + * **** IMPORTANT NOTE: **** + * **** This class assumes that all bytes come from UPPERCASED chars! **** + * ************************************************************************ + * * User: asivache * Date: Mar 23, 2009 * Time: 1:54:54 PM - * To change this template use File | Settings | File Templates. */ -public class SWPairwiseAlignment { +public final class SWPairwiseAlignment { private int alignment_offset; // offset of s2 w/respect to s1 private Cigar alignmentCigar; @@ -54,24 +60,11 @@ public class SWPairwiseAlignment { private static final int DSTATE = 2; private static final int CLIP = 3; - private static boolean cutoff = false; + protected static boolean cutoff = false; private static boolean DO_SOFTCLIP = true; double[] SW; -// private double [] best_gap_v ; -// private int [] gap_size_v ; -// private double [] best_gap_h ; -// private int [] gap_size_h ; - - - // private static double [][] sw = new double[500][500]; - // private static int [][] btrack = new int[500][500]; - - // ************************************************************************ - // **** IMPORTANT NOTE: **** - // **** This class assumes that all bytes come from UPPERCASED chars! 
**** - // ************************************************************************ public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) { w_match = match; w_mismatch = mismatch; @@ -80,12 +73,10 @@ public class SWPairwiseAlignment { align(seq1,seq2); } - public SWPairwiseAlignment(byte[] seq1, byte[] seq2) { this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3) } - public Cigar getCigar() { return alignmentCigar ; } public int getAlignmentStart2wrt1() { return alignment_offset; } @@ -97,13 +88,6 @@ public class SWPairwiseAlignment { SW = sw; int [] btrack = new int[(n+1)*(m+1)]; -// best_gap_v = new double[m+1]; -// Arrays.fill(best_gap_v,-1.0e40); -// gap_size_v = new int[m+1]; -// best_gap_h = new double[n+1]; -// Arrays.fill(best_gap_h,-1.0e40); -// gap_size_h = new int[n+1]; - calculateMatrix(a, b, sw, btrack); calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) } @@ -169,18 +153,6 @@ public class SWPairwiseAlignment { final double step_down = best_gap_v[j] ; final int kd = gap_size_v[j]; -/* - for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) { - // data_offset_k is linearized offset of element [i-k][j] - // in other words, trial = sw[i-k][j]+gap_penalty: - final double trial = sw[data_offset_k]+wk(k); - if ( step_down < trial ) { - step_down=trial; - kd = k; - } - } -*/ - // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing // all 'step right' events that would end in the current cell. The optimized code // does exactly the same thing as the commented out loop below. 
IMPORTANT: @@ -202,21 +174,6 @@ public class SWPairwiseAlignment { final double step_right = best_gap_h[i]; final int ki = gap_size_h[i]; -/* - for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) { - // data_offset is linearized offset of element [i][j-k] - // in other words, step_right=sw[i][j-k]+gap_penalty; - final double trial = sw[data_offset]+wk(k); - if ( step_right < trial ) { - step_right=trial; - ki = k; - } - } - - final int data_offset = row_offset + j; // linearized offset of element [i][j] -*/ - - if ( step_down > step_right ) { if ( step_down > step_diag ) { sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_down); @@ -235,8 +192,6 @@ public class SWPairwiseAlignment { btrack[data_offset] = 0; // 0 = diagonal } } - -// sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); } // IMPORTANT, IMPORTANT, IMPORTANT: @@ -245,7 +200,6 @@ public class SWPairwiseAlignment { // in the for() statement itself. row_offset_1 = row_offset; } -// print(sw,a,b); } @@ -271,12 +225,10 @@ public class SWPairwiseAlignment { if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) { p1 = n; p2 = j ; -// maxscore = sw[n][j]; maxscore = sw[data_offset]; segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment } } -// System.out.println(" Found max score="+maxscore+" at p1="+p1+ " p2="+p2); List lce = new ArrayList(5); @@ -291,16 +243,12 @@ public class SWPairwiseAlignment { int state = MSTATE; int data_offset = p1*(m+1)+p2; // offset of element [p1][p2] - // System.out.println("Backtracking: starts at "+p1+":"+p2+" ("+sw[data_offset]+")"); do { -// int btr = btrack[p1][p2]; int btr = btrack[data_offset]; int new_state; int step_length = 1; - // System.out.print(" backtrack value: "+btr); - if ( btr > 0 ) { new_state = DSTATE; step_length = btr; @@ -309,25 +257,16 @@ public class SWPairwiseAlignment { step_length = (-btr); } else new_state 
= MSTATE; // and step_length =1, already set above - // move to next best location in the sw matrix: switch( new_state ) { case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in th esw matrix case ISTATE: data_offset -= step_length; p2 -= step_length; break; // move left case DSTATE: data_offset -= (m+1)*step_length; p1 -= step_length; break; // move up } - // System.out.println("; backtracked to p1="+p1+" p2="+p2); - /* - switch( new_state ) { - case MSTATE: System.out.println(" diag (match) to "+ sw[data_offset]); break; // equivalent to p1--; p2-- - case ISTATE: System.out.println(" left (insertion, "+step_length+") to "+ sw[data_offset]); break; // equivalent to p2-=step_length; - case DSTATE: System.out.println(" up (deletion, "+step_length+") to "+ sw[data_offset]); break; // equivalent to p1 -= step_up - } - */ + // now let's see if the state actually changed: if ( new_state == state ) segment_length+=step_length; else { -// System.out.println(" emitting "+segment_length+makeElement(state,segment_length).getOperator().toString()); // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). lce.add(makeElement(state, segment_length)); segment_length = step_length; @@ -354,11 +293,9 @@ public class SWPairwiseAlignment { } Collections.reverse(lce); - alignmentCigar = new Cigar(lce); - + alignmentCigar = AlignmentUtils.consolidateCigar(new Cigar(lce)); } - private CigarElement makeElement(int state, int segment_length) { CigarOperator o = null; switch(state) { @@ -374,33 +311,11 @@ public class SWPairwiseAlignment { return (x == y ? 
w_match : w_mismatch); } - private double wk(int k) { - return w_open+(k-1)*w_extend; // gap - } - - private void print(double[] s, byte[] a, byte[] b) { - int n = a.length+1; - int m = b.length+1; - System.out.print(" "); - for ( int j = 1 ; j < m ; j++) System.out.printf(" %5c",(char)b[j-1]) ; - System.out.println(); - - for ( int i = 0, row_offset = 0 ; i < n ; i++, row_offset+=m) { - if ( i > 0 ) System.out.print((char)a[i-1]); - else System.out.print(' '); - System.out.print(" "); - for ( int j = 0; j < m ; j++ ) { - System.out.printf(" %5.1f",s[row_offset+j]); - } - System.out.println(); - } - } - - static void printAlignment(SWPairwiseAlignment a, byte[] ref, byte[] read) { - printAlignment(a,ref,read,100); + public void printAlignment(byte[] ref, byte[] read) { + printAlignment(ref,read,100); } - static void printAlignment(SWPairwiseAlignment a, byte[] ref, byte[] read, int width) { + public void printAlignment(byte[] ref, byte[] read, int width) { StringBuilder bread = new StringBuilder(); StringBuilder bref = new StringBuilder(); StringBuilder match = new StringBuilder(); @@ -408,9 +323,9 @@ public class SWPairwiseAlignment { int i = 0; int j = 0; - final int offset = a.getAlignmentStart2wrt1(); + final int offset = getAlignmentStart2wrt1(); - Cigar cigar = a.getCigar(); + Cigar cigar = getCigar(); if ( ! DO_SOFTCLIP ) { @@ -436,7 +351,7 @@ public class SWPairwiseAlignment { } if ( offset > 0 ) { // note: the way this implementation works, cigar will ever start from S *only* if read starts before the ref, i.e. 
offset = 0 - for ( ; i < a.getAlignmentStart2wrt1() ; i++ ) { + for ( ; i < getAlignmentStart2wrt1() ; i++ ) { bref.append((char)ref[i]); bread.append(' '); match.append(' '); @@ -506,280 +421,5 @@ public class SWPairwiseAlignment { } int end = Math.min(start+width,s.length()); System.out.println(s.substring(start,end)); - } - -// BELOW: main() method for testing; old implementations of the core methods are commented out below; -// uncomment everything through the end of the file if benchmarking of new vs old implementations is needed. - - public static void main(String argv[]) { -// String ref="CACGAGCATATGTGTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTATGTGTTCACATGCATGTGTGTCT"; -// String read = "GCATATGTTTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTGTGTTCACATGCATGTG"; - - String ref = null; - String read = null; - - Map> args = processArgs(argv); - - List l = args.get("SEQ"); - args.remove("SEQ"); - if ( l == null ) { - System.err.println("SEQ argument is missing. Two input sequences must be provided"); - System.exit(1); - } - if ( l.size() != 2 ) { - System.err.println("Two input sequences (SEQ arguments) must be provided. 
Found "+l.size()+" instead"); - System.exit(1); - } - - ref = l.get(0); - read = l.get(1); - - Double m = extractSingleDoubleArg("MATCH",args); - Double mm = extractSingleDoubleArg("MISMATCH",args); - Double open = extractSingleDoubleArg("OPEN",args); - Double ext = extractSingleDoubleArg("EXTEND",args); - - Boolean reverse = extractSingleBooleanArg("REVERSE",args); - if ( reverse != null && reverse.booleanValue() == true ) { - ref = Utils.reverse(ref); - read = Utils.reverse(read); - } - - Boolean print_mat = extractSingleBooleanArg("PRINT_MATRIX",args); - Boolean cut = extractSingleBooleanArg("CUTOFF",args); - if ( cut != null ) SWPairwiseAlignment.cutoff = cut; - - if ( args.size() != 0 ) { - System.err.println("Unknown argument on the command line: "+args.keySet().iterator().next()); - System.exit(1); - } - - double w_match; - double w_mismatch; - double w_open; - double w_extend; - - w_match = (m == null ? 30.0 : m.doubleValue()); - w_mismatch = (mm == null ? -10.0 : mm.doubleValue()); - w_open = (open == null ? -10.0 : open.doubleValue()); - w_extend = (ext == null ? 
-2.0 : ext.doubleValue()); - - - SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend); - - System.out.println("start="+a.getAlignmentStart2wrt1()+", cigar="+a.getCigar()+ - " length1="+ref.length()+" length2="+read.length()); - - - System.out.println(); - printAlignment(a,ref.getBytes(),read.getBytes()); - - System.out.println(); - if ( print_mat != null && print_mat == true ) { - a.print(a.SW,ref.getBytes(),read.getBytes()); - } - } - - - static Pair getArg(String prefix, String argv[], int i) { - String arg = null; - if ( argv[i].startsWith(prefix) ) { - arg = argv[i].substring(prefix.length()); - if( arg.length() == 0 ) { - i++; - if ( i < argv.length ) arg = argv[i]; - else { - System.err.println("No value found after " + prefix + " argument tag"); - System.exit(1); - } - } - i++; - } - return new Pair(arg,i); - } - - static Map> processArgs(String argv[]) { - Map> args = new HashMap>(); - - for ( int i = 0; i < argv.length ; i++ ) { - String arg = argv[i]; - int pos = arg.indexOf('='); - if ( pos < 0 ) { - System.err.println("Argument "+arg+" is not of the form ="); - System.exit(1); - } - String val = arg.substring(pos+1); - if ( val.length() == 0 ) { - // there was a space between '=' and the value - i++; - if ( i < argv.length ) val = argv[i]; - else { - System.err.println("No value found after " + arg + " argument tag"); - System.exit(1); - } - } - arg = arg.substring(0,pos); - - List l = args.get(arg); - if ( l == null ) { - l = new ArrayList(); - args.put(arg,l); - } - l.add(val); - } - return args; - } - - static Double extractSingleDoubleArg(String argname, Map> args) { - List l = args.get(argname); - args.remove(argname); - if ( l == null ) return null; - - if ( l.size() > 1 ) { - System.err.println("Only one "+argname+" argument is allowed"); - System.exit(1); - } - double d=0; - try { - d = Double.parseDouble(l.get(0)); - } catch ( NumberFormatException e) { - System.err.println("Can not 
parse value provided for "+argname+" argument ("+l.get(0)+")"); - System.exit(1); - } - System.out.println("Argument "+argname+" set to "+d); - return new Double(d); - } - - - static Boolean extractSingleBooleanArg(String argname, Map> args) { - List l = args.get(argname); - args.remove(argname); - if ( l == null ) return null; - - if ( l.size() > 1 ) { - System.err.println("Only one "+argname+" argument is allowed"); - System.exit(1); - } - if ( l.get(0).equals("true") ) return Boolean.valueOf(true); - if ( l.get(0).equals("false") ) return Boolean.valueOf(false); - System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+"); true/false are allowed"); - System.exit(1); - return Boolean.valueOf(false); // This value isn't used because it is preceded by System.exit(1) - } - -/* ############################################## - public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend, boolean runOld ) { - w_match = match; - w_mismatch = mismatch; - w_open = open; - w_extend = extend; - if ( runOld ) align_old(seq1,seq2); - else align(seq1,seq2); - } - - public SWPairwiseAlignment(byte[] seq1, byte[] seq2, boolean runOld) { - this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0,runOld); // match=1, mismatch = -1/3, gap=-(1+k/3) - } - - public void align_old(final byte[] a, final byte[] b) { - final int n = a.length; - final int m = b.length; - double [] sw = new double[(n+1)*(m+1)]; - int [] btrack = new int[(n+1)*(m+1)]; - calculateMatrix_old(a, b, sw, btrack); - calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) - } - - private void calculateMatrix_old(final byte[] a, final byte[] b, double [] sw, int [] btrack ) { - final int n = a.length+1; - final int m = b.length+1; - - // build smith-waterman matrix and keep backtrack info: - for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at 
the end of this outer loop - byte a_base = a[i-1]; // letter in a at the current pos - - final int row_offset = row_offset_1 + m; - - // On the entrance into the loop, row_offset_1 is the (linear) offset - // of the first element of row (i-1) and row_offset is the linear offset of the - // start of row i - - for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) { - - // data_offset_1 is linearized offset of element [i-1][j-1] - - final byte b_base = b[j-1]; // letter in b at the current pos - - // in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base); - double step_diag = sw[data_offset_1] + wd(a_base,b_base); - int kd = 0; - - double step_down = 0; - - for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) { - // data_offset_k is linearized offset of element [i-k][j] - // in other words, trial = sw[i-k][j]+gap_penalty: - final double trial = sw[data_offset_k]+wk(k); - if ( step_down < trial ) { - step_down=trial; - kd = k; - } - } - - int ki = 0; - - // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing - // all 'step right' events that would end in the current cell. The optimized code - // does exactly the same thing as the commented out loop below. IMPORTANT: - // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! 
- - double step_right = 0; - - for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) { - // data_offset is linearized offset of element [i][j-k] - // in other words, step_right=sw[i][j-k]+gap_penalty; - final double trial = sw[data_offset]+wk(k); - if ( step_right < trial ) { - step_right=trial; - ki = k; - } - } - - final int data_offset = row_offset + j; // linearized offset of element [i][j] - - if ( step_down > step_right ) { - if ( step_down > step_diag ) { - sw[data_offset] = Math.max(0,step_down); - btrack[data_offset] = kd ; // positive=vertical - } else { - sw[data_offset] = Math.max(0,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } else { - // step_down <= step_right - if ( step_right > step_diag ) { - sw[data_offset] = Math.max(0,step_right); - btrack[data_offset] = -ki; // negative = horizontal - } else { - sw[data_offset] = Math.max(0,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } - -// sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); - } - - // IMPORTANT, IMPORTANT, IMPORTANT: - // note that we update this (secondary) outer loop variable here, - // so that we DO NOT need to update it - // in the for() statement itself. 
- row_offset_1 = row_offset; - } -// print(sw,a,b); - } -##################### -END COMMENTED OUT SECTION -*/ - } diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java new file mode 100644 index 000000000..a49d7e5e6 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Simple program to run SW performance test. 
+ * + * // TODO -- should be replaced with Caliper before using again + * + * User: depristo + * Date: 2/28/13 + * Time: 4:54 PM + * To change this template use File | Settings | File Templates. + */ +public class SWPairwiseAlignmentMain { + // BELOW: main() method for testing; old implementations of the core methods are commented out below; +// uncomment everything through the end of the file if benchmarking of new vs old implementations is needed. + + public static void main(String argv[]) { +// String ref="CACGAGCATATGTGTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTATGTGTTCACATGCATGTGTGTCT"; +// String read = "GCATATGTTTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTGTGTTCACATGCATGTG"; + + String ref = null; + String read = null; + + Map> args = processArgs(argv); + + List l = args.get("SEQ"); + args.remove("SEQ"); + if ( l == null ) { + System.err.println("SEQ argument is missing. Two input sequences must be provided"); + System.exit(1); + } + if ( l.size() != 2 ) { + System.err.println("Two input sequences (SEQ arguments) must be provided. Found "+l.size()+" instead"); + System.exit(1); + } + + ref = l.get(0); + read = l.get(1); + + Double m = extractSingleDoubleArg("MATCH",args); + Double mm = extractSingleDoubleArg("MISMATCH",args); + Double open = extractSingleDoubleArg("OPEN",args); + Double ext = extractSingleDoubleArg("EXTEND",args); + + Boolean reverse = extractSingleBooleanArg("REVERSE",args); + if ( reverse != null && reverse.booleanValue() == true ) { + ref = Utils.reverse(ref); + read = Utils.reverse(read); + } + + Boolean print_mat = extractSingleBooleanArg("PRINT_MATRIX",args); + Boolean cut = extractSingleBooleanArg("CUTOFF",args); + if ( cut != null ) SWPairwiseAlignment.cutoff = cut; + + if ( args.size() != 0 ) { + System.err.println("Unknown argument on the command line: "+args.keySet().iterator().next()); + System.exit(1); + } + + double w_match; + double w_mismatch; + double w_open; + double w_extend; + + w_match = (m == null ? 
30.0 : m.doubleValue()); + w_mismatch = (mm == null ? -10.0 : mm.doubleValue()); + w_open = (open == null ? -10.0 : open.doubleValue()); + w_extend = (ext == null ? -2.0 : ext.doubleValue()); + + + SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend); + + System.out.println("start="+a.getAlignmentStart2wrt1()+", cigar="+a.getCigar()+ + " length1="+ref.length()+" length2="+read.length()); + + + System.out.println(); + a.printAlignment(ref.getBytes(),read.getBytes()); + + System.out.println(); + if ( print_mat != null && print_mat == true ) { + print(a.SW,ref.getBytes(),read.getBytes()); + } + } + + private static void print(double[] s, byte[] a, byte[] b) { + int n = a.length+1; + int m = b.length+1; + System.out.print(" "); + for ( int j = 1 ; j < m ; j++) System.out.printf(" %5c",(char)b[j-1]) ; + System.out.println(); + + for ( int i = 0, row_offset = 0 ; i < n ; i++, row_offset+=m) { + if ( i > 0 ) System.out.print((char)a[i-1]); + else System.out.print(' '); + System.out.print(" "); + for ( int j = 0; j < m ; j++ ) { + System.out.printf(" %5.1f",s[row_offset+j]); + } + System.out.println(); + } + } + + + static Pair getArg(String prefix, String argv[], int i) { + String arg = null; + if ( argv[i].startsWith(prefix) ) { + arg = argv[i].substring(prefix.length()); + if( arg.length() == 0 ) { + i++; + if ( i < argv.length ) arg = argv[i]; + else { + System.err.println("No value found after " + prefix + " argument tag"); + System.exit(1); + } + } + i++; + } + return new Pair(arg,i); + } + + static Map> processArgs(String argv[]) { + Map> args = new HashMap>(); + + for ( int i = 0; i < argv.length ; i++ ) { + String arg = argv[i]; + int pos = arg.indexOf('='); + if ( pos < 0 ) { + System.err.println("Argument "+arg+" is not of the form ="); + System.exit(1); + } + String val = arg.substring(pos+1); + if ( val.length() == 0 ) { + // there was a space between '=' and the value + i++; + if ( i < argv.length 
) val = argv[i]; + else { + System.err.println("No value found after " + arg + " argument tag"); + System.exit(1); + } + } + arg = arg.substring(0,pos); + + List l = args.get(arg); + if ( l == null ) { + l = new ArrayList(); + args.put(arg,l); + } + l.add(val); + } + return args; + } + + static Double extractSingleDoubleArg(String argname, Map> args) { + List l = args.get(argname); + args.remove(argname); + if ( l == null ) return null; + + if ( l.size() > 1 ) { + System.err.println("Only one "+argname+" argument is allowed"); + System.exit(1); + } + double d=0; + try { + d = Double.parseDouble(l.get(0)); + } catch ( NumberFormatException e) { + System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+")"); + System.exit(1); + } + System.out.println("Argument "+argname+" set to "+d); + return new Double(d); + } + + + static Boolean extractSingleBooleanArg(String argname, Map> args) { + List l = args.get(argname); + args.remove(argname); + if ( l == null ) return null; + + if ( l.size() > 1 ) { + System.err.println("Only one "+argname+" argument is allowed"); + System.exit(1); + } + if ( l.get(0).equals("true") ) return Boolean.valueOf(true); + if ( l.get(0).equals("false") ) return Boolean.valueOf(false); + System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+"); true/false are allowed"); + System.exit(1); + return Boolean.valueOf(false); // This value isn't used because it is preceded by System.exit(1) + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index d009ba5bc..45a2fa58d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -54,6 +54,17 @@ public class Utils { public static final float JAVA_DEFAULT_HASH_LOAD_FACTOR = 0.75f; + /** + * Boolean xor operation. Only true if x != y. 
+ * + * @param x a boolean + * @param y a boolean + * @return true if x != y + */ + public static boolean xor(final boolean x, final boolean y) { + return x != y; + } + /** * Calculates the optimum initial size for a hash table given the maximum number * of elements it will need to hold. The optimum size is the smallest size that diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index cc4fc6129..5e010db67 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -203,12 +203,32 @@ public class PerReadAlleleLikelihoodMap { */ @Ensures("result != null") public static Allele getMostLikelyAllele( final Map alleleMap ) { + return getMostLikelyAllele(alleleMap, null); + } + + /** + * Given a map from alleles to likelihoods, find the allele with the largest likelihood. + * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD + * then the most likely allele is set to "no call" + * + * @param alleleMap - a map from alleles to likelihoods + * @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best. + * this is useful for the case where you've selected a subset of the alleles that + * the reads have been computed for further analysis. If null totally ignored + * @return - the most likely allele, or NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD + * of one another. 
By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the + * corresponding key + */ + public static Allele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null"); double maxLike = Double.NEGATIVE_INFINITY; double prevMaxLike = Double.NEGATIVE_INFINITY; Allele mostLikelyAllele = Allele.NO_CALL; for (final Map.Entry el : alleleMap.entrySet()) { + if ( onlyConsiderTheseAlleles != null && ! onlyConsiderTheseAlleles.contains(el.getKey()) ) + continue; + if (el.getValue() > maxLike) { prevMaxLike = maxLike; maxLike = el.getValue(); @@ -220,7 +240,6 @@ public class PerReadAlleleLikelihoodMap { return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL ); } - /** * Debug method to dump contents of object into string for display */ diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java new file mode 100644 index 000000000..46ffd43b6 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * A haplotype bam writer that writes out all haplotypes as reads and then + * the alignment of reach read to its best match among the best haplotypes. 
+ * + * Primarily useful for people working on the HaplotypeCaller method itself + * + * User: depristo + * Date: 2/22/13 + * Time: 1:50 PM + */ +class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { + public AllHaplotypeBAMWriter(final SAMFileWriter bamWriter) { + super(bamWriter); + } + + /** + * {@inheritDoc} + */ + @Override + public void writeReadsAlignedToHaplotypes(final List haplotypes, + final GenomeLoc paddedReferenceLoc, + final List bestHaplotypes, + final Set calledHaplotypes, + final Map stratifiedReadMap) { + writeHaplotypesAsReads(haplotypes, new HashSet(bestHaplotypes), paddedReferenceLoc); + + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); + for ( final Haplotype haplotype : haplotypes ) + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + + // next, output the interesting reads for each sample aligned against the appropriate haplotype + for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + if ( bestAllele != Allele.NO_CALL ) + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java new file mode 100644 index 000000000..a33ed809a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a 
copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.SAMFileWriter; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * Writes a BAM containing just the reads in stratifiedReadMap aligned to their + * most likely haplotype among all of the called haplotypes. + * + * Primarily useful for users of the HaplotypeCaller who want to better understand the + * support of their calls w.r.t. the reads. 
+ * + * User: depristo + * Date: 2/22/13 + * Time: 1:50 PM + */ +class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { + public CalledHaplotypeBAMWriter(final SAMFileWriter bamWriter) { + super(bamWriter); + } + + /** + * {@inheritDoc} + */ + @Override + public void writeReadsAlignedToHaplotypes(final List haplotypes, + final GenomeLoc paddedReferenceLoc, + final List bestHaplotypes, + final Set calledHaplotypes, + final Map stratifiedReadMap) { + if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes + return; + + writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc); + + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); + for ( final Haplotype haplotype : calledHaplotypes ) { + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + } + + // the set of all alleles that were actually called + final Set allelesOfCalledHaplotypes = alleleToHaplotypeMap.keySet(); + + // next, output the interesting reads for each sample aligned against one of the called haplotypes + for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + if ( entry.getKey().getMappingQuality() > 0 ) { + final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); + if ( bestAllele != Allele.NO_CALL ) + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); + } + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java new file mode 100644 index 000000000..c0d3b38fa --- /dev/null +++ 
b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -0,0 +1,282 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +/** + * A BAMWriter that aligns reads to haplotypes and emits their best alignments to a BAM file + * + * User: depristo + * Date: 2/22/13 + * Time: 2:59 PM + */ +public abstract class HaplotypeBAMWriter { + /** + * Allows us to write out unique names for our synthetic haplotype reads + */ + private long uniqueNameCounter = 1; + + protected final static String READ_GROUP_ID = "ArtificialHaplotype"; + protected final static String HAPLOTYPE_TAG = "HC"; + + final SAMFileWriter bamWriter; + final SAMFileHeader bamHeader; + + /** + * Possible modes for writing haplotypes to BAMs + */ + public static enum Type { + /** + * A mode that's for method developers. Writes out all of the possible + * haplotypes considered, as well as reads aligned to each + */ + ALL_POSSIBLE_HAPLOTYPES, + + /** + * A mode for users. Writes out the reads aligned only to the called + * haplotypes. 
Useful to understand why the caller is calling what it is + */ + CALLED_HAPLOTYPES + } + + /** + * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer + * + * @param type the type of the writer we want to create + * @param stingSAMWriter the destination, must not be null + * @param header the header of the input BAMs used to make calls, must not be null + * @return a new HaplotypeBAMWriter + */ + public static HaplotypeBAMWriter create(final Type type, final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header) { + if ( header == null ) throw new IllegalArgumentException("header cannot be null"); + if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null"); + if ( type == null ) throw new IllegalArgumentException("type cannot be null"); + + // prepare the bam header + final SAMFileHeader bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(header.getSequenceDictionary()); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + + // include the original read groups plus a new artificial one for the haplotypes + final List readGroups = new ArrayList(header.getReadGroups()); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(READ_GROUP_ID); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + + // TODO -- this will be a performance problem at high-scale + stingSAMWriter.setPresorted(false); + stingSAMWriter.writeHeader(bamHeader); + return create(type, stingSAMWriter); + } + + /** + * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer + * + * Note that writer must have its presorted bit set to false, as reads + * may come in out of order during writing + * + * @param type the type of the writer we want to create + * @param writer the destination, must not be null + * @return a new HaplotypeBAMWriter + */ + public static HaplotypeBAMWriter create(final Type type, final SAMFileWriter writer) { + if ( writer 
== null ) throw new IllegalArgumentException("writer cannot be null"); + if ( type == null ) throw new IllegalArgumentException("type cannot be null"); + + switch ( type ) { + case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(writer); + case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(writer); + default: throw new IllegalArgumentException("Unknown type " + type); + } + } + + /** + * Create a new HaplotypeBAMWriter writing its output to bamWriter + * + * Assumes that the header has been fully initialized with a single + * read group READ_GROUP_ID + * + * @param bamWriter our output destination + */ + protected HaplotypeBAMWriter(SAMFileWriter bamWriter) { + this.bamWriter = bamWriter; + this.bamHeader = bamWriter.getFileHeader(); + } + + /** + * Write out a BAM representing for the haplotype caller at this site + * + * @param haplotypes a list of all possible haplotypes at this loc + * @param paddedReferenceLoc the span of the based reference here + * @param bestHaplotypes a list of the best (a subset of all) haplotypes that actually went forward into genotyping + * @param calledHaplotypes a list of the haplotypes at where actually called as non-reference + * @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes + */ + public abstract void writeReadsAlignedToHaplotypes(final List haplotypes, + final GenomeLoc paddedReferenceLoc, + final List bestHaplotypes, + final Set calledHaplotypes, + final Map stratifiedReadMap); + + /** + * Write out read aligned to haplotype to the BAM file + * + * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference + * via the alignment of haplotype (via its getCigar) method. 
+ * + * @param originalRead the read we want to write aligned to the reference genome + * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference + * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. + */ + protected void writeReadAgainstHaplotype(final GATKSAMRecord originalRead, + final Haplotype haplotype, + final int referenceStart) { + final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart); + if ( alignedToRef != null ) + bamWriter.addAlignment(alignedToRef); + } + + /** + * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference + * via the alignment of haplotype (via its getCigar) method. + * + * @param originalRead the read we want to write aligned to the reference genome + * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference + * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. 
+ * @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible + */ + protected GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead, + final Haplotype haplotype, + final int referenceStart) { + if ( originalRead == null ) throw new IllegalArgumentException("originalRead cannot be null"); + if ( haplotype == null ) throw new IllegalArgumentException("haplotype cannot be null"); + if ( haplotype.getCigar() == null ) throw new IllegalArgumentException("Haplotype cigar not set " + haplotype); + if ( referenceStart < 1 ) throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart); + + try { + // compute the smith-waterman alignment of read -> haplotype + final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), 5.0, -10.0, -22.0, -1.2); + //swPairwiseAlignment.printAlignment(haplotype.getBases(), originalRead.getReadBases()); + if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 ) + // sw can fail (reasons not clear) so if it happens just don't write the read + return null; + final Cigar swCigar = AlignmentUtils.consolidateCigar(swPairwiseAlignment.getCigar()); + + // since we're modifying the read we need to clone it + final GATKSAMRecord read = (GATKSAMRecord)originalRead.clone(); + + addHaplotypeTag(read, haplotype); + + // compute here the read starts w.r.t. 
the reference from the SW result and the hap -> ref cigar + final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000); + final int readStartOnHaplotype = AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1()); + final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; + read.setAlignmentStart(readStartOnReference); + + // compute the read -> ref alignment by mapping read -> hap -> ref from the + // SW of read -> hap mapped through the given by hap -> ref + final Cigar haplotypeToRef = AlignmentUtils.trimCigarByBases(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1(), extendedHaplotypeCigar.getReadLength() - 1); + final Cigar readToRefCigarRaw = AlignmentUtils.applyCigarToCigar(swCigar, haplotypeToRef); + final Cigar readToRefCigarClean = AlignmentUtils.cleanUpCigar(readToRefCigarRaw); + final Cigar readToRefCigar = AlignmentUtils.leftAlignIndel(readToRefCigarClean, haplotype.getBases(), + originalRead.getReadBases(), swPairwiseAlignment.getAlignmentStart2wrt1(), 0, true); + + read.setCigar(readToRefCigar); + + if ( readToRefCigar.getReadLength() != read.getReadLength() ) + throw new IllegalStateException("Cigar " + readToRefCigar + " with read length " + readToRefCigar.getReadLength() + + " != read length " + read.getReadLength() + " for read " + read.format() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength() + + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength()); + + return read; + } catch ( CloneNotSupportedException e ) { + throw new IllegalStateException("GATKSAMRecords should support clone but this one does not " + originalRead); + } + } + + /** + * Add a haplotype tag to the read based on haplotype + * + * @param read the read to add the tag to + * @param haplotype the 
haplotype that gives rises to read + */ + private void addHaplotypeTag(final GATKSAMRecord read, final Haplotype haplotype) { + // add a tag to the read that indicates which haplotype it best aligned to. It's a uniquish integer + read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode()); + } + + /** + * Write out haplotypes as reads to the BAM, marking specifically those that are among the best haplotypes + * + * @param haplotypes a collection of haplotypes to write to the BAM + * @param bestHaplotypes a subset of haplotypes that contains those that are best "either good or called" + * @param paddedReferenceLoc the genome loc of the padded reference + */ + protected void writeHaplotypesAsReads(final Collection haplotypes, + final Set bestHaplotypes, + final GenomeLoc paddedReferenceLoc) { + for ( final Haplotype haplotype : haplotypes ) + writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); + } + + /** + * Write out a representation of this haplotype as a read + * + * @param haplotype a haplotype to write out. Cannot be null + * @param paddedRefLoc the reference location. Cannot be null + * @param isAmongBestHaplotypes true if among the best haplotypes, false if it was just one possible but not so good + */ + private void writeHaplotype(final Haplotype haplotype, + final GenomeLoc paddedRefLoc, + final boolean isAmongBestHaplotypes) { + final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(AlignmentUtils.consolidateCigar(haplotype.getCigar())); + record.setMappingQuality(isAmongBestHaplotypes ? 
60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + addHaplotypeTag(record, haplotype); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(paddedRefLoc.getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID); + record.setFlags(16); + bamWriter.addAlignment(record); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index d34e2996c..d59d0ef63 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -31,18 +31,17 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.recalibration.EventType; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; +import java.util.*; public final class AlignmentUtils { + private final static Logger logger = Logger.getLogger(AlignmentUtils.class); private final static EnumSet ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X); private final static EnumSet ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S); @@ -58,6 +57,9 @@ public final class AlignmentUtils { return getMismatchCount(r, refSeq, refIndex).mismatchQualities; } + /** + * @see #getMismatchCount(GATKSAMRecord, byte[], int, int, int) with startOnRead == 0 and nReadBases == read.getReadLength() + */ public static MismatchCount getMismatchCount(GATKSAMRecord r, byte[] refSeq, int refIndex) { return 
getMismatchCount(r, refSeq, refIndex, 0, r.getReadLength()); } @@ -70,7 +72,10 @@ public final class AlignmentUtils { * * @param r the sam record to check against * @param refSeq the byte array representing the reference sequence - * @param refIndex the index in the reference byte array of the read's first base (the reference index is matching the alignment start, there may be tons of soft-clipped bases before/after that so it's wrong to compare with getReadLength() here.) + * @param refIndex the index in the reference byte array of the read's first base (the reference index + * is matching the alignment start, there may be tons of soft-clipped bases before/after + * that so it's wrong to compare with getReadLength() here.). Note that refIndex is + * zero based, not 1 based * @param startOnRead the index in the read's bases from which we start counting * @param nReadBases the number of bases after (but including) startOnRead that we check * @return non-null object representing the mismatch count @@ -440,6 +445,9 @@ public final class AlignmentUtils { * Need a well-formed, consolidated Cigar string so that the left aligning code works properly. * For example, 1M1M1M1D2M1M --> 3M1D3M * If the given cigar is empty then the returned cigar will also be empty + * + * Note that this routine collapses cigar elements of size 0, so 2M0M => 2M + * * @param c the cigar to consolidate * @return a non-null cigar with consecutive matching operators merged into single operators. 
*/ @@ -450,13 +458,25 @@ public final class AlignmentUtils { final Cigar returnCigar = new Cigar(); int sumLength = 0; - for( int iii = 0; iii < c.numCigarElements(); iii++ ) { - sumLength += c.getCigarElement(iii).getLength(); - if( iii == c.numCigarElements() - 1 || !c.getCigarElement(iii).getOperator().equals(c.getCigarElement(iii+1).getOperator())) { // at the end so finish the current element - returnCigar.add(new CigarElement(sumLength, c.getCigarElement(iii).getOperator())); + CigarElement lastElement = null; + + for( final CigarElement cur : c.getCigarElements() ) { + if ( cur.getLength() == 0 ) + continue; // don't add elements of 0 length + + if ( lastElement != null && lastElement.getOperator() != cur.getOperator() ) { + returnCigar.add(new CigarElement(sumLength, lastElement.getOperator())); sumLength = 0; } + + sumLength += cur.getLength(); + lastElement = cur; } + + if( sumLength > 0 ) { + returnCigar.add(new CigarElement(sumLength, lastElement.getOperator())); + } + return returnCigar; } @@ -616,7 +636,7 @@ public final class AlignmentUtils { */ @Requires("c != null") @Ensures("result != null") - private static Cigar cleanUpCigar(final Cigar c) { + public static Cigar cleanUpCigar(final Cigar c) { final List elements = new ArrayList(c.numCigarElements() - 1); for (final CigarElement ce : c.getCigarElements()) { @@ -730,4 +750,355 @@ public final class AlignmentUtils { return alt; } + + + /** + * Trim cigar down to one that starts at start reference on the left and extends to end on the reference + * + * @param cigar a non-null Cigar to trim down + * @param start Where should we start keeping bases on the reference? The first position is 0 + * @param end Where should we stop keeping bases on the reference? 
The maximum value is cigar.getReferenceLength() + * @return a new Cigar with reference length == start - end + 1 + */ + public static Cigar trimCigarByReference(final Cigar cigar, final int start, final int end) { + if ( start < 0 ) throw new IllegalArgumentException("Start must be >= 0 but got " + start); + if ( end < start ) throw new IllegalArgumentException("End " + end + " is < start start " + start); + if ( end > cigar.getReferenceLength() ) throw new IllegalArgumentException("End is beyond the cigar's reference length " + end + " for cigar " + cigar ); + + final Cigar result = trimCigar(cigar, start, end, true); + + if ( result.getReferenceLength() != end - start + 1) + throw new IllegalStateException("trimCigarByReference failure: start " + start + " end " + end + " for " + cigar + " resulted in cigar with wrong size " + result); + return result; + } + + /** + * Trim cigar down to one that starts at start base in the cigar and extends to (inclusive) end base + * + * @param cigar a non-null Cigar to trim down + * @param start Where should we start keeping bases in the cigar? The first position is 0 + * @param end Where should we stop keeping bases in the cigar? 
The maximum value is cigar.getReadLength() + * @return a new Cigar containing == start - end + 1 reads + */ + public static Cigar trimCigarByBases(final Cigar cigar, final int start, final int end) { + if ( start < 0 ) throw new IllegalArgumentException("Start must be >= 0 but got " + start); + if ( end < start ) throw new IllegalArgumentException("End " + end + " is < start start " + start); + if ( end > cigar.getReadLength() ) throw new IllegalArgumentException("End is beyond the cigar's read length " + end + " for cigar " + cigar ); + + final Cigar result = trimCigar(cigar, start, end, false); + + final int expectedSize = end - start + 1; + if ( result.getReadLength() != expectedSize) + throw new IllegalStateException("trimCigarByBases failure: start " + start + " end " + end + " for " + cigar + " resulted in cigar with wrong size " + result + " with size " + result.getReadLength() + " expected " + expectedSize + " for input cigar " + cigar); + return result; + } + + + /** + * Workhorse for trimCigarByBases and trimCigarByReference + * + * @param cigar a non-null Cigar to trim down + * @param start Where should we start keeping bases in the cigar? The first position is 0 + * @param end Where should we stop keeping bases in the cigar? The maximum value is cigar.getReadLength() + * @param byReference should start and end be intrepreted as position in the reference or the read to trim to/from? + * @return a non-null cigar + */ + @Requires({"cigar != null", "start >= 0", "start <= end"}) + @Ensures("result != null") + private static Cigar trimCigar(final Cigar cigar, final int start, final int end, final boolean byReference) { + final List newElements = new LinkedList(); + + int pos = 0; + for ( final CigarElement elt : cigar.getCigarElements() ) { + if ( pos > end ) break; + + switch ( elt.getOperator() ) { + case D: + if ( ! 
byReference ) { + if ( pos >= start ) + newElements.add(elt); + break; + } + // otherwise fall through to the next case + case EQ: case M: case X: + pos = addCigarElements(newElements, pos, start, end, elt); + break; + case S: case I: + if ( byReference ) { + if ( pos >= start ) + newElements.add(elt); + } else { + pos = addCigarElements(newElements, pos, start, end, elt); + } + break; + default: + throw new IllegalStateException("Cannot handle " + elt); + } + } + + return AlignmentUtils.consolidateCigar(new Cigar(newElements)); + } + + /** + * Helper function for trimCigar that adds cigar elements (of total length X) of elt.op to dest for + * X bases that fall between start and end, where the last position of the base is pos. + * + * The primary use of this function is to create a new cigar element list that contains only + * elements that occur between start and end bases in an initial cigar. + * + * Note that this function may return multiple cigar elements (1M1M etc) that are best consolidated + * after the fact into a single simpler representation. 
+ * + * @param dest we will append our cigar elements to this list + * @param pos the position (0 indexed) where elt started + * @param start only include bases that occur >= this position + * @param end only include bases that occur <= this position + * @param elt the element we are slicing down + * @return the position after we've traversed all elt.length bases of elt + */ + protected static int addCigarElements(final List dest, int pos, final int start, final int end, final CigarElement elt) { + final int length = Math.min(pos + elt.getLength() - 1, end) - Math.max(pos, start) + 1; + if ( length > 0 ) + dest.add(new CigarElement(length, elt.getOperator())); + return pos + elt.getLength(); + } + + /** + * Get the offset (base 0) of the first reference aligned base in Cigar that occurs after readStartByBaseOfCigar base of the cigar + * + * The main purpose of this routine is to find a good start position for a read given it's cigar. The real + * challenge is that the starting base might be inside an insertion, in which case the read actually starts + * at the next M/EQ/X operator. 
+ * + * @param cigar a non-null cigar + * @param readStartByBaseOfCigar finds the first base after this (0 indexed) that aligns to the reference genome (M, EQ, X) + * @throws IllegalStateException if no such base can be found + * @return an offset into cigar + */ + public static int calcFirstBaseMatchingReferenceInCigar(final Cigar cigar, int readStartByBaseOfCigar) { + if ( cigar == null ) throw new IllegalArgumentException("cigar cannot be null"); + if ( readStartByBaseOfCigar >= cigar.getReadLength() ) throw new IllegalArgumentException("readStartByBaseOfCigar " + readStartByBaseOfCigar + " must be <= readLength " + cigar.getReadLength()); + + int hapOffset = 0, refOffset = 0; + for ( final CigarElement ce : cigar.getCigarElements() ) { + for ( int i = 0; i < ce.getLength(); i++ ) { + switch ( ce.getOperator() ) { + case M:case EQ:case X: + if ( hapOffset >= readStartByBaseOfCigar ) + return refOffset; + hapOffset++; + refOffset++; + break; + case I: case S: + hapOffset++; + break; + case D: + refOffset++; + break; + default: + throw new IllegalStateException("calcFirstBaseMatchingReferenceInCigar does not support cigar " + ce.getOperator() + " in cigar " + cigar); + } + } + } + + throw new IllegalStateException("Never found appropriate matching state for cigar " + cigar + " given start of " + readStartByBaseOfCigar); + } + + /** + * Generate a new Cigar that maps the operations of the first cigar through those in a second + * + * For example, if first is 5M and the second is 2M1I2M then the result is 2M1I2M. 
+ * However, if first is 1M2D3M and second is 2M1I3M this results in a cigar X + * + * ref : AC-GTA + * hap : ACxGTA - 2M1I3M + * read : A--GTA - 1M2D3M + * result: A--GTA => 1M1D3M + * + * ref : ACxG-TA + * hap : AC-G-TA - 2M1D3M + * read : AC-GxTA - 3M1I2M + * result: AC-GxTA => 2M1D1M1I2M + * + * ref : ACGTA + * hap : ACGTA - 5M + * read : A-GTA - 1M1I3M + * result: A-GTA => 1M1I3M + * + * ref : ACGTAC + * hap : AC---C - 2M3D1M + * read : AC---C - 3M + * result: AG---C => 2M3D + * + * The constraint here is that both cigars should imply that the result have the same number of + * reference bases (i.e.g, cigar.getReferenceLength() are equals). + * + * @param firstToSecond the cigar mapping hap1 -> hap2 + * @param secondToThird the cigar mapping hap2 -> hap3 + * @return A cigar mapping hap1 -> hap3 + */ + public static Cigar applyCigarToCigar(final Cigar firstToSecond, final Cigar secondToThird) { + final boolean DEBUG = false; + + final List newElements = new LinkedList(); + final int nElements12 = firstToSecond.getCigarElements().size(); + final int nElements23 = secondToThird.getCigarElements().size(); + + int cigar12I = 0, cigar23I = 0; + int elt12I = 0, elt23I = 0; + + while ( cigar12I < nElements12 && cigar23I < nElements23 ) { + final CigarElement elt12 = firstToSecond.getCigarElement(cigar12I); + final CigarElement elt23 = secondToThird.getCigarElement(cigar23I); + + final CigarPairTransform transform = getTransformer(elt12.getOperator(), elt23.getOperator()); + + if ( DEBUG ) + System.out.printf("Transform %s => %s with elt1 = %d %s @ %d elt2 = %d %s @ %d with transform %s%n", + firstToSecond, secondToThird, cigar12I, elt12.getOperator(), elt12I, cigar23I, elt23.getOperator(), elt23I, transform); + + if ( transform.op13 != null ) // skip no ops + newElements.add(new CigarElement(1, transform.op13)); + + elt12I += transform.advance12; + elt23I += transform.advance23; + + // if have exhausted our current element, advance to the next one + if ( elt12I == 
elt12.getLength() ) { cigar12I++; elt12I = 0; } + if ( elt23I == elt23.getLength() ) { cigar23I++; elt23I = 0; } + } + + return AlignmentUtils.consolidateCigar(new Cigar(newElements)); + } + + private static CigarPairTransform getTransformer(final CigarOperator op12, final CigarOperator op23) { + for ( final CigarPairTransform transform : cigarPairTransformers) { + if ( transform.op12.contains(op12) && transform.op23.contains(op23) ) + return transform; + } + + throw new IllegalStateException("No transformer for operators " + op12 + " and " + op23); + } + + /** + * transformations that project one alignment state through another + * + * Think about this as a state machine, where we have: + * + * bases3 : xxx A zzz + * bases2 : xxx B zzz + * bases1 : xxx C zzz + * + * where A, B and C are alignment states of a three way alignment. We want to capture + * the transition from operation mapping 1 -> 2 and an operation mapping 2 -> 3 and its + * associated mapping from 1 -> 3 and the advancement of the cigar states of 1->2 and 2->3. + * + * Imagine that A, B, and C are all equivalent (so that op12 = M and op23 = M). This implies + * a mapping of 1->3 of M, and in this case the next states to consider in the 3 way alignment + * are the subsequent states in 1 and 2 (so that advance12 and advance23 are both 1). + * + * Obviously not all of the states and their associated transitions are so simple. Suppose instead + * that op12 = I, and op23 = M. What does this look like: + * + * bases3 : xxx - A zzz + * bases2 : xxx - B zzz + * bases1 : xxx I C zzz + * + * It means that op13 must be an insertion (as we have an extra base in 1 thats not present in 2 and + * so not present in 3). We advance the cigar in 1 by 1 (as we've consumed one base in 1 for the I) + * but we haven't yet found the base corresponding to the M of op23. So we don't advance23. 
+ */ + private static class CigarPairTransform { + private final EnumSet op12, op23; + private final CigarOperator op13; + private final int advance12, advance23; + + private CigarPairTransform(CigarOperator op12, CigarOperator op23, CigarOperator op13, int advance12, int advance23) { + this.op12 = getCigarSet(op12); + this.op23 = getCigarSet(op23); + this.op13 = op13; + this.advance12 = advance12; + this.advance23 = advance23; + } + + private static EnumSet getCigarSet(final CigarOperator masterOp) { + switch ( masterOp ) { + case M: return EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X); + case I: return EnumSet.of(CigarOperator.I, CigarOperator.S); + case D: return EnumSet.of(CigarOperator.D); + default: throw new IllegalStateException("Unexpected state " + masterOp); + } + } + + @Override + public String toString() { + return "CigarPairTransform{" + + "op12=" + op12 + + ", op23=" + op23 + + ", op13=" + op13 + + ", advance12=" + advance12 + + ", advance23=" + advance23 + + '}'; + } + } + + + private final static List cigarPairTransformers = Arrays.asList( + // + // op12 is a match + // + // 3: xxx B yyy + // ^^^^^^^^^^^^ + // 2: xxx M yyy + // 1: xxx M yyy + new CigarPairTransform(CigarOperator.M, CigarOperator.M, CigarOperator.M, 1, 1), + // 3: xxx I yyy + // ^^^^^^^^^^^^ + // 2: xxx I yyy + // 1: xxx M yyy + new CigarPairTransform(CigarOperator.M, CigarOperator.I, CigarOperator.I, 1, 1), + // 3: xxx D yyy + // ^^^^^^^^^^^^ + // 2: xxx D yyy + // 1: xxx M yyy + new CigarPairTransform(CigarOperator.M, CigarOperator.D, CigarOperator.D, 0, 1), + + // + // op12 is a deletion + // + // 3: xxx D M yyy + // ^^^^^^^^^^^^ + // 2: xxx M yyy + // 1: xxx D yyy + new CigarPairTransform(CigarOperator.D, CigarOperator.M, CigarOperator.D, 1, 1), + // 3: xxx D1 D2 yyy + // ^^^^^^^^^^^^ + // 2: xxx D2 yyy + // 1: xxx D1 yyy + new CigarPairTransform(CigarOperator.D, CigarOperator.D, CigarOperator.D, 1, 0), + // 3: xxx X yyy => no-op, we skip emitting anything here 
+ // ^^^^^^^^^^^^ + // 2: xxx I yyy + // 1: xxx D yyy + new CigarPairTransform(CigarOperator.D, CigarOperator.I, null, 1, 1), + + // + // op12 is a insertion + // + // 3: xxx I M yyy + // ^^^^^^^^^^^^ + // 2: xxx M yyy + // 1: xxx I yyy + new CigarPairTransform(CigarOperator.I, CigarOperator.M, CigarOperator.I, 1, 0), + // 3: xxx I D yyy + // ^^^^^^^^^^^^ + // 2: xxx D yyy + // 1: xxx I yyy + new CigarPairTransform(CigarOperator.I, CigarOperator.D, CigarOperator.I, 1, 0), + // 3: xxx I1 I2 yyy + // ^^^^^^^^^^^^ + // 2: xxx I2 yyy + // 1: xxx I1 yyy + new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0) + ); } diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java index 1b16266a9..0e4ec2b63 100644 --- a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java @@ -26,9 +26,11 @@ package org.broadinstitute.sting.utils; +import net.sf.picard.util.CigarUtil; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -163,4 +165,22 @@ public class HaplotypeUnitTest extends BaseTest { final Haplotype h1expected = new Haplotype(newHap.getBytes()); Assert.assertEquals(h1, h1expected); } + + private Haplotype makeHCForCigar(final String bases, final String cigar) { + final Haplotype h = new Haplotype(bases.getBytes()); + h.setCigar(TextCigarCodec.getSingleton().decode(cigar)); + return h; + } + + @Test + public void testConsolidateCigar() throws Exception { + Assert.assertEquals(makeHCForCigar("AGCT", "4M").getConsolidatedPaddedCigar(0).toString(), "4M"); + Assert.assertEquals(makeHCForCigar("AGCT", 
"4M").getConsolidatedPaddedCigar(1).toString(), "5M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1M").getConsolidatedPaddedCigar(0).toString(), "1M2I1M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1M").getConsolidatedPaddedCigar(1).toString(), "1M2I2M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1M").getConsolidatedPaddedCigar(2).toString(), "1M2I3M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(0).toString(), "1M3I"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(1).toString(), "1M3I1M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(2).toString(), "1M3I2M"); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java index 29c643153..705db6f85 100644 --- a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java @@ -75,6 +75,14 @@ public class UtilsUnitTest extends BaseTest { Assert.assertEquals(duped.charAt(0), 'b', "dupString character was incorrect"); } + @Test + public void testXor() { + Assert.assertEquals(Utils.xor(false, false), false, "xor F F failed"); + Assert.assertEquals(Utils.xor(false, true), true, "xor F T failed"); + Assert.assertEquals(Utils.xor(true, false), true, "xor T F failed"); + Assert.assertEquals(Utils.xor(true, true), false, "xor T T failed"); + } + @Test public void testDupStringMultiChar() { String duped = Utils.dupString('c',5); diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java new file mode 100644 index 000000000..43969c7a0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java @@ -0,0 +1,287 
@@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class HaplotypeBAMWriterUnitTest extends BaseTest { + private final static boolean DEBUG = false; + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + private GATKSAMRecord makeRead(final String baseString) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, 10); + final byte[] bases = baseString.getBytes(); + read.setReadBases(bases.clone()); + read.setBaseQualities(Utils.dupBytes((byte)30, read.getReadLength())); + return read; + } + + private Haplotype makeHaplotype(final String bases, final String cigar) { + final Haplotype hap = new Haplotype(bases.getBytes()); + hap.setCigar(TextCigarCodec.getSingleton().decode(cigar)); + return hap; + } + + private static class MockBAMWriter implements SAMFileWriter { + @Override + public void addAlignment(SAMRecord alignment) { + //To change body of implemented methods use File | Settings | File Templates. + } + + @Override + public SAMFileHeader getFileHeader() { + return null; //To change body of implemented methods use File | Settings | File Templates. + } + + @Override + public void close() { + //To change body of implemented methods use File | Settings | File Templates. 
+ } + } + + @Test + public void testCreate() throws Exception { + final SAMFileWriter writer = new MockBAMWriter(); + Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, writer) instanceof CalledHaplotypeBAMWriter); + Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.ALL_POSSIBLE_HAPLOTYPES, writer) instanceof AllHaplotypeBAMWriter); + } + + + ////////////////////////////////////////// + // Test HaplotypeBAMWriter.createReadAlignedToRef() // + ////////////////////////////////////////// + + @DataProvider(name = "ReadAlignedToRefData") + public Object[][] makeReadAlignedToRefData() { + List tests = new ArrayList(); + + final String hapBases = "ACTGAAGGTTCC"; + final Haplotype allM = makeHaplotype(hapBases, hapBases.length() + "M"); + + // make sure we get back a cigar of the right length + for ( int i = -1; i < hapBases.length(); i++ ) { + final GATKSAMRecord read = makeRead(hapBases); + if ( i != -1 ) read.getReadBases()[i] = (byte)'A'; + tests.add(new Object[]{read, allM, 10, 10, allM.getCigar().toString()}); + } + + // make sure insertions at the front are correctly handled + for ( int padFront = 1; padFront < 10; padFront++ ) { + final GATKSAMRecord read = makeRead(Utils.dupString("N", padFront) + hapBases); + tests.add(new Object[]{read, allM, 10, 10, padFront + "I" + allM.getCigar().toString()}); + } + + // make sure insertions at the back are correctly handled + for ( int padBack = 1; padBack < 10; padBack++ ) { + final GATKSAMRecord read = makeRead(hapBases + Utils.dupString("N", padBack)); + tests.add(new Object[]{read, allM, 10, 10, allM.getCigar().toString() + padBack + "I"}); + } + + // make sure refStart and hapStart are respected + for ( int refStart = 1; refStart < 10; refStart++ ) { + for ( int hapStart = refStart; hapStart < 10 + refStart; hapStart++ ) { + final Haplotype hap = new Haplotype(allM.getBases()); + hap.setCigar(allM.getCigar()); + hap.setAlignmentStartHapwrtRef(hapStart); + + final 
GATKSAMRecord read = makeRead(new String(hap.getBases())); + tests.add(new Object[]{read, hap, refStart, refStart + hapStart, allM.getCigar().toString()}); + } + } + + // test that reads without a good alignment to hap get excluded + { + final GATKSAMRecord read = makeRead("NNNNN"); + tests.add(new Object[]{read, allM, 10, -1, null}); + } + + // example case of bad alignment because SW doesn't necessarily left-align indels + { + final String hap = "ACTGTGGGTTCCTCTTATTTTATTTCTACATCAATGTTCATATTTAACTTATTATTTTATCTTATTTTTAAATTTCTTTTATGTTGAGCCTTGATGAAAGCCATAGGTTCTCTCATATAATTGTATGTGTATGTATGTATATGTACATAATATATACATATATGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTGTATTACATAATATATACATATATGTATATATTATGTATATGTACATAATATATACATATATG"; + final String hapCigar = "399M"; + final String readBases = "ATGTACATAATATATACATATATGTATATGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTGTATTACATAATATATACATATATGTATATATTATGTATATGTACATAATAT"; + final GATKSAMRecord read = makeRead(readBases); + final int refStart = 10130100; + final int hapStart = 500; + final String badCigar = "31M6D211M"; + final String goodCigar = "28M6D214M"; + final Haplotype badHap = new Haplotype(hap.getBytes()); + badHap.setCigar(TextCigarCodec.getSingleton().decode(hapCigar)); + badHap.setAlignmentStartHapwrtRef(hapStart); + + final int expectedPos = 10130740; + tests.add(new Object[]{read, badHap, refStart, expectedPos, goodCigar}); + } + + return tests.toArray(new Object[][]{}); + } + + + + @Test(dataProvider = "ReadAlignedToRefData", enabled = true) + public void testReadAlignedToRef(final GATKSAMRecord read, final Haplotype haplotype, final int refStart, final int expectedReadStart, final String expectedReadCigar) throws Exception { + final HaplotypeBAMWriter writer = new 
CalledHaplotypeBAMWriter(new MockBAMWriter()); + final GATKSAMRecord originalReadCopy = (GATKSAMRecord)read.clone(); + + if ( expectedReadCigar == null ) { + Assert.assertNull(writer.createReadAlignedToRef(read, haplotype, refStart)); + } else { + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedReadCigar); + final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, refStart); + + Assert.assertEquals(alignedRead.getReadName(), originalReadCopy.getReadName()); + Assert.assertEquals(alignedRead.getAlignmentStart(), expectedReadStart); + Assert.assertEquals(alignedRead.getReadBases(), originalReadCopy.getReadBases()); + Assert.assertEquals(alignedRead.getBaseQualities(), originalReadCopy.getBaseQualities()); + Assert.assertEquals(alignedRead.getAlignmentStart(), expectedReadStart); + Assert.assertEquals(alignedRead.getCigar(), expectedCigar); + Assert.assertNotNull(alignedRead.getAttribute("HC")); + } + + Assert.assertEquals(read, originalReadCopy, "createReadAlignedToRef seems be modifying the original read!"); + } + + private static class Mutation implements Comparable { + int pos, len; + CigarOperator operator; + + private Mutation(int pos, int len, CigarOperator operator) { + this.pos = pos; + this.len = len; + this.operator = operator; + } + public int getNMismatches() { return len; } + + @Override + public int compareTo(Mutation o) { + return Integer.valueOf(pos).compareTo(o.pos); + } + + private String apply(final String seq) { + switch ( operator ) { + case M: + final byte[] bases = seq.getBytes(); + if ( pos < seq.length() ) + bases[pos] = (byte)(bases[pos] == 'A' ? 
'C' : 'A'); + return new String(bases); + case I: { + final String prefix = seq.substring(0, pos); + final String postfix = seq.substring(pos, seq.length()); + return prefix + "GTCAGTTA".substring(0, len) + postfix; + } case D: { + final String prefix = seq.substring(0, pos); + final String postfix = seq.substring(pos + len, seq.length()); + return prefix + postfix; + }default: + throw new IllegalStateException("Unexpected operator " + operator); + } + } + } + + private static class MutatedSequence { + int numMismatches; + String seq; + + private MutatedSequence(int numMismatches, String seq) { + this.numMismatches = numMismatches; + this.seq = seq; + } + } + + private MutatedSequence mutateSequence(final String hapIn, final List mutations) { + Collections.sort(mutations); + int mismatches = 0; + String hap = hapIn; + for ( final Mutation mut : mutations ) { + hap = mut.apply(hap); + mismatches += mut.getNMismatches(); + } + return new MutatedSequence(mismatches, hap); + } + + @DataProvider(name = "ComplexReadAlignedToRef") + public Object[][] makeComplexReadAlignedToRef() { + List tests = new ArrayList(); + + final List allMutations = Arrays.asList( + new Mutation(1, 1, CigarOperator.M), + new Mutation(2, 1, CigarOperator.M), + new Mutation(3, 1, CigarOperator.I), + new Mutation(7, 1, CigarOperator.D) + ); + + int i = 0; + final String referenceBases = "ACTGACTGACTG"; + final String paddedReference = "NNNN" + referenceBases + "NNNN"; + for ( final List mutations : Utils.makePermutations(allMutations, 3, false) ) { + final MutatedSequence hap = mutateSequence(referenceBases, mutations); + final Haplotype haplotype = new Haplotype(hap.seq.getBytes()); + final SWPairwiseAlignment align = new SWPairwiseAlignment(paddedReference.getBytes(), hap.seq.getBytes()); + haplotype.setAlignmentStartHapwrtRef(align.getAlignmentStart2wrt1()); + haplotype.setCigar(align.getCigar()); + + for ( final List readMutations : Utils.makePermutations(allMutations, 3, false) ) { + final 
MutatedSequence readBases = mutateSequence(hap.seq, readMutations); + final GATKSAMRecord read = makeRead(readBases.seq); + tests.add(new Object[]{i++, read, paddedReference, haplotype, hap.numMismatches + readBases.numMismatches}); + } + } + + // for convenient testing of a single failing case + //tests.add(new Object[]{makeRead("ACCGGGACTGACTG"), reference, makeHaplotype("AAAGGACTGACTG", "1M1I11M"), 2}); + + return tests.toArray(new Object[][]{}); + } + + + @Test(dataProvider = "ComplexReadAlignedToRef", enabled = !DEBUG) + public void testReadAlignedToRefComplexAlignment(final int testIndex, final GATKSAMRecord read, final String reference, final Haplotype haplotype, final int expectedMaxMismatches) throws Exception { + final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockBAMWriter()); + final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, 1); + if ( alignedRead != null ) { + final int mismatches = AlignmentUtils.getMismatchCount(alignedRead, reference.getBytes(), alignedRead.getAlignmentStart() - 1).numMismatches; + Assert.assertTrue(mismatches <= expectedMaxMismatches, + "Alignment of read to ref looks broken. 
Expected at most " + expectedMaxMismatches + " but saw " + mismatches + + " for readBases " + new String(read.getReadBases()) + " with cigar " + read.getCigar() + " reference " + reference + " haplotype " + + haplotype + " with cigar " + haplotype.getCigar() + " aligned read cigar " + alignedRead.getCigarString() + " @ " + alignedRead.getAlignmentStart()); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index f845e6670..ae01c6c63 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -37,6 +37,7 @@ import org.testng.annotations.Test; import java.util.*; public class AlignmentUtilsUnitTest { + private final static boolean DEBUG = false; private SAMFileHeader header; /** Basic aligned and mapped read. */ @@ -85,7 +86,7 @@ public class AlignmentUtilsUnitTest { new Object[] {readUnknownStart, false} }; } - @Test(dataProvider = "genomeLocUnmappedReadTests") + @Test(enabled = !DEBUG, dataProvider = "genomeLocUnmappedReadTests") public void testIsReadGenomeLocUnmapped(SAMRecord read, boolean expected) { Assert.assertEquals(AlignmentUtils.isReadGenomeLocUnmapped(read), expected); } @@ -103,7 +104,7 @@ public class AlignmentUtilsUnitTest { new Object[] {readUnknownStart, true} }; } - @Test(dataProvider = "unmappedReadTests") + @Test(enabled = !DEBUG, dataProvider = "unmappedReadTests") public void testIsReadUnmapped(SAMRecord read, boolean expected) { Assert.assertEquals(AlignmentUtils.isReadUnmapped(read), expected); } @@ -160,7 +161,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "NumAlignedBasesCountingSoftClips") + @Test(enabled = !DEBUG, dataProvider = "NumAlignedBasesCountingSoftClips") public void testNumAlignedBasesCountingSoftClips(final Cigar 
cigar, final int expected) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 10 : cigar.getReadLength()); read.setCigar(cigar); @@ -180,7 +181,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "CigarHasZeroElement") + @Test(enabled = !DEBUG, dataProvider = "CigarHasZeroElement") public void testCigarHasZeroSize(final Cigar cigar, final boolean hasZero) { Assert.assertEquals(AlignmentUtils.cigarHasZeroSizeElement(cigar), hasZero, "Cigar " + cigar.toString() + " failed cigarHasZeroSizeElement"); } @@ -200,7 +201,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "NumHardClipped") + @Test(enabled = !DEBUG, dataProvider = "NumHardClipped") public void testNumHardClipped(final Cigar cigar, final int expected) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 10 : cigar.getReadLength()); read.setCigar(cigar); @@ -227,49 +228,54 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "NumAlignedBlocks") + @Test(enabled = !DEBUG, dataProvider = "NumAlignedBlocks") public void testNumAlignedBlocks(final Cigar cigar, final int expected) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 
10 : cigar.getReadLength()); read.setCigar(cigar); Assert.assertEquals(AlignmentUtils.getNumAlignmentBlocks(read), expected, "Cigar " + cigar + " failed NumAlignedBlocks"); } - @Test - public void testConsolidateCigar() { - { - //1M1M1M1D2M1M --> 3M1D3M - List list = new ArrayList(); - list.add( new CigarElement(1, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.D)); - list.add( new CigarElement(2, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.M)); - Cigar unconsolidatedCigar = new Cigar(list); + @DataProvider(name = "ConsolidateCigarData") + public Object[][] makeConsolidateCigarData() { + List tests = new ArrayList(); - list.clear(); - list.add( new CigarElement(3, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.D)); - list.add( new CigarElement(3, CigarOperator.M)); - Cigar consolidatedCigar = new Cigar(list); + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{"1M1M", "2M"}); + tests.add(new Object[]{"2M", "2M"}); + tests.add(new Object[]{"2M0M", "2M"}); + tests.add(new Object[]{"0M2M", "2M"}); + tests.add(new Object[]{"0M2M0M0I0M1M", "3M"}); + tests.add(new Object[]{"2M0M1M", "3M"}); + tests.add(new Object[]{"1M1M1M1D2M1M", "3M1D3M"}); + tests.add(new Object[]{"6M6M6M", "18M"}); - Assert.assertEquals(consolidatedCigar.toString(), AlignmentUtils.consolidateCigar(unconsolidatedCigar).toString()); + final List elements = new LinkedList(); + int i = 1; + for ( final CigarOperator op : CigarOperator.values() ) { + elements.add(new CigarElement(i++, op)); + } + for ( final List ops : Utils.makePermutations(elements, 3, false) ) { + final String expected = new Cigar(ops).toString(); + final List cutElements = new LinkedList(); + for ( final CigarElement elt : ops ) { + for ( int j = 0; j < elt.getLength(); j++ ) { + cutElements.add(new 
CigarElement(1, elt.getOperator())); + } + } + + final String actual = new Cigar(cutElements).toString(); + tests.add(new Object[]{actual, expected}); } - { - //6M6M6M --> 18M - List list = new ArrayList(); - list.add( new CigarElement(6, CigarOperator.M)); - list.add( new CigarElement(6, CigarOperator.M)); - list.add( new CigarElement(6, CigarOperator.M)); - Cigar unconsolidatedCigar = new Cigar(list); + return tests.toArray(new Object[][]{}); + } - list.clear(); - list.add( new CigarElement(18, CigarOperator.M)); - Cigar consolidatedCigar = new Cigar(list); - - Assert.assertEquals(consolidatedCigar.toString(), AlignmentUtils.consolidateCigar(unconsolidatedCigar).toString()); - } + @Test(enabled = !DEBUG, dataProvider = "ConsolidateCigarData") + public void testConsolidateCigarWithData(final String testCigarString, final String expectedCigarString) { + final Cigar testCigar = TextCigarCodec.getSingleton().decode(testCigarString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.consolidateCigar(testCigar); + Assert.assertEquals(actualCigar, expectedCigar); } @DataProvider(name = "SoftClipsDataProvider") @@ -304,7 +310,7 @@ public class AlignmentUtilsUnitTest { return array; } - @Test(dataProvider = "SoftClipsDataProvider") + @Test(enabled = !DEBUG, dataProvider = "SoftClipsDataProvider") public void testSoftClipsData(final byte[] qualsOfSoftClipsOnLeft, final int middleSize, final String middleOp, final byte[] qualOfSoftClipsOnRight, final int qualThreshold, final int numExpected) { final int readLength = (middleOp.equals("D") ? 
0 : middleSize) + qualOfSoftClipsOnRight.length + qualsOfSoftClipsOnLeft.length; final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLength); @@ -391,7 +397,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "MismatchCountDataProvider") + @Test(enabled = !DEBUG, dataProvider = "MismatchCountDataProvider") public void testMismatchCountData(final GATKSAMRecord read, final int refIndex, final int startOnRead, final int basesToRead, final boolean isMismatch) { final byte[] reference = Utils.dupBytes((byte)'A', 100); final int actual = AlignmentUtils.getMismatchCount(read, reference, refIndex, startOnRead, basesToRead).numMismatches; @@ -476,7 +482,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "AlignmentByteArrayOffsetDataProvider") + @Test(enabled = !DEBUG, dataProvider = "AlignmentByteArrayOffsetDataProvider") public void testAlignmentByteArrayOffsetData(final Cigar cigar, final int offset, final int expectedResult, final boolean isDeletion, final int lengthOfSoftClip) { final int actual = AlignmentUtils.calcAlignmentByteArrayOffset(cigar, isDeletion ? 
-1 : offset, isDeletion, 20, 20 + offset - lengthOfSoftClip); Assert.assertEquals(actual, expectedResult, "Wrong alignment offset detected for cigar " + cigar.toString()); @@ -514,7 +520,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "ReadToAlignmentByteArrayDataProvider") + @Test(enabled = !DEBUG, dataProvider = "ReadToAlignmentByteArrayDataProvider") public void testReadToAlignmentByteArrayData(final Cigar cigar, final int expectedLength, final char middleOp, final int startOfIndelBases, final int lengthOfDeletion) { final byte[] read = Utils.dupBytes((byte)'A', cigar.getReadLength()); final byte[] alignment = AlignmentUtils.readToAlignmentByteArray(cigar, read); @@ -645,9 +651,273 @@ public class AlignmentUtilsUnitTest { return readString; } - @Test(dataProvider = "LeftAlignIndelDataProvider", enabled = true) + @Test(enabled = !DEBUG, dataProvider = "LeftAlignIndelDataProvider") public void testLeftAlignIndelData(final Cigar originalCigar, final Cigar expectedCigar, final byte[] reference, final byte[] read, final int repeatLength) { final Cigar actualCigar = AlignmentUtils.leftAlignIndel(originalCigar, reference, read, 0, 0, true); Assert.assertTrue(expectedCigar.equals(actualCigar), "Wrong left alignment detected for cigar " + originalCigar.toString() + " to " + actualCigar.toString() + " but expected " + expectedCigar.toString() + " with repeat length " + repeatLength); } + + ////////////////////////////////////////// + // Test AlignmentUtils.trimCigarByReference() // + ////////////////////////////////////////// + + @DataProvider(name = "TrimCigarData") + public Object[][] makeTrimCigarData() { + List tests = new ArrayList(); + + for ( final CigarOperator op : Arrays.asList(CigarOperator.D, CigarOperator.EQ, CigarOperator.X, CigarOperator.M) ) { + for ( int myLength = 1; myLength < 6; myLength++ ) { + for ( int start = 0; start < myLength - 1; start++ ) { + for ( int end = start; end < myLength; 
end++ ) { + final int length = end - start + 1; + + final List padOps = Arrays.asList(CigarOperator.D, CigarOperator.M); + for ( final CigarOperator padOp: padOps) { + for ( int leftPad = 0; leftPad < 2; leftPad++ ) { + for ( int rightPad = 0; rightPad < 2; rightPad++ ) { + tests.add(new Object[]{ + (leftPad > 0 ? leftPad + padOp.toString() : "") + myLength + op.toString() + (rightPad > 0 ? rightPad + padOp.toString() : ""), + start + leftPad, + end + leftPad, + length + op.toString()}); + } + } + } + } + } + } + } + + for ( final int leftPad : Arrays.asList(0, 1, 2, 5) ) { + for ( final int rightPad : Arrays.asList(0, 1, 2, 5) ) { + final int length = leftPad + rightPad; + if ( length > 0 ) { + for ( final int insSize : Arrays.asList(1, 10) ) { + for ( int start = 0; start <= leftPad; start++ ) { + for ( int stop = leftPad; stop < length; stop++ ) { + final int leftPadRemaining = leftPad - start; + final int rightPadRemaining = stop - leftPad + 1; + final String insC = insSize + "I"; + tests.add(new Object[]{ + leftPad + "M" + insC + rightPad + "M", + start, + stop, + (leftPadRemaining > 0 ? leftPadRemaining + "M" : "") + insC + (rightPadRemaining > 0 ? 
rightPadRemaining + "M" : "") + }); + } + } + } + } + } + } + + tests.add(new Object[]{"3M2D4M", 0, 8, "3M2D4M"}); + tests.add(new Object[]{"3M2D4M", 2, 8, "1M2D4M"}); + tests.add(new Object[]{"3M2D4M", 2, 6, "1M2D2M"}); + tests.add(new Object[]{"3M2D4M", 3, 6, "2D2M"}); + tests.add(new Object[]{"3M2D4M", 4, 6, "1D2M"}); + tests.add(new Object[]{"3M2D4M", 5, 6, "2M"}); + tests.add(new Object[]{"3M2D4M", 6, 6, "1M"}); + + tests.add(new Object[]{"2M3I4M", 0, 5, "2M3I4M"}); + tests.add(new Object[]{"2M3I4M", 1, 5, "1M3I4M"}); + tests.add(new Object[]{"2M3I4M", 1, 4, "1M3I3M"}); + tests.add(new Object[]{"2M3I4M", 2, 4, "3I3M"}); + tests.add(new Object[]{"2M3I4M", 2, 3, "3I2M"}); + tests.add(new Object[]{"2M3I4M", 2, 2, "3I1M"}); + tests.add(new Object[]{"2M3I4M", 3, 4, "2M"}); + tests.add(new Object[]{"2M3I4M", 3, 3, "1M"}); + tests.add(new Object[]{"2M3I4M", 4, 4, "1M"}); + + // this doesn't work -- but I'm not sure it should + // tests.add(new Object[]{"2M3I4M", 2, 1, "3I"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimCigarData", enabled = ! 
DEBUG) + public void testTrimCigar(final String cigarString, final int start, final int length, final String expectedCigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.trimCigarByReference(cigar, start, length); + Assert.assertEquals(actualCigar, expectedCigar); + } + + @DataProvider(name = "TrimCigarByBasesData") + public Object[][] makeTrimCigarByBasesData() { + List tests = new ArrayList(); + + tests.add(new Object[]{"2M3I4M", 0, 8, "2M3I4M"}); + tests.add(new Object[]{"2M3I4M", 1, 8, "1M3I4M"}); + tests.add(new Object[]{"2M3I4M", 2, 8, "3I4M"}); + tests.add(new Object[]{"2M3I4M", 3, 8, "2I4M"}); + tests.add(new Object[]{"2M3I4M", 4, 8, "1I4M"}); + tests.add(new Object[]{"2M3I4M", 4, 7, "1I3M"}); + tests.add(new Object[]{"2M3I4M", 4, 6, "1I2M"}); + tests.add(new Object[]{"2M3I4M", 4, 5, "1I1M"}); + tests.add(new Object[]{"2M3I4M", 4, 4, "1I"}); + tests.add(new Object[]{"2M3I4M", 5, 5, "1M"}); + + tests.add(new Object[]{"2M2D2I", 0, 3, "2M2D2I"}); + tests.add(new Object[]{"2M2D2I", 1, 3, "1M2D2I"}); + tests.add(new Object[]{"2M2D2I", 2, 3, "2D2I"}); + tests.add(new Object[]{"2M2D2I", 3, 3, "1I"}); + tests.add(new Object[]{"2M2D2I", 2, 2, "2D1I"}); + tests.add(new Object[]{"2M2D2I", 1, 2, "1M2D1I"}); + tests.add(new Object[]{"2M2D2I", 1, 1, "1M"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimCigarByBasesData", enabled = !DEBUG) + public void testTrimCigarByBase(final String cigarString, final int start, final int length, final String expectedCigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.trimCigarByBases(cigar, start, length); + Assert.assertEquals(actualCigar, expectedCigar); + } + + 
////////////////////////////////////////// + // Test AlignmentUtils.applyCigarToCigar() // + ////////////////////////////////////////// + + @DataProvider(name = "ApplyCigarToCigarData") + public Object[][] makeApplyCigarToCigarData() { + List tests = new ArrayList(); + + for ( int i = 1; i < 5; i++ ) + tests.add(new Object[]{i + "M", i + "M", i + "M"}); + +// * ref : ACGTAC +// * hap : AC---C - 2M3D1M +// * read : AC---C - 3M +// * result: AG---C => 2M3D + tests.add(new Object[]{"3M", "2M3D1M", "2M3D1M"}); + +// * ref : ACxG-TA +// * hap : AC-G-TA - 2M1D3M +// * read : AC-GxTA - 3M1I2M +// * result: AC-GxTA => 2M1D1M1I2M + tests.add(new Object[]{"3M1I2M", "2M1D3M", "2M1D1M1I2M"}); + +// * ref : A-CGTA +// * hap : A-CGTA - 5M +// * read : AxCGTA - 1M1I4M +// * result: AxCGTA => 1M1I4M + tests.add(new Object[]{"1M1I4M", "5M", "1M1I4M"}); + +// * ref : ACGTA +// * hap : ACGTA - 5M +// * read : A--TA - 1M2D2M +// * result: A--TA => 1M2D2M + tests.add(new Object[]{"1M2D2M", "5M", "1M2D2M"}); + +// * ref : AC-GTA +// * hap : ACxGTA - 2M1I3M +// * read : A--GTA - 1M2D3M +// * result: A--GTA => 1M1D3M + tests.add(new Object[]{"108M14D24M2M18I29M92M1000M", "2M1I3M", "2M1I3M"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ApplyCigarToCigarData", enabled = !DEBUG) + public void testApplyCigarToCigar(final String firstToSecondString, final String secondToThirdString, final String expectedCigarString) { + final Cigar firstToSecond = TextCigarCodec.getSingleton().decode(firstToSecondString); + final Cigar secondToThird = TextCigarCodec.getSingleton().decode(secondToThirdString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.applyCigarToCigar(firstToSecond, secondToThird); + Assert.assertEquals(actualCigar, expectedCigar); + } + + ////////////////////////////////////////// + // Test AlignmentUtils.applyCigarToCigar() // + ////////////////////////////////////////// + 
+ @DataProvider(name = "ReadOffsetFromCigarData") + public Object[][] makeReadOffsetFromCigarData() { + List tests = new ArrayList(); + + final int SIZE = 10; + for ( int i = 0; i < SIZE; i++ ) { + tests.add(new Object[]{SIZE + "M", i, i}); + } + + // 0123ii45 + // ref : ACGT--AC + // hap : AC--xxAC (2M2D2I2M) + // ref.pos: 01 45 + tests.add(new Object[]{"2M2D2I2M", 0, 0}); + tests.add(new Object[]{"2M2D2I2M", 1, 1}); + tests.add(new Object[]{"2M2D2I2M", 2, 4}); + tests.add(new Object[]{"2M2D2I2M", 3, 4}); + tests.add(new Object[]{"2M2D2I2M", 4, 4}); + tests.add(new Object[]{"2M2D2I2M", 5, 5}); + + // 10132723 - 10132075 - 500 = 148 + // what's the offset of the first match after the I? + // 108M + 14D + 24M + 2M = 148 + // What's the offset of the first base that is after the I? + // 108M + 24M + 2M + 18I = 134M + 18I = 152 - 1 = 151 + tests.add(new Object[]{"108M14D24M2M18I29M92M", 0, 0}); + tests.add(new Object[]{"108M14D24M2M18I29M92M", 107, 107}); + tests.add(new Object[]{"108M14D24M2M18I29M92M", 108, 108 + 14}); // first base after the deletion + + tests.add(new Object[]{"108M14D24M2M18I29M92M", 132, 132+14}); // 2 before insertion + tests.add(new Object[]{"108M14D24M2M18I29M92M", 133, 133+14}); // last base before insertion + + // entering into the insertion + for ( int i = 0; i < 18; i++ ) { + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+i, 148}); // inside insertion + } + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+18, 148}); // first base after insertion matches at same as insertion + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+18+1, 149}); + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+18+2, 150}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadOffsetFromCigarData", enabled = !DEBUG) + public void testReadOffsetFromCigar(final String cigarString, final int startOnCigar, final int expectedOffset) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final int actualOffset 
= AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(cigar, startOnCigar); + Assert.assertEquals(actualOffset, expectedOffset); + } + + ////////////////////////////////////////// + // Test AlignmentUtils.addCigarElements() // + ////////////////////////////////////////// + + @DataProvider(name = "AddCigarElementsData") + public Object[][] makeAddCigarElementsData() { + List tests = new ArrayList(); + + final int SIZE = 10; + for ( final CigarOperator op : Arrays.asList(CigarOperator.I, CigarOperator.M, CigarOperator.S, CigarOperator.EQ, CigarOperator.X)) { + for ( int start = 0; start < SIZE; start++ ) { + for ( int end = start; end < SIZE * 2; end ++ ) { + for ( int pos = 0; pos < SIZE * 3; pos++ ) { + int length = 0; + for ( int i = 0; i < SIZE; i++ ) length += (i+pos) >= start && (i+pos) <= end ? 1 : 0; + tests.add(new Object[]{SIZE + op.toString(), pos, start, end, length > 0 ? length + op.toString() : "*"}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AddCigarElementsData", enabled = !DEBUG) + public void testAddCigarElements(final String cigarString, final int pos, final int start, final int end, final String expectedCigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final CigarElement elt = cigar.getCigarElement(0); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + + final List elts = new LinkedList(); + final int actualEndPos = AlignmentUtils.addCigarElements(elts, pos, start, end, elt); + + Assert.assertEquals(actualEndPos, pos + elt.getLength()); + Assert.assertEquals(AlignmentUtils.consolidateCigar(new Cigar(elts)), expectedCigar); + } } From d0c8105387787505eda7542aa10e53b43b69a64a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 4 Mar 2013 16:47:45 -0500 Subject: [PATCH 027/226] Cleaning up hilarious exception messages Too many users (with RNASeq reads) are hitting these exceptions that were never supposed to happen. 
Let's give them (and us) a better and clearer error message. --- .../broadinstitute/sting/utils/clipping/ReadClipper.java | 4 ++-- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 45dd55af7..eaefa3aba 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -28,8 +28,8 @@ package org.broadinstitute.sting.utils.clipping; import com.google.java.contract.Requires; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -534,7 +534,7 @@ public class ReadClipper { throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); if ( start > stop ) - throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen -- call Mauricio!", start, stop)); + throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen, please check read: %s (CIGAR: %s)", start, stop, read, read.getCigarString())); if ( start > 0 && stop < read.getReadLength() - 1) throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 709afeef5..95e0d55f3 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -29,12 +29,12 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.BaseUtils; import java.io.File; import java.util.*; @@ -485,7 +485,7 @@ public class ReadUtils { if (allowGoalNotReached) { return new Pair(CLIPPING_GOAL_NOT_REACHED, false); } else { - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + throw new ReviewedStingException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); } } @@ -506,7 +506,7 @@ public class ReadUtils { if (allowGoalNotReached) { return new Pair(CLIPPING_GOAL_NOT_REACHED, false); } else { - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + throw new ReviewedStingException(String.format("Reference coordinate corresponds to a non-existent base in the read. 
This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); } } From bbbaf9ad20fc2190da7e852ead61328b70cecf00 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 5 Mar 2013 09:06:02 -0500 Subject: [PATCH 031/226] Revert push from stable (I forgot that pushing from stable overwrites current unstable changes) --- .../sting/gatk/walkers/indels/PairHMMIndelErrorModel.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 45162fdba..e3d3c6640 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -397,7 +397,7 @@ public class PairHMMIndelErrorModel { if (previousHaplotypeSeen == null) { //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); + pairHMM.initialize(Y_METRIC_LENGTH, X_METRIC_LENGTH); } int startIndexInHaplotype = 0; From e2d41f02820f0987764b5bfbe6e3d331b679c43a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 5 Mar 2013 17:25:52 -0500 Subject: [PATCH 033/226] Turning @Output required to false By default all output is assigned to stdout if a -o is not provided. Technically this makes @Output a not required parameter, and the documentation is misleading because it's reading from the annotation. 
GSA-820 #resolve --- .../java/src/org/broadinstitute/sting/commandline/Output.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/Output.java b/public/java/src/org/broadinstitute/sting/commandline/Output.java index 6c2b143c4..47a47602a 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/Output.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Output.java @@ -64,7 +64,7 @@ public @interface Output { * fail if the type can't be populated. * @return True if the argument is required. False otherwise. */ - boolean required() default true; + boolean required() default false; /** * Should this command-line argument be exclusive of others. Should be From 78721ee09b14730c9cd054daea4d8563592330b3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 4 Mar 2013 14:13:42 -0500 Subject: [PATCH 034/226] Added new walker to split MNPs into their allelic primitives (SNPs). * Can be extended to complex alleles at some point. * Currently only works for bi-allelics (documented). * Added unit and integration tests. 
--- ...ntsToAllelicPrimitivesIntegrationTest.java | 67 +++++++++ .../VariantsToAllelicPrimitives.java | 140 ++++++++++++++++++ .../variant/GATKVariantContextUtils.java | 83 +++++++++-- .../GATKVariantContextUtilsUnitTest.java | 117 +++++++++++++++ 4 files changed, 394 insertions(+), 13 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java new file mode 100644 index 000000000..7b1b9b7d2 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitivesIntegrationTest.java @@ -0,0 +1,67 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests VariantsToAllelicPrimitives + */ +public class VariantsToAllelicPrimitivesIntegrationTest extends WalkerTest { + + @Test + public void testMNPsToSNPs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantsToAllelicPrimitives -o %s -R " + b37KGReference + " -V " + privateTestDir + "vcfWithMNPs.vcf --no_cmdline_in_header", + 1, + Arrays.asList("c5333d2e352312bdb7c5182ca3009594")); + executeTest("test MNPs To SNPs", spec); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java new file mode 100644 index 000000000..319183f28 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java @@ -0,0 +1,140 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; + +import java.util.*; + +/** + * Takes alleles from a variants file and breaks them up (if possible) into more basic/primitive alleles. + * + *

    + * For now this tool modifies only multi-nucleotide polymorphisms (MNPs) and leaves SNPs, indels, and complex substitutions as is, + * although one day it may be extended to handle the complex substitution case. + * + * This tool will take an MNP (e.g. ACCCA -> TCCCG) and break it up into separate records for each component part (A-T and A->G). + * + * Note that this tool modifies only bi-allelic variants. + * + *

    Input

    + *

    + * A variant set with any type of alleles. + *

    + * + *

    Output

    + *

    + * A VCF with alleles broken into primitive types. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T VariantsToAllelicPrimitives \
    + *   --variant input.vcf \
    + *   -o output.vcf
    + * 
    + * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +public class VariantsToAllelicPrimitives extends RodWalker { + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Output(doc="File to which variants should be written",required=true) + protected VariantContextWriter baseWriter = null; + + private VariantContextWriter vcfWriter; + + public void initialize() { + final String trackName = variantCollection.variants.getName(); + final Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); + + final Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + final Set headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder(); + + baseWriter.writeHeader(new VCFHeader(headerLines, samples)); + + vcfWriter = VariantContextWriterFactory.sortOnTheFly(baseWriter, 200); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + final Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); + + int changedSites = 0; + for ( final VariantContext vc : VCs ) + changedSites += writeVariants(vc); + + return changedSites; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + System.out.println(result + " MNPs were broken up into primitives"); + vcfWriter.close(); + } + + @Requires("vc != null") + private int writeVariants(final VariantContext vc) { + // for now, we modify only bi-allelic MNPs; update docs above if this changes + if ( vc.isBiallelic() && vc.isMNP() ) { + for ( final VariantContext splitVC : GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc) ) + vcfWriter.add(splitVC); 
+ return 1; + } else { + vcfWriter.add(vc); + return 0; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 37bd798cf..398b32669 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -989,7 +989,6 @@ public class GATKVariantContextUtils { return inputVC; final List alleles = new LinkedList(); - final GenotypesContext genotypes = GenotypesContext.create(); final Map originalToTrimmedAlleleMap = new HashMap(); for (final Allele a : inputVC.getAlleles()) { @@ -1006,17 +1005,8 @@ public class GATKVariantContextUtils { } // now we can recreate new genotypes with trimmed alleles - for ( final Genotype genotype : inputVC.getGenotypes() ) { - final List originalAlleles = genotype.getAlleles(); - final List trimmedAlleles = new ArrayList(); - for ( final Allele a : originalAlleles ) { - if ( a.isCalled() ) - trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); - else - trimmedAlleles.add(Allele.NO_CALL); - } - genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make()); - } + final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); + final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); final int start = inputVC.getStart() + (fwdTrimEnd + 1); final VariantContextBuilder builder = new VariantContextBuilder(inputVC); @@ -1027,6 +1017,18 @@ public class GATKVariantContextUtils { return builder.make(); } + @Requires("originalGenotypes != null && alleleMapper != null") + protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { + final GenotypesContext updatedGenotypes = GenotypesContext.create(); + + for ( final Genotype 
genotype : originalGenotypes ) { + final List updatedAlleles = alleleMapper.remap(genotype.getAlleles()); + updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); + } + + return updatedGenotypes; + } + public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref) { int clipping = 0; boolean stillClipping = true; @@ -1263,7 +1265,7 @@ public class GATKVariantContextUtils { } - private static class AlleleMapper { + protected static class AlleleMapper { private VariantContext vc = null; private Map map = null; public AlleleMapper(VariantContext vc) { this.vc = vc; } @@ -1323,4 +1325,59 @@ public class GATKVariantContextUtils { } return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make(); } + + /** + * Splits the alleles for the provided variant context into its primitive parts. + * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. + * Currently works only for MNPs. 
+ * + * @param vc the non-null VC to split + * @return a non-empty list of VCs split into primitive parts or the original VC otherwise + */ + public static List splitIntoPrimitiveAlleles(final VariantContext vc) { + if ( vc == null ) + throw new IllegalArgumentException("Trying to break a null Variant Context into primitive parts"); + + if ( !vc.isBiallelic() ) + throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts"); + + // currently only works for MNPs + if ( !vc.isMNP() ) + return Arrays.asList(vc); + + final byte[] ref = vc.getReference().getBases(); + final byte[] alt = vc.getAlternateAllele(0).getBases(); + + if ( ref.length != alt.length ) + throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); + + final List result = new ArrayList(ref.length); + + for ( int i = 0; i < ref.length; i++ ) { + + // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) + if ( ref[i] != alt[i] ) { + + // create the ref and alt SNP alleles + final Allele newRefAllele = Allele.create(ref[i], true); + final Allele newAltAllele = Allele.create(alt[i], false); + + // create a new VariantContext with the new SNP alleles + final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); + + // create new genotypes with updated alleles + final Map alleleMap = new HashMap(); + alleleMap.put(vc.getReference(), newRefAllele); + alleleMap.put(vc.getAlternateAllele(0), newAltAllele); + final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); + + result.add(newVC.genotypes(newGenotypes).make()); + } + } + + if ( result.isEmpty() ) + result.add(vc); + + return result; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 2a15d709a..ff42abb23 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.utils.variant; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.*; @@ -976,4 +978,119 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Assert.assertEquals(trimmed.getBaseString(), expected.get(i)); } } + + // -------------------------------------------------------------------------------- + // + // test primitive allele splitting + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "PrimitiveAlleleSplittingData") + public Object[][] makePrimitiveAlleleSplittingData() { + List tests = new ArrayList(); + + // no split + tests.add(new Object[]{"A", "C", 0, null}); + tests.add(new Object[]{"A", "AC", 0, null}); + tests.add(new Object[]{"AC", "A", 0, null}); + + // one split + tests.add(new Object[]{"ACA", "GCA", 1, Arrays.asList(0)}); + tests.add(new Object[]{"ACA", "AGA", 1, Arrays.asList(1)}); + tests.add(new Object[]{"ACA", "ACG", 1, Arrays.asList(2)}); + + // two splits + tests.add(new Object[]{"ACA", "GGA", 2, Arrays.asList(0, 1)}); + tests.add(new Object[]{"ACA", "GCG", 2, Arrays.asList(0, 2)}); + tests.add(new Object[]{"ACA", "AGG", 2, Arrays.asList(1, 2)}); + + // three splits + tests.add(new Object[]{"ACA", "GGG", 3, Arrays.asList(0, 1, 2)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PrimitiveAlleleSplittingData") + public void 
testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List variantPositions) { + + final int start = 10; + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); + + final List result = GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc); + + if ( expectedSplit > 0 ) { + Assert.assertEquals(result.size(), expectedSplit); + for ( int i = 0; i < variantPositions.size(); i++ ) { + Assert.assertEquals(result.get(i).getStart(), start + variantPositions.get(i)); + } + } else { + Assert.assertEquals(result.size(), 1); + Assert.assertEquals(vc, result.get(0)); + } + } + + // -------------------------------------------------------------------------------- + // + // test allele remapping + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "AlleleRemappingData") + public Object[][] makeAlleleRemappingData() { + List tests = new ArrayList(); + + final Allele originalBase1 = Allele.create((byte)'A'); + final Allele originalBase2 = Allele.create((byte)'T'); + + for ( final byte base1 : BaseUtils.BASES ) { + for ( final byte base2 : BaseUtils.BASES ) { + for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { + Map map = new HashMap(2); + map.put(originalBase1, Allele.create(base1)); + map.put(originalBase2, Allele.create(base2)); + + tests.add(new Object[]{map, numGenotypes}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AlleleRemappingData") + public void testAlleleRemapping(final Map alleleMap, final int numGenotypes) { + + final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap); + + final GenotypesContext originalGC = createGenotypesContext(numGenotypes, new ArrayList(alleleMap.keySet())); + + final GenotypesContext remappedGC = GATKVariantContextUtils.updateGenotypesWithMappedAlleles(originalGC, alleleMapper); + + 
for ( int i = 0; i < numGenotypes; i++ ) { + final Genotype originalG = originalGC.get(String.format("%d", i)); + final Genotype remappedG = remappedGC.get(String.format("%d", i)); + + Assert.assertEquals(originalG.getAlleles().size(), remappedG.getAlleles().size()); + for ( int j = 0; j < originalG.getAlleles().size(); j++ ) + Assert.assertEquals(remappedG.getAllele(j), alleleMap.get(originalG.getAllele(j))); + } + } + + private static GenotypesContext createGenotypesContext(final int numGenotypes, final List alleles) { + GenomeAnalysisEngine.resetRandomGenerator(); + final Random random = GenomeAnalysisEngine.getRandomGenerator(); + + final GenotypesContext gc = GenotypesContext.create(); + for ( int i = 0; i < numGenotypes; i++ ) { + // choose alleles at random + final List myAlleles = new ArrayList(); + myAlleles.add(alleles.get(random.nextInt(2))); + myAlleles.add(alleles.get(random.nextInt(2))); + + final Genotype g = new GenotypeBuilder(String.format("%d", i)).alleles(myAlleles).make(); + gc.add(g); + } + + return gc; + } } \ No newline at end of file From 3759d9dd679ed26ce67e9b75bca90291d91f797c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 5 Mar 2013 15:57:44 -0500 Subject: [PATCH 035/226] Added the functionality to impose a relative ordering on ReadTransformers in the GATK engine. * ReadTransformers can say they must be first, must be last, or don't care. * By default, none of the existing ones care about ordering except BQSR (must be first). * This addresses a bug reported on the forum where BAQ is incorrectly applied before BQSR. * The engine now orders the read transformers up front before applying iterators. * The engine checks for enabled RTs that are not compatible (e.g. both must be first) and blows up (gracefully). * Added unit tests. 
--- .../recalibration/BQSRReadTransformer.java | 3 + .../sting/gatk/GenomeAnalysisEngine.java | 37 +++++++- .../sting/gatk/iterators/ReadTransformer.java | 36 +++++++ .../sting/utils/exceptions/UserException.java | 6 ++ .../gatk/GenomeAnalysisEngineUnitTest.java | 93 +++++++++++++++++++ 5 files changed, 173 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java index 113ea2222..3f8fd0e88 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java @@ -62,6 +62,9 @@ public class BQSRReadTransformer extends ReadTransformer { private boolean enabled; private BaseRecalibration bqsr = null; + @Override + public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.MUST_BE_FIRST; } + @Override public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { this.enabled = engine.hasBQSRArgumentSet(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 85c94cc92..e45a750ba 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -372,7 +372,8 @@ public class GenomeAnalysisEngine { * @param walker the walker we need to apply read transformers too */ public void initializeReadTransformers(final Walker walker) { - final List activeTransformers = new ArrayList(); + // keep a list of the active read transformers sorted based on priority ordering + List activeTransformers = new ArrayList(); final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); final 
ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; @@ -392,9 +393,41 @@ public class GenomeAnalysisEngine { return readTransformers; } - private void setReadTransformers(final List readTransformers) { + /* + * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are). + * + * @param readTransformers the active read transformers + */ + protected void checkActiveReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new IllegalArgumentException("read transformers cannot be null"); + + ReadTransformer sawMustBeFirst = null; + ReadTransformer sawMustBeLast = null; + + for ( final ReadTransformer r : readTransformers ) { + if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) { + if ( sawMustBeFirst != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString()); + sawMustBeFirst = r; + } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) { + if ( sawMustBeLast != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString()); + sawMustBeLast = r; + } + } + } + + protected void setReadTransformers(final List readTransformers) { if ( readTransformers == null ) throw new ReviewedStingException("read transformers cannot be null"); + + // sort them in priority order + Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator()); + + // make sure we don't have an invalid set of active read transformers + checkActiveReadTransformers(readTransformers); + this.readTransformers = readTransformers; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java index f026b8f6c..799014cd4 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java @@ -31,6 +31,8 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.Comparator; + /** * Baseclass used to describe a read transformer like BAQ and BQSR * @@ -65,6 +67,11 @@ abstract public class ReadTransformer { protected ReadTransformer() {} + /* + * @return the ordering constraint for the given read transformer + */ + public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.DO_NOT_CARE; } + /** * Master initialization routine. Called to setup a ReadTransform, using it's overloaded initializeSub routine. * @@ -166,4 +173,33 @@ abstract public class ReadTransformer { */ HANDLED_IN_WALKER } + + /* + * This enum specifies the constraints that the given read transformer has relative to any other read transformers being used + */ + public enum OrderingConstraint { + /* + * If 2 read transformers are both active and MUST_BE_FIRST, then an error will be generated + */ + MUST_BE_FIRST, + + /* + * No constraints on the ordering for this read transformer + */ + DO_NOT_CARE, + + /* + * If 2 read transformers are both active and MUST_BE_LAST, then an error will be generated + */ + MUST_BE_LAST + } + + public static class ReadTransformerComparator implements Comparator { + + public int compare(final ReadTransformer r1, final ReadTransformer r2) { + if ( r1.getOrderingConstraint() == r2.getOrderingConstraint() ) + return 0; + return ( r1.getOrderingConstraint() == OrderingConstraint.MUST_BE_FIRST || r2.getOrderingConstraint() == OrderingConstraint.MUST_BE_LAST ) ? 
-1 : 1; + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 1c461748e..b3c5bd2c7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -75,6 +75,12 @@ public class UserException extends ReviewedStingException { } } + public static class IncompatibleReadFiltersException extends CommandLineException { + public IncompatibleReadFiltersException(final String filter1, final String filter2) { + super(String.format("Two read filters are enabled that are incompatible and cannot be used simultaneously: %s and %s", filter1, filter2)); + } + } + public static class MalformedWalkerArgumentsException extends CommandLineException { public MalformedWalkerArgumentsException(String message) { super(String.format("Malformed walker argument: %s",message)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index 0b6e08fa7..3f74e0eae 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -29,15 +29,23 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.ArgumentException; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.readutils.PrintReads; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; 
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; +import java.util.List; /** * Tests selected functionality in the GenomeAnalysisEngine class @@ -81,4 +89,89 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.validateSuppliedIntervals(); } + + + /////////////////////////////////////////////////// + // Test the ReadTransformer ordering enforcement // + /////////////////////////////////////////////////// + + public static class TestReadTransformer extends ReadTransformer { + + private OrderingConstraint orderingConstraint = OrderingConstraint.DO_NOT_CARE; + private boolean enabled; + + protected TestReadTransformer(final OrderingConstraint orderingConstraint) { + this.orderingConstraint = orderingConstraint; + enabled = true; + } + + // need this because PackageUtils will pick up this class as a possible ReadTransformer + protected TestReadTransformer() { + enabled = false; + } + + @Override + public OrderingConstraint getOrderingConstraint() { return orderingConstraint; } + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { return ApplicationTime.HANDLED_IN_WALKER; } + + @Override + public boolean enabled() { return enabled; } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { return read; } + + } + + @DataProvider(name = "ReadTransformerData") + public Object[][] makeReadTransformerData() { + List tests = new ArrayList(); + + for ( final ReadTransformer.OrderingConstraint orderingConstraint1 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint2 : ReadTransformer.OrderingConstraint.values() ) { + for ( final 
ReadTransformer.OrderingConstraint orderingConstraint3 : ReadTransformer.OrderingConstraint.values() ) { + tests.add(new Object[]{orderingConstraint1, orderingConstraint2, orderingConstraint3}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadTransformerData") + public void testReadTransformer(final ReadTransformer.OrderingConstraint oc1, final ReadTransformer.OrderingConstraint oc2, final ReadTransformer.OrderingConstraint oc3) { + + final GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + final List readTransformers = new ArrayList(3); + readTransformers.add(new TestReadTransformer(oc1)); + readTransformers.add(new TestReadTransformer(oc2)); + readTransformers.add(new TestReadTransformer(oc3)); + + final boolean shouldThrowException = numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_FIRST, oc1, oc2, oc3) > 1 || + numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_LAST, oc1, oc2, oc3) > 1; + + try { + testEngine.setReadTransformers(readTransformers); + + Assert.assertFalse(shouldThrowException); + Assert.assertEquals(testEngine.getReadTransformers().size(), 3); + + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(2).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(0).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + } catch (UserException.IncompatibleReadFiltersException e) { + Assert.assertTrue(shouldThrowException); + } + } + + private int numWithConstraint(final ReadTransformer.OrderingConstraint target, final ReadTransformer.OrderingConstraint... 
constraints ) { + int count = 0; + for ( final ReadTransformer.OrderingConstraint constraint : constraints ) { + if ( constraint == target ) + count++; + } + return count; + } } From 3ab78543a737223be36421ff2cef7757fa4b1bb1 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 5 Mar 2013 01:27:36 -0500 Subject: [PATCH 036/226] Fix tests that were consistently or intermittently failing when run in parallel on the farm -Make MaxRuntimeIntegrationTest more lenient by assuming that startup overhead might be as long as 120 seconds on a very slow node, rather than the original assumption of 20 seconds -In TraverseActiveRegionsUnitTest, write temp bam file to the temp directory, not to the current working directory -SimpleTimerUnitTest: This test was internally inconsistent. It asserted that a particular operation should take no more than 10 milliseconds, and then asserted again that this same operation should take no more than 100 microseconds (= 0.1 millisecond). On a slow node it could take slightly longer than 100 microseconds, however. Changed the test to assert that the operation should require no more than 10000 microseconds (= 10 milliseconds) -change global default test timeout from 20 to 40 minutes (things just take longer on the farm!) -build.xml: allow runtestonly target to work with scala test classes --- build.xml | 1 + .../sting/TestNGTestTransformer.java | 4 +-- .../sting/gatk/MaxRuntimeIntegrationTest.java | 4 +-- .../TraverseActiveRegionsUnitTest.java | 29 ++++++++++--------- .../sting/utils/SimpleTimerUnitTest.java | 4 +-- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/build.xml b/build.xml index 03f3232f2..fb5362b3e 100644 --- a/build.xml +++ b/build.xml @@ -1473,6 +1473,7 @@ +
    diff --git a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java index 362d409cb..772c86563 100644 --- a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java +++ b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java @@ -35,7 +35,7 @@ import java.lang.reflect.Method; /** * Provide default @Test values for GATK testng tests. * - * Currently only sets the maximum runtime to 10 minutes, if it's not been specified. + * Currently only sets the maximum runtime to 40 minutes, if it's not been specified. * * See http://beust.com/weblog/2006/10/18/annotation-transformers-in-java/ * @@ -44,7 +44,7 @@ import java.lang.reflect.Method; * @version 0.1 */ public class TestNGTestTransformer implements IAnnotationTransformer { - public static final long DEFAULT_TIMEOUT = 1000 * 60 * 20; // 20 minutes max per test + public static final long DEFAULT_TIMEOUT = 1000 * 60 * 40; // 40 minutes max per test final static Logger logger = Logger.getLogger(TestNGTestTransformer.class); diff --git a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java index 55f9e1f7d..25ee9ff09 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java @@ -39,7 +39,7 @@ import java.util.concurrent.TimeUnit; * */ public class MaxRuntimeIntegrationTest extends WalkerTest { - private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(20, TimeUnit.SECONDS); + private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(120, TimeUnit.SECONDS); private class MaxRuntimeTestProvider extends TestDataProvider { final long maxRuntime; @@ -68,7 +68,7 @@ public class MaxRuntimeIntegrationTest extends WalkerTest { // // Loop over errors to throw, make sure they are 
the errors we get back from the engine, regardless of NT type // - @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 60 * 1000) + @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 300 * 1000) public void testMaxRuntime(final MaxRuntimeTestProvider cfg) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T PrintReads -R " + hg18Reference diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index a574932a7..0384260fa 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -58,6 +58,7 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.*; /** @@ -86,11 +87,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private List intervals; - private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; - private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; + private File testBAM; @BeforeClass - private void init() throws FileNotFoundException { + private void init() throws IOException { //reference = new CachingIndexedFastaSequenceFile(new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")); // hg19Reference)); reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); dictionary = reference.getSequenceDictionary(); @@ -133,17 +133,18 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { createBAM(reads); } - private void createBAM(List reads) { - File outFile = new File(testBAM); - outFile.deleteOnExit(); - File indexFile = new File(testBAI); - indexFile.deleteOnExit(); + private void createBAM(List reads) throws IOException { + testBAM = 
File.createTempFile("TraverseActiveRegionsUnitTest", ".bam"); + testBAM.deleteOnExit(); - SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile); + SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, testBAM); for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { out.addAlignment(read); } out.close(); + + new File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit(); + new File(testBAM.getAbsolutePath() + ".bai").deleteOnExit(); } @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") @@ -400,7 +401,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return getActiveRegions(t, walker, intervals, testBAM); } - private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final String bam) { + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final File bam) { for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) t.traverse(walker, dataProvider, 0); @@ -466,13 +467,13 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return record; } - private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, String bamFile) { + private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, File bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); traverseActiveRegions.initialize(engine, walker); Collection samFiles = new ArrayList(); - SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); + SAMReaderID readerID = new SAMReaderID(bamFile, new Tags()); samFiles.add(readerID); SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, 
genomeLocParser, @@ -594,7 +595,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { walker.setStates(readStates); final TraverseActiveRegions traversal = new TraverseActiveRegions(); - final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary for ( final ActiveRegion region : activeRegionsMap.values() ) { @@ -666,7 +667,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false); final TraverseActiveRegions traversal = new TraverseActiveRegions(); - final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); final ActiveRegion region = activeRegionsMap.values().iterator().next(); int nReadsExpectedInRegion = 0; diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 5a6db4d9c..f92cd4bcf 100644 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -118,8 +118,8 @@ public class SimpleTimerUnitTest extends BaseTest { Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs); Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano); - final long maxTimeInMicro = 100; - final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(100); + final long maxTimeInMicro = 10000; + final long maxTimeInNano 
= TimeUnit.MICROSECONDS.toNanos(maxTimeInMicro); Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); } From baad965a578a8b5108554e27c51b0ed49c159695 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 6 Mar 2013 11:15:30 -0500 Subject: [PATCH 037/226] - Changed loadContaminationFile file parser to delimit by tab only. This allows spaces in sampleIDs, which apparently are allowed. - This was needed since samples with spaces in their names are regularly found in the picard pipeline. - Modified the tests to account for this (removed spaces from the good tests, and changed the failing tests accordingly) - Cleaned up the unit tests using a @DataProvider (I'm in love...). - Moved AlleleBiasedDownsamplingUtilsUnitTest to public to match location of class it is testing (due to the way bamboo operates) --- ...AlleleBiasedDownsamplingUtilsUnitTest.java | 208 ------------------ .../AlleleBiasedDownsamplingUtils.java | 6 +- ...AlleleBiasedDownsamplingUtilsUnitTest.java | 187 ++++++++++++++++ 3 files changed, 190 insertions(+), 211 deletions(-) delete mode 100644 protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java deleted file mode 100644 index dd131b797..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java +++ /dev/null @@ -1,208 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - 
FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.downsampling; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - - -/** - * Basic unit test for AlleleBiasedDownsamplingUtils - */ -public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { - - - @Test - public void testSmartDownsampling() { - - final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; - final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; - - // no contamination, no removal - testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - - // hom sample, het contaminant, different alleles - testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - - // hom sample, hom contaminant, different alleles - testOneCase(10, 0, 0, 0, 0.1, 100, 
idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - - // het sample, het contaminant, different alleles - testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - - // het sample, hom contaminant, different alleles - testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - - // hom sample, het contaminant, overlapping alleles - final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; - testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); - testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); - testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); - - // hom sample, hom contaminant, overlapping alleles - testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); - - // het sample, het contaminant, overlapping alleles - testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); - testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - - // het sample, hom contaminant, overlapping alleles - testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - } - - private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, - final int 
pileupSize, final int[] initialCounts, final int[] targetCounts) { - - final int[] actualCounts = initialCounts.clone(); - actualCounts[0] += addA; - actualCounts[1] += addC; - actualCounts[2] += addG; - actualCounts[3] += addT; - - final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); - Assert.assertTrue(countsAreEqual(results, targetCounts)); - } - - private static boolean countsAreEqual(final int[] counts1, final int[] counts2) { - for ( int i = 0; i < 4; i++ ) { - if ( counts1[i] != counts2[i] ) - return false; - } - return true; - } - - - @Test - public void testLoadContaminationFile1(){ - Logger logger=org.apache.log4j.Logger.getRootLogger(); - - final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - final File ContamFile1=new File(ArtificalBAMLocation+"contamination.case.1.txt"); - - Map Contam1=new HashMap(); - Set Samples1=new HashSet(); - - Contam1.put("NA11918",0.15); - Samples1.addAll(Contam1.keySet()); - testLoadFile(ContamFile1,Samples1,Contam1,logger); - - Contam1.put("NA12842",0.13); - Samples1.addAll(Contam1.keySet()); - testLoadFile(ContamFile1,Samples1,Contam1,logger); - - Samples1.add("DUMMY"); - testLoadFile(ContamFile1,Samples1,Contam1,logger); - } - - private static void testLoadFile(final File file, final Set Samples, final Map map, Logger logger){ - Map loadedMap = AlleleBiasedDownsamplingUtils.loadContaminationFile(file,0.0,Samples,logger); - Assert.assertTrue(loadedMap.equals(map)); - } - - @Test - public void testLoadContaminationFiles(){ - Logger logger=org.apache.log4j.Logger.getRootLogger(); - final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - - for(int i=1; i<=5; i++){ - File ContamFile=new File(ArtificalBAMLocation+String.format("contamination.case.%d.txt",i)); - Assert.assertTrue(AlleleBiasedDownsamplingUtils.loadContaminationFile(ContamFile,0.0,null,logger).size()==2); - } - - } - - 
@Test(expectedExceptions = UserException.MalformedFile.class) - public void testLoadBrokenContaminationFile1(){ - testLoadBrokenContaminationFile(1); - } - - @Test(expectedExceptions = UserException.MalformedFile.class) - public void testLoadBrokenContaminationFile2(){ - testLoadBrokenContaminationFile(2); - } - @Test(expectedExceptions = UserException.MalformedFile.class) - public void testLoadBrokenContaminationFile3(){ - testLoadBrokenContaminationFile(3); - } - - @Test(expectedExceptions = UserException.MalformedFile.class) - public void testLoadBrokenContaminationFile4(){ - testLoadBrokenContaminationFile(4); - } - - - public void testLoadBrokenContaminationFile(final int i){ - Logger logger=org.apache.log4j.Logger.getRootLogger(); - final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - - File ContaminationFile=new File(ArtificalBAMLocation+String.format("contamination.case.broken.%d.txt",i)); - AlleleBiasedDownsamplingUtils.loadContaminationFile(ContaminationFile,0.0,null,logger); - - } - - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index 6785375ba..26e9febe7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -288,17 +288,17 @@ public class AlleleBiasedDownsamplingUtils { continue; } - StringTokenizer st = new StringTokenizer(line); + StringTokenizer st = new StringTokenizer(line,"\t"); String fields[] = new String[2]; try { fields[0] = st.nextToken(); fields[1] = st.nextToken(); } catch(NoSuchElementException e){ - throw new UserException.MalformedFile("Contamination file must have exactly two columns. 
Offending line:\n" + line); + throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); } if(st.hasMoreTokens()) { - throw new UserException.MalformedFile("Contamination file must have exactly two columns. Offending line:\n" + line); + throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); } if (fields[0].length() == 0 || fields[1].length() == 0) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java new file mode 100644 index 000000000..23b940491 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -0,0 +1,187 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + + +/** + * Basic unit test for AlleleBiasedDownsamplingUtils + */ +public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { + + + @Test + public void testSmartDownsampling() { + + final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; + final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; + + // no contamination, no removal + testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // het sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, 
idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // hom sample, het contaminant, overlapping alleles + final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; + testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + + // hom sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); + + // het sample, het contaminant, overlapping alleles + testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); + testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + } + + private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, + final int pileupSize, final int[] initialCounts, final int[] targetCounts) { + + final int[] actualCounts = initialCounts.clone(); + actualCounts[0] += addA; + actualCounts[1] += addC; + actualCounts[2] += addG; + actualCounts[3] += addT; + + final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); + 
Assert.assertTrue(countsAreEqual(results, targetCounts)); + } + + private static boolean countsAreEqual(final int[] counts1, final int[] counts2) { + for ( int i = 0; i < 4; i++ ) { + if ( counts1[i] != counts2[i] ) + return false; + } + return true; + } + + + @Test + public void testLoadContaminationFileDetails(){ + Logger logger=org.apache.log4j.Logger.getRootLogger(); + + final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; + final File ContamFile1=new File(ArtificalBAMLocation+"contamination.case.1.txt"); + + Map Contam1=new HashMap(); + Set Samples1=new HashSet(); + + Contam1.put("NA11918",0.15); + Samples1.addAll(Contam1.keySet()); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + + Contam1.put("NA12842",0.13); + Samples1.addAll(Contam1.keySet()); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + + Samples1.add("DUMMY"); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + } + + private static void testLoadFile(final File file, final Set Samples, final Map map, Logger logger){ + Map loadedMap = AlleleBiasedDownsamplingUtils.loadContaminationFile(file,0.0,Samples,logger); + Assert.assertTrue(loadedMap.equals(map)); + } + + @DataProvider(name = "goodContaminationFiles") + public Integer[][] goodContaminationFiles() { + return new Integer[][]{ + {1, 2}, + {2, 3}, + {3, 2}, + {4, 2}, + {5, 3}, + {6, 2}, + {7, 2}, + {8, 2} + }; + } + + @Test(dataProvider = "goodContaminationFiles") + public void testLoadContaminationFile(final Integer ArtificalBAMnumber, final Integer numberOfSamples) { + final String ArtificialBAM = String.format("ArtificallyContaminatedBams/contamination.case.%d.txt", ArtificalBAMnumber); + Logger logger = org.apache.log4j.Logger.getRootLogger(); + + File ContamFile = new File(privateTestDir, ArtificialBAM); + Assert.assertTrue(AlleleBiasedDownsamplingUtils.loadContaminationFile(ContamFile, 0.0, null, logger).size() == numberOfSamples); + + } + + + @DataProvider(name = "badContaminationFiles") + public 
Integer[][] badContaminationFiles() { + return new Integer[][]{{1}, {2}, {3}, {4}, {5}}; + } + + @Test(dataProvider = "badContaminationFiles", expectedExceptions = UserException.MalformedFile.class) + public void testLoadBrokenContaminationFile(final int i) { + Logger logger = org.apache.log4j.Logger.getRootLogger(); + final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; + + File ContaminationFile = new File(ArtificalBAMLocation + String.format("contamination.case.broken.%d.txt", i)); + AlleleBiasedDownsamplingUtils.loadContaminationFile(ContaminationFile, 0.0, null, logger); + + } + + +} From 695723ba435b6b8ff443cb4a98dc304326bad61b Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 18 Feb 2013 20:00:46 -0500 Subject: [PATCH 038/226] Two features useful for ancient DNA processing. Ancient DNA sequencing data is in many ways different from modern data, and methods to analyze it need to be adapted accordingly. Feature 1: Read adaptor trimming. Ancient DNA libraries typically have very short inserts (in the order of 50 bp), so typical Illumina libraries sequenced in, say, 100bp HiSeq will have a large adaptor component being read after the insert. If this adaptor is not removed, data will not be aligneable. There are third party tools that remove adaptor and potentially merge read pairs, but are cumbersome to use and require precise knowledge of the library construction and adaptor sequence. -- New walker ReadAdaptorTrimmer walks through paired end data, computes pair overlap and trims auto-detected adaptor sequence. -- Unit tests added for trimming operation. -- Utility walker (may be retired later) DetailedReadLengthDistribution computes insert size or read length distribution stratified by read group and mapping status and outputs a GATKReport with data. 
-- Renamed MaxReadLengthFilter to ReadLengthFilter and added ability to specify minimum read length as a filter (may be useful if, as a consequence of adaptor trimming, we're left with a lot of very short reads which will map poorly and will just clutter output BAMs). Feature 2: Unbiased site QUAL estimation: many times ancestral allele status is not known and VCF fields like QUAL, QD, GQ, etc. are affected by the pop. gen. prior at a site. This might introduce subtle biases in studies where a species is aligned against the reference of another species, so an option for UG and HC not to apply such prior is introduced. -- Added -noPrior argument to StandardCallerArgumentCollection. -- Added option not to fill priors is such argument is set. -- Added an integration test. --- .../StandardCallerArgumentCollection.java | 10 ++++++ .../genotyper/UnifiedGenotyperEngine.java | 22 +++++++++++-- .../genotyper/afcalc/AFCalcTestBuilder.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 9 ++++++ .../genotyper/afcalc/AFCalcUnitTest.java | 31 ++++++++++++++++++- ...engthFilter.java => ReadLengthFilter.java} | 10 +++--- 6 files changed, 75 insertions(+), 9 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/filters/{MaxReadLengthFilter.java => ReadLengthFilter.java} (79%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index a47e417c4..0769c8749 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -112,6 +112,15 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 
6; + /** + * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus. + * If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate, + * as for example when the ancestral status of the reference allele is not known. + */ + @Advanced + @Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false) + public boolean ignoreHeterozygosityPrior = false; + /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we @@ -180,5 +189,6 @@ public class StandardCallerArgumentCollection { this.exactCallsLog = SCAC.exactCallsLog; this.sampleContamination=SCAC.sampleContamination; this.AFmodel = SCAC.AFmodel; + this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index ede0741ff..1d0c10795 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine { this.N = samples.size() * ploidy; log10AlleleFrequencyPriorsSNPs = new double[N+1]; log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); + computeAlleleFrequencyPriors(N, 
log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior); filter.add(LOW_QUAL_FILTER_NAME); @@ -722,8 +722,20 @@ public class UnifiedGenotyperEngine { return GGAmodel; } - public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { + /** + * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used, + * where Pr(AC=i) = theta/i where theta is heterozygosity + * @param N Number of chromosomes + * @param priors (output) array to be filled with priors + * @param theta Heterozygosity + * @param ignorePriors If true, priors are ignored and zeros returned + */ + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) { + if (ignorePriors) { + Arrays.fill(priors, 0,N,0.0); + return; + } double sum = 0.0; // for each i @@ -733,6 +745,10 @@ public class UnifiedGenotyperEngine { sum += value; } + // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions) + if (sum > 1.0) { + throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument"); + } // null frequency for AF=0 is (1 - sum(all other frequencies)) priors[0] = Math.log10(1.0 - sum); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index a66a5580c..a4224bf6c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -111,7 +111,7 @@ public class AFCalcTestBuilder { return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors case human: final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false); return humanPriors; default: throw new RuntimeException("Unexpected type " + priorType); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index ca965a042..a0440aaed 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -139,6 +139,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test confidence 1", spec1); } + @Test + public void testNoPrior() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1, + Arrays.asList("422656266117f8d01e17e5c491c49a24")); + executeTest("test no prior 1", spec1); + + } + // -------------------------------------------------------------------------------------------------------------- // // testing heterozygosity diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index c4f5befcf..5eebe9670 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest { final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false); for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { for ( AFCalc model : calcs ) { @@ -575,6 +575,35 @@ public class AFCalcUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } + + @Test(enabled = true, dataProvider = "Models") + public void testNoPrior(final AFCalc model) { + for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) { + final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000); + + final double[] flatPriors = new double[]{0.0,0.0,0.0}; + final double[] noPriors = new double[3]; + // test that function computeAlleleFrequency correctly operates when the -noPrior option is set + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true); + + GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior"); + GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior"); + final AFCalcResult resultTrackerFlat = cfgFlatPrior.execute(); + final AFCalcResult resultTrackerNoPrior = cfgNoPrior.execute(); + + final double pRefWithNoPrior = AB.getLikelihoods().getAsVector()[0]; + final double pHetWithNoPrior = AB.getLikelihoods().getAsVector()[1] - Math.log10(0.5); + final double nonRefPost = Math.pow(10, pHetWithNoPrior) / (Math.pow(10, pRefWithNoPrior) + Math.pow(10, 
pHetWithNoPrior)); + final double log10NonRefPost = Math.log10(nonRefPost); + + if ( ! Double.isInfinite(log10NonRefPost) ) { + // check that the no-prior and flat-prior constructions yield same result + Assert.assertEquals(resultTrackerFlat.getLog10PosteriorOfAFGT0(), resultTrackerNoPrior.getLog10PosteriorOfAFGT0()); + } + + } + } + @Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models") public void testBiallelicPriors(final AFCalc model) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java similarity index 79% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java index df1c11a2b..80224b786 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MaxReadLengthFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadLengthFilter.java @@ -29,18 +29,20 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Argument; /** - * Filters out reads whose length is >= some value. + * Filters out reads whose length is >= some value or < some value. 
* * @author mhanna * @version 0.1 */ -public class MaxReadLengthFilter extends ReadFilter { +public class ReadLengthFilter extends ReadFilter { @Argument(fullName = "maxReadLength", shortName = "maxRead", doc="Discard reads with length greater than the specified value", required=true) private int maxReadLength; - + + @Argument(fullName = "minReadLength", shortName = "minRead", doc="Discard reads with length shorter than the specified value", required=true) + private int minReadLength = 1; public boolean filterOut(SAMRecord read) { // check the length - return read.getReadLength() > maxReadLength; + return read.getReadLength() > maxReadLength || read.getReadLength() < minReadLength; } } From c96fbcb9952a5ed1e3272c02715226826da9e242 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 11 Mar 2013 14:12:43 -0400 Subject: [PATCH 041/226] Use the indel heterozygosity prior when calling indels with the HC --- .../arguments/StandardCallerArgumentCollection.java | 7 +++++++ .../walkers/genotyper/UnifiedArgumentCollection.java | 7 ------- .../walkers/haplotypecaller/GenotypingEngine.java | 3 ++- ...lerComplexAndSymbolicVariantsIntegrationTest.java | 6 +++--- .../HaplotypeCallerIntegrationTest.java | 12 ++++++------ 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 0769c8749..03698489c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -74,6 +74,12 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = 
UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; + /** + * This argument informs the prior probability of having an indel at a site. + */ + @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) + public double INDEL_HETEROZYGOSITY = 1.0/8000; + @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; @@ -179,6 +185,7 @@ public class StandardCallerArgumentCollection { this.alleles = SCAC.alleles; this.GenotypingMode = SCAC.GenotypingMode; this.heterozygosity = SCAC.heterozygosity; + this.INDEL_HETEROZYGOSITY = SCAC.INDEL_HETEROZYGOSITY; this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; this.OutputMode = SCAC.OutputMode; this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 14d827747..e346b10b7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -113,12 +113,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; - /** - * This argument informs the prior probability of having an indel at a site. 
- */ - @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) - public double INDEL_HETEROZYGOSITY = 1.0/8000; - @Advanced @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty, as Phred-scaled probability. I.e., 30 => 10^-30/10", required = false) public byte INDEL_GAP_CONTINUATION_PENALTY = 10; @@ -238,7 +232,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION; this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING; this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE; - this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY; this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY; this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY; this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index ae181aa69..34a6ddfa6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -52,6 +52,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -267,7 +268,7 @@ public class GenotypingEngine { final Map 
alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); final GenotypesContext genotypes = calculateGLsForThisEvent( samples, alleleReadMap, mergedVC ); - final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); + final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL); if( call != null ) { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) ); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 3e57663f8..2e3e45247 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a960722c1ae2b6f774d3443a7e5ac27d"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2232995ca9bec143e664748845a0045"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -75,7 +75,7 @@ public class 
HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa // TODO -- need a better symbolic allele test @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "56f2ef9acc6c0d267cf2b7a447d87fb7"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "298c1af47a515ea7c8c1ea704d7755ce"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -93,6 +93,6 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "f2df7a8f53ce449e4a8e8f8496e7c745"); + "9563e3c1eee2ef46afc7822af0bb58a8"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 4988fbe77..bf2ddea12 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "aac5517a0a64ad291b6b00825d982f7f"); + HCTest(CEUTRIO_BAM, "", "8f33e40686443b9a72de45d5a9da1861"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "3bfab723fb0f3a65998d82152b67ed15"); + HCTest(NA12878_BAM, "", "8f2b047cdace0ef122d6ad162e7bc5b9"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + 
"combined.phase1.chr20.raw.indels.sites.vcf", - "283524b3e3397634d4cf0dc2b8723002"); + "9d4be26a2c956ba4b7b4044820eab030"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -95,7 +95,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f1f867dbbe3747f16a0d9e5f11e6ed64"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5af4782a0e1bc9b966b9e3ae76245919"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a17e95c1191e3aef7892586fe38ca050")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("03557376242bdf78c5237703b762573b")); executeTest("HCTestStructuralIndels: ", spec); } @@ -142,7 +142,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("6debe567cd5ed7eb5756b6605a151f56")); + Arrays.asList("a43c595a617589388ff3d7e2ddc661e7")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } From 05e69b62940f47845355cca8fd28bd234f2ac4ee Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 8 Mar 2013 10:28:11 -0500 Subject: [PATCH 043/226] 
Refactoring of SlidingWindow class in RR to reduce complexity and fix important bug. * Allow RR to write its BAM to stdout by setting required=true for @Output. * Fixed bug in sliding window where a break in coverage after a long stretch without a variant region was causing a doubling of all the reads before the break. * Refactored SlidingWindow.updateHeaderCounts() into 3 separate tested methods. * Refactored polyploid consensus code out of SlidingWindow.compressVariantRegion(). --- .../compression/reducereads/ReduceReads.java | 2 +- .../reducereads/SlidingWindow.java | 295 +++++++++++------- .../ReduceReadsIntegrationTest.java | 2 +- .../reducereads/SlidingWindowUnitTest.java | 111 ++++++- 4 files changed, 279 insertions(+), 131 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index e89158412..3df2aef38 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -112,7 +112,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=40) public class ReduceReads extends ReadWalker, ReduceReadsStash> { - @Output + @Output(required=true) private StingSAMFileWriter out = null; private SAMFileWriter writerToUse = null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 7124b4772..6c063110e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -50,13 +50,12 @@ 
import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import it.unimi.dsi.fastutil.bytes.Byte2IntArrayMap; import it.unimi.dsi.fastutil.bytes.Byte2IntMap; -import it.unimi.dsi.fastutil.bytes.Byte2IntOpenHashMap; import it.unimi.dsi.fastutil.objects.*; -import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -127,8 +126,8 @@ public class SlidingWindow { return getStopLocation(windowHeader); } - private int getStopLocation(LinkedList header) { - return getStartLocation(header) + header.size() - 1; + private int getStopLocation(final LinkedList header) { + return header.isEmpty() ? -1 : header.peekLast().getLocation(); } public String getContig() { @@ -139,7 +138,7 @@ public class SlidingWindow { return contigIndex; } - public int getStartLocation(LinkedList header) { + public int getStartLocation(final LinkedList header) { return header.isEmpty() ? 
-1 : header.peek().getLocation(); } @@ -652,51 +651,27 @@ public class SlidingWindow { ObjectList allReads = new ObjectArrayList(); // Try to compress into a polyploid consensus - int nVariantPositions = 0; int hetRefPosition = -1; - boolean canCompress = true; - Object[] header = windowHeader.toArray(); + final Object[] header = windowHeader.toArray(); - // foundEvent will remain false if we don't allow polyploid reduction - if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) { - for (int i = start; i<=stop; i++) { - - int nAlleles = ((HeaderElement) header[i]).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT); - - // we will only work on diploid cases because we just don't want to handle/test other scenarios - if ( nAlleles > 2 ) { - canCompress = false; - break; - } else if ( nAlleles == 2 ) { - nVariantPositions++; - - // make sure that there is only 1 site in the variant region that contains more than one allele - if ( nVariantPositions == 1 ) { - hetRefPosition = i; - } else if ( nVariantPositions > 1 ) { - canCompress = false; - break; - } - } - } - } + if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) + hetRefPosition = findSinglePolyploidCompressiblePosition(header, start, stop); // Try to compress the variant region; note that using the hetRefPosition protects us from trying to compress // variant regions that are created by insertions (since we can't confirm here that they represent the same allele) - if ( canCompress && hetRefPosition != -1 ) { + if ( hetRefPosition != -1 ) { allReads = createPolyploidConsensus(start, stop, ((HeaderElement) header[hetRefPosition]).getLocation()); } - // Return all reads that overlap the variant region and remove them from the window header entirely // also remove all reads preceding the variant region (since they will be output as consensus right after compression else { final int refStart = windowHeader.get(start).getLocation(); final int 
refStop = windowHeader.get(stop).getLocation(); - ObjectList toRemove = new ObjectArrayList(); - for (GATKSAMRecord read : readsInWindow) { - if (read.getSoftStart() <= refStop) { - if (read.getAlignmentEnd() >= refStart) { + final ObjectList toRemove = new ObjectArrayList(); + for ( final GATKSAMRecord read : readsInWindow ) { + if ( read.getSoftStart() <= refStop ) { + if ( read.getAlignmentEnd() >= refStart ) { allReads.add(read); removeFromHeader(windowHeader, read); } @@ -708,6 +683,39 @@ public class SlidingWindow { return allReads; } + /* + * Finds the het variant position located within start and stop (inclusive) if one exists. + * + * @param header the header element array + * @param start the first header index in the region to check (inclusive) + * @param stop the last header index of the region to check (inclusive) + * @return the window header index of the single het position or -1 if either none or more than one exists + */ + @Requires("header != null && start >= 0 && (stop >= start || stop == 0)") + protected int findSinglePolyploidCompressiblePosition(final Object[] header, final int start, final int stop) { + int hetRefPosition = -1; + + for ( int i = start; i <= stop; i++ ) { + + final int nAlleles = ((HeaderElement) header[i]).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT); + + // we will only work on diploid cases because we just don't want to handle/test other scenarios + if ( nAlleles > 2 ) + return -1; + + if ( nAlleles == 2 ) { + + // make sure that there is only 1 site in the region that contains more than one allele + if ( hetRefPosition >= 0 ) + return -1; + + hetRefPosition = i; + } + } + + return hetRefPosition; + } + /** * Finalizes a variant region, any adjacent synthetic reads. 
* @@ -728,31 +736,41 @@ public class SlidingWindow { return result; // finalized reads will be downsampled if necessary } - public ObjectSet closeVariantRegions(CompressionStash regions) { - ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); - if (!regions.isEmpty()) { - int lastStop = -1; - int windowHeaderStart = getStartLocation(windowHeader); + /* + * Finalizes the list of regions requested (and any regions preceding them) + * + * @param regions the list of regions to finalize + * @return a non-null set of reduced reads representing the finalized regions + */ + public ObjectSet closeVariantRegions(final CompressionStash regions) { + final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); + if ( !regions.isEmpty() ) { - for (GenomeLoc region : regions) { + int windowHeaderStart = getStartLocation(windowHeader); + HeaderElement lastCleanedElement = null; + + for ( final GenomeLoc region : regions ) { if (((FinishedGenomeLoc)region).isFinished() && region.getContig().equals(contig) && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) { - int start = region.getStart() - windowHeaderStart; - int stop = region.getStop() - windowHeaderStart; + final int start = region.getStart() - windowHeaderStart; + final int stop = region.getStop() - windowHeaderStart; allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track - lastStop = stop; + + // We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions. + // Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads + // (in the case where there are multiple regions to close we'd reuse the reads for each region). 
+ if ( stop >= 0 ) { + for ( int i = 0; i < stop; i++ ) + windowHeader.remove(); + lastCleanedElement = windowHeader.remove(); + windowHeaderStart = getStartLocation(windowHeader); + } } } - // clean up the window header elements up until the end of the variant region. - // note that we keep the last element of the region in the event that the following element has a read that starts with insertion. - if ( lastStop >= 0 ) { - for (int i = 0; i < lastStop; i++) - windowHeader.remove(); - final HeaderElement lastOfRegion = windowHeader.remove(); - if ( lastOfRegion.hasInsertionToTheRight() ) - windowHeader.addFirst(new HeaderElement(lastOfRegion.getLocation(), lastOfRegion.numInsertionsToTheRight())); - } + // we need to keep the last element of the last cleaned region in the event that the following element has a read that starts with an insertion. + if ( lastCleanedElement != null && lastCleanedElement.hasInsertionToTheRight() ) + windowHeader.addFirst(new HeaderElement(lastCleanedElement.getLocation(), lastCleanedElement.numInsertionsToTheRight())); } return allReads; } @@ -925,7 +943,6 @@ public class SlidingWindow { return hetReads; } - private void addToHeader(LinkedList header, GATKSAMRecord read) { updateHeaderCounts(header, read, false); } @@ -934,84 +951,123 @@ public class SlidingWindow { updateHeaderCounts(header, read, true); } - /** * Updates the sliding window's header counts with the incoming read bases, insertions * and deletions. 
* - * @param header the sliding window header to use - * @param read the incoming read to be added to the sliding window - * @param removeRead if we are removing the read from the header or adding + * @param header the sliding window header to use + * @param read the incoming read to be added to the sliding window + * @param removeRead if we are removing the read from the header or adding */ - private void updateHeaderCounts(final LinkedList header, final GATKSAMRecord read, final boolean removeRead) { - byte[] bases = read.getReadBases(); - byte[] quals = read.getBaseQualities(); - byte[] insQuals = read.getExistingBaseInsertionQualities(); - byte[] delQuals = read.getExistingBaseDeletionQualities(); - int readStart = read.getSoftStart(); - int readEnd = read.getSoftEnd(); - Cigar cigar = read.getCigar(); + protected void updateHeaderCounts(final LinkedList header, final GATKSAMRecord read, final boolean removeRead) { + final int readStart = read.getSoftStart(); + final int headerStart = getStartLocation(header); + int locationIndex = headerStart < 0 ? 0 : readStart - headerStart; - int readBaseIndex = 0; - int startLocation = getStartLocation(header); - int locationIndex = startLocation < 0 ? 0 : readStart - startLocation; - int stopLocation = getStopLocation(header); + if ( removeRead && locationIndex < 0 ) + throw new ReviewedStingException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header)); - if (removeRead && locationIndex < 0) - throw new ReviewedStingException("read is behind the Sliding Window. 
read: " + read + " start " + read.getUnclippedStart() + "," + read.getUnclippedEnd() + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation); + // we only need to create new header elements if we are adding the read, not when we're removing it + if ( !removeRead ) + locationIndex = createNewHeaderElements(header, read, locationIndex); - if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it - if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window - for (int i = 1; i <= -locationIndex; i++) - header.addFirst(new HeaderElement(startLocation - i)); + actuallyUpdateHeaderForRead(header, read, removeRead, locationIndex); + } - startLocation = readStart; // update start location accordingly - locationIndex = 0; - } + /* + * Creates new header elements if needed for the given read. + * + * @param header the sliding window header to use + * @param read the incoming read to be added to the sliding window + * @param startIndex the start location index into the header for this read + * + * @return an updated index into the modified header + */ + @Requires("header != null && read != null") + protected int createNewHeaderElements(final LinkedList header, final GATKSAMRecord read, final int startIndex) { - if (stopLocation < readEnd) { // Do we need to add extra elements to the header? - int elementsToAdd = (stopLocation < 0) ? 
readEnd - readStart + 1 : readEnd - stopLocation; - while (elementsToAdd-- > 0) - header.addLast(new HeaderElement(readEnd - elementsToAdd)); - } + int headerStart = getStartLocation(header); + int locationIndex = startIndex; - // Special case for leading insertions before the beginning of the sliding read - if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) { - header.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added - locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing. - } + // Do we need to add extra elements before the start of the header? This could happen if the previous read was + // clipped and this alignment starts before the beginning of the window + final int readStart = read.getSoftStart(); + if ( startIndex < 0 ) { + for ( int i = 1; i <= -startIndex; i++ ) + header.addFirst(new HeaderElement(headerStart - i)); + + // update the start location accordingly + headerStart = readStart; + locationIndex = 0; } - Iterator headerElementIterator = header.listIterator(locationIndex); + // Do we need to add extra elements to the end of the header? + final int headerStop = getStopLocation(header); + final int readEnd = read.getSoftEnd(); + if ( headerStop < readEnd ) { + final int elementsToAdd = (headerStop < 0) ? 
readEnd - readStart + 1 : readEnd - headerStop; + for ( int i = elementsToAdd - 1; i >= 0; i-- ) + header.addLast(new HeaderElement(readEnd - i)); + } + + // Special case for leading insertions before the beginning of the sliding read + if ( ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == headerStart || headerStart < 0) ) { + // create a new first element to the window header with no bases added + header.addFirst(new HeaderElement(readStart - 1)); + // this allows the first element (I) to look at locationIndex - 1 when we update the header and do the right thing + locationIndex = 1; + } + + return locationIndex; + } + + /* + * Actually updates the sliding window's header counts with the incoming read bases and quals (including insertion and deletion quals). + * + * @param header the sliding window header to use + * @param read the incoming read to be added to the sliding window + * @param removeRead if we are removing the read from the header or adding + * @param startIndex the start location index into the header for this read + */ + @Requires("header != null && read != null && startIndex >= 0") + protected void actuallyUpdateHeaderForRead(final LinkedList header, final GATKSAMRecord read, final boolean removeRead, final int startIndex) { + + final Iterator headerElementIterator = header.listIterator(startIndex); + final byte mappingQuality = (byte) read.getMappingQuality(); + + // iterator variables + int locationIndex = startIndex; + int readBaseIndex = 0; HeaderElement headerElement; - for (CigarElement cigarElement : cigar.getCigarElements()) { - switch (cigarElement.getOperator()) { + + for ( CigarElement cigarElement : read.getCigar().getCigarElements() ) { + switch ( cigarElement.getOperator() ) { case H: break; case I: - if (removeRead && locationIndex == 0) { // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it. 
+ // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it. + if ( removeRead && locationIndex == 0 ) break; - } - headerElement = header.get(locationIndex - 1); // insertions are added to the base to the left (previous element) + // insertions are added to the base to the left (previous element) + headerElement = header.get(locationIndex - 1); - if (removeRead) { + if ( removeRead ) headerElement.removeInsertionToTheRight(); - } - else { + else headerElement.addInsertionToTheRight(); - } + readBaseIndex += cigarElement.getLength(); - break; // just ignore the insertions at the beginning of the read + break; case D: - int nDeletions = cigarElement.getLength(); - while (nDeletions-- > 0) { // deletions are added to the baseCounts with the read mapping quality as it's quality score + // deletions are added to the baseCounts with the read mapping quality as it's quality score + final int nDeletionBases = cigarElement.getLength(); + for ( int i = 0; i < nDeletionBases; i++ ) { headerElement = headerElementIterator.next(); - byte mq = (byte) read.getMappingQuality(); if (removeRead) - headerElement.removeBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); + headerElement.removeBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); else - headerElement.addBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); + headerElement.addBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); locationIndex++; } @@ -1021,26 +1077,33 @@ public class SlidingWindow { case P: case EQ: case X: - int nBasesToAdd = cigarElement.getLength(); - while (nBasesToAdd-- > 0) { + final int nBasesToAdd = cigarElement.getLength(); + final boolean isSoftClip = 
cigarElement.getOperator() == CigarOperator.S; + final boolean readHasIndelQuals = read.hasBaseIndelQualities(); + final byte[] insertionQuals = readHasIndelQuals ? read.getBaseInsertionQualities() : null; + final byte[] deletionQuals = readHasIndelQuals ? read.getBaseDeletionQualities() : null; + + for ( int i = 0; i < nBasesToAdd; i++ ) { headerElement = headerElementIterator.next(); - byte insertionQuality = insQuals == null ? -1 : insQuals[readBaseIndex]; // if the read doesn't have indel qualities, use -1 (doesn't matter the value because it won't be used for anything) - byte deletionQuality = delQuals == null ? -1 : delQuals[readBaseIndex]; - if (removeRead) - headerElement.removeBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S); + final byte insertionQuality = readHasIndelQuals ? insertionQuals[readBaseIndex] : -1; + final byte deletionQuality = readHasIndelQuals ? 
deletionQuals[readBaseIndex] : -1; + if ( removeRead ) + headerElement.removeBase(read.getReadBases()[readBaseIndex], read.getBaseQualities()[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip); else - headerElement.addBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S); + headerElement.addBase(read.getReadBases()[readBaseIndex], read.getBaseQualities()[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip); readBaseIndex++; locationIndex++; } break; + default: + break; } } } - private void removeReadsFromWindow (ObjectList readsToRemove) { - for (GATKSAMRecord read : readsToRemove) { + private void removeReadsFromWindow (final ObjectList readsToRemove) { + for (final GATKSAMRecord read : readsToRemove) { readsInWindow.remove(read); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 970829162..adbc65037 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -147,7 +147,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testReadOffContig() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("922be8b1151dd0d92602af93b77f7a51"))); + executeTest("testReadOffContig", new WalkerTestSpec(base, 
Arrays.asList("c57cd191dc391983131be43f6cc2e381"))); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java index 09cfe83c9..054f7aa15 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -55,6 +55,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; import org.broadinstitute.sting.utils.Utils; @@ -72,8 +73,8 @@ import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; import java.util.Arrays; +import java.util.LinkedList; import java.util.List; -import java.util.Set; public class SlidingWindowUnitTest extends BaseTest { @@ -385,30 +386,22 @@ public class SlidingWindowUnitTest extends BaseTest { //// This section tests the downsampling functionality //// /////////////////////////////////////////////////////////// - private class DSTest { - public final int dcov; - - private DSTest(final int dcov) { - this.dcov = dcov; - } - } - @DataProvider(name = "Downsampling") public Object[][] createDownsamplingTestData() { List tests = new ArrayList(); for ( int i = 1; i < basicReads.size() + 10; i++ ) - tests.add(new Object[]{new DSTest(i)}); + tests.add(new Object[]{i}); return tests.toArray(new Object[][]{}); } @Test(dataProvider = "Downsampling", enabled = true) - public void testDownsamplingTest(DSTest test) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new 
GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, test.dcov, ReduceReads.DownsampleStrategy.Normal, false, false); + public void testDownsamplingTest(final int dcov) { + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false, false); final ObjectList result = slidingWindow.downsampleVariantRegion(basicReads); - Assert.assertEquals(result.size(), Math.min(test.dcov, basicReads.size())); + Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size())); } @@ -487,5 +480,97 @@ public class SlidingWindowUnitTest extends BaseTest { } + //////////////////////////////////////////////////// + //// This section tests the new header creation //// + //////////////////////////////////////////////////// + @DataProvider(name = "CreateNewHeader") + public Object[][] CreateNewHeaderTestData() { + List tests = new ArrayList(); + + for ( final int start : Arrays.asList(-10, -1, 0, 1, 10) ) { + for ( final int stop : Arrays.asList(-10, -1, 0, 1, 10) ) { + tests.add(new Object[]{start, stop}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CreateNewHeader", enabled = true) + public void createNewHeaderTest(final int start, final int stop) { + + // set up the window header + final int currentHeaderStart = 100; + final int currentHeaderLength = 50; + final LinkedList windowHeader = new LinkedList(); + for ( int i = 0; i < currentHeaderLength; i++ ) + windowHeader.add(new HeaderElement(currentHeaderStart + i)); + + // set up the read + final int readStart = currentHeaderStart + start; + final int readLength = currentHeaderLength + stop - start; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte) 30, readLength)); + read.setMappingQuality(30); + + 
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false, false); + int newIndex = slidingWindow.createNewHeaderElements(windowHeader, read, start); + + Assert.assertEquals(newIndex, start > 0 ? start : 0); + + final int expectedNewLength = currentHeaderLength + (start < 0 ? -start : 0) + (stop > 0 ? stop : 0); + Assert.assertEquals(windowHeader.size(), expectedNewLength); + } + + + //////////////////////////////////////////////////////////// + //// This section tests updating the header from a read //// + //////////////////////////////////////////////////////////// + + @DataProvider(name = "UpdateHeaderForRead") + public Object[][] UpdateHeaderForReadTestData() { + List tests = new ArrayList(); + + for ( final int start : Arrays.asList(0, 1, 10) ) { + for ( final int readLength : Arrays.asList(1, 5, 10) ) { + tests.add(new Object[]{start, readLength}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "UpdateHeaderForRead", enabled = true) + public void updateHeaderForReadTest(final int start, final int readLength) { + + // set up the window header + final int currentHeaderStart = 100; + final int currentHeaderLength = 50; + final LinkedList windowHeader = new LinkedList(); + for ( int i = 0; i < currentHeaderLength; i++ ) + windowHeader.add(new HeaderElement(currentHeaderStart + i)); + + // set up the read + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + start, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setMappingQuality(30); + + // add the read + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false, false); + 
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start); + for ( int i = 0; i < start; i++ ) + Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); + for ( int i = 0; i < readLength; i++ ) + Assert.assertEquals(windowHeader.get(start + i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 1); + for ( int i = start + readLength; i < currentHeaderLength; i++ ) + Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); + + // now remove the read + slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, true, start); + for ( int i = 0; i < currentHeaderLength; i++ ) + Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); + } } From f972963918e3f80df26e49f5f24b925cfc364820 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Tue, 5 Mar 2013 13:58:50 -0500 Subject: [PATCH 046/226] Fixed issues raised by Appistry QA (mostly small fixes, corrections & clarifications to GATKDocs) GATK-73 updated docs for bqsr args GATK-9 differentiate CountRODs from CountRODsByRef GATK-76 generate GATKDoc for CatVariants GATK-4 made resource arg required GATK-10 added -o, some docs to CountMales; some docs to CountLoci GATK-11 fixed by MC's -o change; straightened out the docs. 
GATK-77 fixed references to wiki GATK-76 Added Ami's doc block GATK-14 Added note that these annotations can only be used with VariantAnnotator GATK-15 specified required=false for two arguments GATK-23 Added documentation block GATK-33 Added documentation GATK-34 Added documentation GATK-32 Corrected arg name and docstring in DiffObjects GATK-32 Added note to DO doc about reference (required but unused) GATK-29 Added doc block to CountIntervals GATK-31 Added @Output PrintStream to enable -o GATK-35 Touched up docs GATK-36 Touched up docs, specified verbosity is optional GATK-60 Corrected GContent annot module location in gatkdocs GATK-68 touched up docs and arg docstrings GATK-16 Added note of caution about calling RODRequiringAnnotations as a group GATK-61 Added run requirements (num samples, min genotype quality) Tweaked template and generic doc block formatting (h2 to h3 titles) GATK-62 Added a caveat to HR annot Made experimental annotation hidden GATK-75 Added setup info regarding BWA GATK-22 Clarified some argument requirements GATK-48 Clarified -G doc comments GATK-67 Added arg requirement GATK-58 Added annotation and usage docs GSATDG-96 Corrected doc Updated MD5 for DiffObjectsIntegrationTests (only change is link in table title) --- .../annotator/DepthPerAlleleBySample.java | 14 +++-- .../gatk/walkers/annotator/GCContent.java | 1 - .../gatk/walkers/annotator/HardyWeinberg.java | 2 + .../walkers/annotator/HomopolymerRun.java | 6 +- .../walkers/annotator/MVLikelihoodRatio.java | 6 ++ .../walkers/annotator/SpanningDeletions.java | 4 +- .../annotator/TandemRepeatAnnotator.java | 5 ++ .../TransmissionDisequilibriumTest.java | 6 +- .../gatk/walkers/bqsr/BaseRecalibrator.java | 6 +- .../bqsr/RecalibrationArgumentCollection.java | 16 +++--- .../compression/reducereads/CompareBAM.java | 6 +- .../compression/reducereads/ReduceReads.java | 6 +- .../targets/BaseCoverageDistribution.java | 6 +- .../diagnostics/targets/DiagnoseTargets.java | 6 +- 
.../targets/FindCoveredIntervals.java | 25 ++++++++ .../walkers/genotyper/UnifiedGenotyper.java | 15 ++--- .../haplotypecaller/HaplotypeCaller.java | 8 +-- .../haplotypecaller/HaplotypeResolver.java | 6 +- .../gatk/walkers/indels/IndelRealigner.java | 43 ++++++++------ .../gatk/walkers/indels/LeftAlignIndels.java | 6 +- .../indels/RealignerTargetCreator.java | 12 ++-- .../walkers/phasing/PhaseByTransmission.java | 6 +- .../gatk/walkers/phasing/PhasingUtils.java | 6 +- .../walkers/phasing/ReadBackedPhasing.java | 6 +- .../validation/GenotypeAndValidate.java | 6 +- .../ValidationSiteSelector.java | 6 +- .../ApplyRecalibration.java | 6 +- .../VariantRecalibrator.java | 13 +++-- .../variantutils/RegenotypeVariants.java | 6 +- .../covariates/ExperimentalCovariate.java | 6 +- .../covariates/RequiredCovariate.java | 6 +- .../covariates/StandardCovariate.java | 6 +- .../DiffObjectsIntegrationTest.java | 8 +-- .../utils/R/gsalib/man/gsalib-package.Rd | 6 +- public/doc/README | 8 +-- .../sting/alignment/CheckAlignment.java | 9 ++- .../sting/commandline/CommandLineProgram.java | 2 +- .../arguments/GATKArgumentCollection.java | 2 +- .../sting/gatk/examples/GATKDocsExample.java | 6 +- .../filters/ReassignMappingQualityFilter.java | 6 +- .../ReassignOneMappingQualityFilter.java | 6 +- .../gatk/walkers/annotator/AlleleBalance.java | 2 +- .../walkers/annotator/VariantAnnotator.java | 13 +++-- .../walkers/beagle/BeagleOutputToVCF.java | 2 +- .../walkers/beagle/ProduceBeagleInput.java | 4 +- .../gatk/walkers/coverage/CallableLoci.java | 6 +- .../walkers/coverage/DepthOfCoverage.java | 6 +- .../walkers/coverage/GCContentByInterval.java | 8 +-- .../diagnostics/CoveredByNSamplesSites.java | 6 +- .../diagnostics/ErrorRatePerCycle.java | 6 +- .../diagnostics/ReadGroupProperties.java | 6 +- .../diagnostics/ReadLengthDistribution.java | 6 +- .../gatk/walkers/diffengine/DiffEngine.java | 5 +- .../gatk/walkers/diffengine/DiffObjects.java | 13 +++-- .../fasta/FastaAlternateReferenceMaker.java 
| 6 +- .../walkers/fasta/FastaReferenceMaker.java | 6 +- .../sting/gatk/walkers/fasta/FastaStats.java | 22 ++++++- .../walkers/filters/VariantFiltration.java | 8 +-- .../sting/gatk/walkers/qc/CountBases.java | 6 +- .../sting/gatk/walkers/qc/CountIntervals.java | 39 ++++++++++++- .../sting/gatk/walkers/qc/CountLoci.java | 19 ++++--- .../sting/gatk/walkers/qc/CountMales.java | 29 ++++++++++ .../sting/gatk/walkers/qc/CountRODs.java | 18 ++++-- .../sting/gatk/walkers/qc/CountRODsByRef.java | 20 +++++-- .../gatk/walkers/qc/CountReadEvents.java | 14 ++--- .../sting/gatk/walkers/qc/CountReads.java | 6 +- .../gatk/walkers/qc/CountTerminusEvent.java | 13 +++-- .../sting/gatk/walkers/qc/FlagStat.java | 53 ++++++++--------- .../sting/gatk/walkers/qc/Pileup.java | 28 ++++++++- .../sting/gatk/walkers/qc/QCRef.java | 6 +- .../gatk/walkers/readutils/ClipReads.java | 6 +- .../gatk/walkers/readutils/PrintReads.java | 12 +++- .../validation/ValidationAmplicons.java | 6 +- .../gatk/walkers/varianteval/VariantEval.java | 6 +- .../walkers/variantutils/CombineVariants.java | 8 +-- .../variantutils/LeftAlignVariants.java | 6 +- .../walkers/variantutils/SelectHeaders.java | 6 +- .../walkers/variantutils/SelectVariants.java | 8 +-- .../variantutils/ValidateVariants.java | 4 +- .../VariantValidationAssessor.java | 6 +- .../walkers/variantutils/VariantsToTable.java | 9 ++- .../walkers/variantutils/VariantsToVCF.java | 6 +- .../sting/tools/CatVariants.java | 57 +++++++++++++++++-- .../utils/codecs/refseq/RefSeqCodec.java | 4 +- .../queue/qscripts/GATKResourcesBundle.scala | 2 +- .../queue/extensions/snpeff/SnpEff.scala | 2 +- settings/helpTemplates/generic.template.html | 2 +- 87 files changed, 550 insertions(+), 307 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 5acea12f6..9f90a1308 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -72,11 +72,11 @@ import java.util.Map; /** - * The depth of coverage of each VCF allele in this sample. + * The depth of coverage of each allele per sample * - * The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this + *

    The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the - * Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of + * caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of * REF and ALT fields) is the unfiltered count of all reads that carried with them the * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the * power I have to determine the genotype of the sample at this site, while the AD tells me how many times @@ -86,10 +86,12 @@ import java.util.Map; * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are - * many non-informatice reads. - * Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation, + * many non-informative reads.

    + * + *

    Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation, * one should not base assumptions about the underlying genotype based on it; - * instead, the genotype likelihoods (PLs) are what determine the genotype calls. + * instead, the genotype likelihoods (PLs) are what determine the genotype calls.

    + * */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 48b3593c5..aa5b779da 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -70,7 +70,6 @@ import java.util.Map; /** * The GC content (# GC bases / # all bases) of the reference within 50 bp +/- this site */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation { public Map annotate(final RefMetaDataTracker tracker, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index 703810025..b349be285 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -69,6 +69,8 @@ import java.util.Map; /** * Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium + * + *

    Requires at least 10 samples in order to run. Only genotypes with sufficient quality (>10) will be taken into account.

    */ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index c25cb6820..f9663d33e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -63,7 +63,11 @@ import java.util.List; import java.util.Map; /** - * Largest contiguous homopolymer run of the variant allele in either direction on the reference. Computed only for bi-allelic sites. + * Largest contiguous homopolymer run of the variant allele in either direction on the reference. + * + *

    Computed only for bi-allelic sites.

    + * + *

    Note that this annotation is no longer supported, as we have found that it does not give satisfactory results. Use at your own risk!

    */ public class HomopolymerRun extends InfoFieldAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index 19f32bae0..58d720899 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -65,10 +65,16 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; /** + * Likelihood of the site being a mendelian violation versus the likelihood of the site transmitting according to mendelian rules. + * + *

    * Given a variant context, uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation * versus the likelihood of the site transmitting according to mendelian rules. This assumes that the organism is * diploid. When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than * the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios. + *

    + * + *

    Note that this annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).

    */ public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index cede1e5ee..c3a0618ef 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -65,7 +65,9 @@ import java.util.Map; /** - * Fraction of reads containing spanning deletions at this site. + * Fraction of reads containing spanning deletions at this site + * + *

    Note that this annotation is currently not compatible with HaplotypeCaller.

    */ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java index 2e0e759c2..d976592cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java @@ -66,6 +66,11 @@ import java.util.List; import java.util.Map; +/** + * Annotates variants that are composed of tandem repeats + * + *

    Note that this annotation is currently not compatible with HaplotypeCaller.

    + */ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardAnnotation { private static final String STR_PRESENT = "STR"; private static final String REPEAT_UNIT_KEY = "RU"; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index b3f5728a2..f29899f7f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -65,9 +65,9 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; /** - * Created by IntelliJ IDEA. - * User: rpoplin, lfran, ebanks - * Date: 11/14/11 + * Wittkowski transmission disequilibrium test + * + *

    Note that this annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).

    */ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index e1972334b..dde49b7db 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -95,14 +95,14 @@ import java.util.List; * *

    * - *

    Input

    + *

    Input

    *

    * The input read data whose base quality scores need to be assessed. *

    * A database of known polymorphic sites to skip over. *

    * - *

    Output

    + *

    Output

    *

    * A GATK Report file with many tables: *

      @@ -116,7 +116,7 @@ import java.util.List; * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx4g -jar GenomeAnalysisTK.jar \
        *   -T BaseRecalibrator \
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
      index 5ab296a5f..ee2edee5a 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
      @@ -146,38 +146,38 @@ public class RecalibrationArgumentCollection {
           public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
       
           /**
      -     * The context covariate will use a context of this size to calculate it's covariate value for base mismatches
      +     * The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
            */
      -    @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
      +    @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false)
           public int MISMATCHES_CONTEXT_SIZE = 2;
       
           /**
      -     * The context covariate will use a context of this size to calculate it's covariate value for base insertions and deletions
      +     * The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
            */
      -    @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
      +    @Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false)
           public int INDELS_CONTEXT_SIZE = 3;
       
           /**
            * The cycle covariate will generate an error if it encounters a cycle greater than this value.
            * This argument is ignored if the Cycle covariate is not used.
            */
      -    @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
      +    @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false)
           public int MAXIMUM_CYCLE_VALUE = 500;
       
           /**
      -     * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
      +     * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is off]
            */
           @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
           public byte MISMATCHES_DEFAULT_QUALITY = -1;
       
           /**
      -     * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on)
      +     * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on]
            */
           @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
           public byte INSERTIONS_DEFAULT_QUALITY = 45;
       
           /**
      -     * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
      +     * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is on]
            */
           @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
           public byte DELETIONS_DEFAULT_QUALITY = 45;
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java
      index a8a765ddc..36da92b4f 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAM.java
      @@ -69,15 +69,15 @@ import java.util.Map;
        * 

      * This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files. *

      - *

      Input

      + *

      Input

      *

      * Two BAM files (using -I) with different read group IDs *

      - *

      Output

      + *

      Output

      *

      * [Output description] *

      - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
      index e89158412..c2c154053 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
      @@ -86,17 +86,17 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
        * shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
        * savings in file size and performance of the downstream tools.
        *
      - * 

      Input

      + *

      Input

      *

      * The BAM file to be compressed *

      * - *

      Output

      + *

      Output

      *

      * The compressed (reduced) BAM file. * *

      - *

      Examples

      + *

      Examples

      *
        * java -Xmx4g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java
      index 37e82a90c..9bd08a020 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java
      @@ -71,17 +71,17 @@ import java.util.Map;
        *  
    1. includes reads with deletions in the loci (optionally can be turned off)
    2. *

      * - *

      Input

      + *

      Input

      *

      * The BAM file and an optional interval list (works for WGS as well) *

      * - *

      Output

      + *

      Output

      *

      * A GATK Report with the coverage distribution per base * *

      - *

      Examples

      + *

      Examples

      *
        * java -Xmx4g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
      index 8b9b37c18..e4310588e 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
      @@ -75,7 +75,7 @@ import java.util.*;
        * 

      *

      *

      - *

      Input

      + *

      Input

      *

      *

        *
      • A reference file
      • @@ -84,12 +84,12 @@ import java.util.*; *
      *

      *

      - *

      Output

      + *

      Output

      *

      * A modified VCF detailing each interval by sample *

      *

      - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java
      index b1a26b7a2..6b4d1f7a8 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java
      @@ -63,6 +63,31 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
       
       import java.io.PrintStream;
       
      +/**
      + * Outputs a list of intervals that are covered above a given threshold.
      + *
      + * 

      The list can be used as an interval list for other walkers. Note that if the -uncovered argument is given, the tool will instead output intervals that fail the coverage threshold.

      + * + *

      Input

      + *

      + * One or more BAM files. + *

      + * + *

      Output

      + *

      + * List of covered (or uncovered) intervals. + *

      + * + *

      Example

      + *
      + * java -Xmx2g -jar GenomeAnalysisTK.jar \
      + *   -T FindCoveredIntervals \
      + *   -R ref.fasta \
      + *   -I my_file.bam \
      + *   -o output.list
      + * 
      + * + */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.CONTIG) @ActiveRegionTraversalParameters(extension = 0, maxRegion = 50000) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 137a1cfa5..4347a1a84 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -86,17 +86,17 @@ import java.util.*; * both single sample data and multi-sample data. *

      * - *

      Input

      + *

      Input

      *

      * The read data from which to make variant calls. *

      * - *

      Output

      + *

      Output

      *

      * A raw, unfiltered, highly sensitive callset in VCF format. *

      * - *

      Example generic command for multi-sample SNP calling

      + *

      Example generic command for multi-sample SNP calling

      *
        * java -jar GenomeAnalysisTK.jar \
        *   -R resources/Homo_sapiens_assembly18.fasta \
      @@ -117,7 +117,7 @@ import java.util.*;
        * argument descriptions below.
        * 

      * - *

      Example command for generating calls at all sites

      + *

      Example command for generating calls at all sites

      *
        * java -jar /path/to/GenomeAnalysisTK.jar \
        *   -l INFO \
      @@ -128,7 +128,7 @@ import java.util.*;
        *   --output_mode EMIT_ALL_SITES
        * 
      * - *

      Caveats

      + *

      Caveats

      *
        *
      • The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and * file formats are likely to change.
      • @@ -167,7 +167,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif * Records that are filtered in the comp track will be ignored. * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). */ - @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false) public List> comps = Collections.emptyList(); public List> getCompRodBindings() { return comps; } @@ -205,7 +205,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif protected List annotationsToExclude = new ArrayList(); /** - * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. + * If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups. + * Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs. */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) protected String[] annotationClassesToUse = { "Standard" }; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 003b8197f..7948b93a9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -96,17 +96,17 @@ import java.util.*; /** * Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM. * - *

        Input

        + *

        Input

        *

        * Input bam file(s) from which to make calls *

        * - *

        Output

        + *

        Output

        *

        * VCF file with raw, unrecalibrated SNP and indel calls. *

        * - *

        Examples

        + *

        Examples

        *
          *   java
          *     -jar GenomeAnalysisTK.jar
        @@ -120,7 +120,7 @@ import java.util.*;
          *     -o output.raw.snps.indels.vcf
          * 
        * - *

        Caveats

        + *

        Caveats

        *
          *
        • The system is under active and continuous development. All outputs, the underlying likelihood model, and command line arguments are likely to change often.
        • *
        diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index c7cc84b9c..4de9488e9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -84,17 +84,17 @@ import java.util.*; * From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants. * Records are annotated with the set and status attributes. * - *

        Input

        + *

        Input

        *

        * 2 variant files to resolve. *

        * - *

        Output

        + *

        Output

        *

        * A single consensus VCF. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx1g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
        index c7d24f475..d3a13df29 100644
        --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
        +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
        @@ -87,7 +87,7 @@ import java.io.IOException;
         import java.util.*;
         
         /**
        - * Performs local realignment of reads based on misalignments due to the presence of indels.
        + * Performs local realignment of reads to correct misalignments due to the presence of indels.
          *
          * 

        * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases @@ -100,39 +100,46 @@ import java.util.*; * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and * specifically identify indels. - *

        + *

        *
          There are 2 steps to the realignment process: *
        1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
        2. *
        3. Running the realigner over those intervals (IndelRealigner)
        4. *
        - *

        - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. *

        - * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them - * (or with reads from similar technologies). + * For more details, see http://www.broadinstitute.org/gatk/guide/article?id=38 + *

        * - *

        Input

        + *

        Input

        *

        * One or more aligned BAM files and optionally one or more lists of known indels. *

        * - *

        Output

        + *

        Output

        *

        * A realigned version of your input BAM file(s). *

        * - *

        Examples

        + *

        Example

        *
          * java -Xmx4g -jar GenomeAnalysisTK.jar \
        - *   -I input.bam \
        - *   -R ref.fasta \
          *   -T IndelRealigner \
        + *   -R ref.fasta \
        + *   -I input.bam \
          *   -targetIntervals intervalListFromRTC.intervals \
          *   -o realignedBam.bam \
          *   [-known /path/to/indels.vcf] \
          *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
          * 
        * + *

        Caveats

        + * + *
        • + * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. + *
        • + * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). + *
        + * * @author ebanks */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @@ -168,7 +175,7 @@ public class IndelRealigner extends ReadWalker { /** * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). */ - @Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) + @Input(fullName="targetIntervals", shortName="targetIntervals", doc="Intervals file output from RealignerTargetCreator", required=true) protected IntervalBinding intervalsFile = null; /** @@ -203,7 +210,7 @@ public class IndelRealigner extends ReadWalker { * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set. */ @Advanced - @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) + @Argument(fullName="entropyThreshold", shortName="entropy", doc="Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)", required=false) protected double MISMATCH_THRESHOLD = 0.15; /** @@ -225,21 +232,21 @@ public class IndelRealigner extends ReadWalker { * For expert users only! */ @Advanced - @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="maximum positional move in basepairs that a read can be adjusted during realignment", required=false) + @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="Maximum positional move in basepairs that a read can be adjusted during realignment", required=false) protected int MAX_POS_MOVE_ALLOWED = 200; /** * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. 
*/ @Advanced - @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) + @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="Max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) protected int MAX_CONSENSUSES = 30; /** * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. */ @Advanced - @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) + @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) protected int MAX_READS_FOR_CONSENSUSES = 120; /** @@ -247,7 +254,7 @@ public class IndelRealigner extends ReadWalker { * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number. */ @Advanced - @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false) + @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="Max reads allowed at an interval for realignment", required=false) protected int MAX_READS = 20000; @Advanced @@ -263,7 +270,7 @@ public class IndelRealigner extends ReadWalker { * * Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing). 
*/ - @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file") + @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file (not compatible with -output)") protected String N_WAY_OUT = null; @Hidden diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index ff21893f1..532d13690 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -68,17 +68,17 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * placed at multiple positions and still represent the same haplotype. While a standard convention is to place an * indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. * - *

        Input

        + *

        Input

        *

        * A bam file to left-align. *

        * - *

        Output

        + *

        Output

        *

        * A left-aligned bam. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx3g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
        index 1ee04e317..caeb1e8d7 100644
        --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
        +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
        @@ -99,22 +99,22 @@ import java.util.TreeSet;
          * Important note 3: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
          * (or with reads from similar technologies).   This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.
          *
        - * 

        Input

        + *

        Input

        *

        * One or more aligned BAM files and optionally one or more lists of known indels. *

        * - *

        Output

        + *

        Output

        *

        * A list of target intervals to pass to the Indel Realigner. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
        - *   -I input.bam \
        - *   -R ref.fasta \
          *   -T RealignerTargetCreator \
        + *   -R ref.fasta \
        + *   -I input.bam \
          *   -o forIndelRealigner.intervals \
          *   [--known /path/to/indels.vcf]
          * 
        @@ -143,7 +143,7 @@ public class RealignerTargetCreator extends RodWalker> known = Collections.emptyList(); /** - * Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart. + * Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart. Must be > 1. */ @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false) protected int windowSize = 10; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 54a324411..a4c1caf86 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -90,7 +90,7 @@ import java.util.*; *
      • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
      • *
      * - *

      Input

      + *

      Input

      *

      *

        *
      • A VCF variant set containing trio(s) and/or parent/child pair(s).
      • @@ -108,12 +108,12 @@ import java.util.*; *
      *

      * - *

      Output

      + *

      Output

      *

      * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous.. *

      * - *

      Examples

      + *

      Examples

      *
        * java -Xmx2g -jar GenomeAnalysisTK.jar \
        *   -R ref.fasta \
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java
      index eb2bb62ef..bb8c14ef7 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingUtils.java
      @@ -65,17 +65,17 @@ import java.util.*;
        * [Functionality of this walker]
        * 

      *

      - *

      Input

      + *

      Input

      *

      * [Input description] *

      *

      - *

      Output

      + *

      Output

      *

      * [Output description] *

      *

      - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
      index 7f2cdd3d0..c1b484542 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
      @@ -86,17 +86,17 @@ import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersF
        * Performs physical phasing of SNP calls, based on sequencing reads.
        * 

      * - *

      Input

      + *

      Input

      *

      * VCF file of SNP calls, BAM file of sequence reads. *

      * - *

      Output

      + *

      Output

      *

      * Phased VCF file. *

      * - *

      Examples

      + *

      Examples

      *
        *    java
        *      -jar GenomeAnalysisTK.jar
      diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java
      index d6a814ee8..6af39c0b0 100644
      --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java
      +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidate.java
      @@ -99,14 +99,14 @@ import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel;
        *  

      * * - *

      Input

      + *

      Input

      *

      * A BAM file to make calls on and a VCF file to use as truth validation dataset. * * You also have the option to invert the roles of the files using the command line options listed below. *

      * - *

      Output

      + *

      Output

      *

      * GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a * 2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true @@ -176,7 +176,7 @@ import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; * * * - *

      Examples

      + *

      Examples

      *
        *
      1. * Genotypes BAM file from new technology using the VCF as a truth dataset: diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java index 5c216928b..d587c305e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelector.java @@ -85,17 +85,17 @@ import java.util.*; * * User can additionally restrict output to a particular type of variant (SNP, Indel, etc.) * - *

        Input

        + *

        Input

        *

        * One or more variant sets to choose from. *

        * - *

        Output

        + *

        Output

        *

        * A sites-only VCF with the desired number of randomly selected sites. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
        index f2120213a..22425e62e 100644
        --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
        +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
        @@ -81,7 +81,7 @@ import java.util.*;
          * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
          * slightly lower quality level.
          *
        - * 

        Input

        + *

        Input

        *

        * The input raw variants to be recalibrated. *

        @@ -89,11 +89,11 @@ import java.util.*; *

        * The tranches file that was generated by the VariantRecalibrator walker. * - *

        Output

        + *

        Output

        *

        * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level. * - *

        Examples

        + *

        Examples

        *
          * java -Xmx3g -jar GenomeAnalysisTK.jar \
          *   -T ApplyRecalibration \
        diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
        index 57d9c219c..99d926ea5 100644
        --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
        +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
        @@ -80,6 +80,7 @@ import java.util.*;
          *
          * 

        * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker. + *

        * *

        * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. @@ -91,24 +92,26 @@ import java.util.*; * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. + *

        * *

        * NOTE: In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). * See http://www.r-project.org for more info on how to download and install R. + *

        * - *

        Input

        + *

        Input

        *

        * The input raw variants to be recalibrated. *

        * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. * - *

        Output

        + *

        Output

        *

        * A recalibration table file in VCF format that is used by the ApplyRecalibration walker. *

        * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. * - *

        Example

        + *

        Example

        *
          * java -Xmx4g -jar GenomeAnalysisTK.jar \
          *   -T VariantRecalibrator \
        @@ -152,7 +155,7 @@ public class VariantRecalibrator extends RodWalker> resource = Collections.emptyList();
         
             /////////////////////////////
        @@ -170,7 +173,7 @@ public class VariantRecalibrator extends RodWalkerInput
        + * 

        Input

        *

        * A variant set to regenotype. *

        * - *

        Output

        + *

        Output

        *

        * A re-genotyped VCF. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java
        index 5469b38c8..a16fdcaa1 100644
        --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java
        +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java
        @@ -53,17 +53,17 @@ package org.broadinstitute.sting.utils.recalibration.covariates;
          * [Functionality of this walker]
          * 

        *

        - *

        Input

        + *

        Input

        *

        * [Input description] *

        *

        - *

        Output

        + *

        Output

        *

        * [Output description] *

        *

        - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java
        index bb55ed0c5..4267c1ffd 100644
        --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java
        +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java
        @@ -53,17 +53,17 @@ package org.broadinstitute.sting.utils.recalibration.covariates;
          * [Functionality of this walker]
          * 

        *

        - *

        Input

        + *

        Input

        *

        * [Input description] *

        *

        - *

        Output

        + *

        Output

        *

        * [Output description] *

        *

        - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java
        index 9ade37019..045b21527 100644
        --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java
        +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java
        @@ -53,17 +53,17 @@ package org.broadinstitute.sting.utils.recalibration.covariates;
          * [Functionality of this walker]
          * 

        *

        - *

        Input

        + *

        Input

        *

        * [Input description] *

        *

        - *

        Output

        + *

        Output

        *

        * [Output description] *

        *

        - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java
        index c93f68ef8..5a308928d 100644
        --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java
        +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java
        @@ -74,10 +74,10 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
         
             @DataProvider(name = "data")
             public Object[][] createData() {
        -        new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", true, "aea3d5df32a2acd400da48d06b4dbc60");
        -        new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", true, "3f46f5a964f7c34015d972256fe49a35");
        -        new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", false, "e71e23e7ebfbe768e59527bc62f8918d");
        -        new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", false, "47bf16c27c9e2c657a7e1d13f20880c9");
        +        new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", true, "71869ddf9665773a842a9def4cc5f3c8");
        +        new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", true, "cec7c644c84ef9c96aacaed604d9ec9b");
        +        new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", false, "47546e03344103020e49d8037a7e0727");
        +        new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", false, "d27b37f7a366c8dacca5cd2590d3c6ce");
                 return TestParams.getTests(TestParams.class);
             }
         
        diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd
        index dc7a08287..4a49cf932 100644
        --- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd
        +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd
        @@ -19,9 +19,11 @@ Medical and Population Genetics Program
         Maintainer: Kiran Garimella
         }
         \references{
        -GSA wiki page: http://www.broadinstitute.org/gatk
        +GATK website: http://www.broadinstitute.org/gatk
         
        -GATK help forum: http://www.broadinstitute.org/gatk
        +GATK documentation guide: http://www.broadinstitute.org/gatk/guide
        +
        +GATK help forum: http://gatkforums.broadinstitute.org
         }
         \examples{
         ## get script arguments in interactive and non-interactive mode
        diff --git a/public/doc/README b/public/doc/README
        index ec5fa8500..e70ced0df 100644
        --- a/public/doc/README
        +++ b/public/doc/README
        @@ -59,7 +59,7 @@ index (.fasta.fai).
         
         Instructions for preparing input files are available here:
         
        -http://www.broadinstitute.org/gsa/wiki/index.php/Preparing_input_files
        +http://www.broadinstitute.org/gatk/guide/article?id=1204
         
         The bundled 'resources' directory  contains an example BAM and fasta.
         
        @@ -69,7 +69,7 @@ The GATK is distributed with a few standard analyses, including PrintReads,
         Pileup, and DepthOfCoverage.  More information on the included walkers is
         available here:
         
        -http://www.broadinstitute.org/gsa/wiki/index.php/Built-in_walkers
        +http://www.broadinstitute.org/gatk/gatkdocs
         
         To print the reads of the included sample data, untar the package into
         the GenomeAnalysisTK directory and run the following command:
        @@ -81,6 +81,6 @@ java -jar GenomeAnalysisTK/GenomeAnalysisTK.jar \
         
         Support
         -------
        -Documentation for the GATK is available at http://www.broadinstitute.org/gsa/wiki.  
        +Documentation for the GATK is available at http://www.broadinstitute.org/gatk/guide.
         For help using the GATK, developing analyses with the GATK, bug reports, 
        -or feature requests, please email gsadevelopers@broadinstitute.org.
        +or feature requests, please visit our support forum at http://gatkforums.broadinstitute.org/
        diff --git a/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java b/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java
        index 93b4d5e6f..d313f35ce 100644
        --- a/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java
        +++ b/public/java/src/org/broadinstitute/sting/alignment/CheckAlignment.java
        @@ -42,9 +42,14 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
         import java.util.Iterator;
         
         /**
        - * Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them
        + * Validates consistency of the aligner interface
        + *
        + * 

        Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them * of their alignment data, realigning them, and making sure one of the best resulting realignments matches the original - * alignment from the input file. + * alignment from the input file.

        + * + *

        Caveat

        + *

        This tool requires that BWA be available on the java path.

        * * @author mhanna * @version 0.1 diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index 08aa5f8b3..cf11bb61c 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -370,7 +370,7 @@ public abstract class CommandLineProgram { errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); errorPrintf("%n"); - errorPrintf("Please visit the wiki to see if this is a known problem%n"); + errorPrintf("Please check the documentation guide to see if this is a known problem%n"); errorPrintf("If not, please post the error, with stack trace, to the GATK forum%n"); printDocumentationReference(); if ( msg == null ) // some exceptions don't have detailed messages diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index a3e19b944..a9016708b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -206,7 +206,7 @@ public class GATKArgumentCollection { * Enables on-the-fly recalibrate of base qualities. The covariates tables are produced by the BaseQualityScoreRecalibrator tool. * Please be aware that one should only run recalibration with the covariates file created on the same input bam(s). 
*/ - @Input(fullName="BQSR", shortName="BQSR", required=false, doc="The input covariates table file which enables on-the-fly base quality score recalibration") + @Input(fullName="BQSR", shortName="BQSR", required=false, doc="The input covariates table file which enables on-the-fly base quality score recalibration (intended for use with BaseRecalibrator and PrintReads)") public File BQSR_RECAL_FILE = null; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java index 362cb202e..fcae3cc68 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java @@ -41,17 +41,17 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; * [Functionality of this walker] *

        * - *

        Input

        + *

        Input

        *

        * [Input description] *

        * - *

        Output

        + *

        Output

        *

        * [Output description] *

        * - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java
        index e0166ab38..41ab59845 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java
        @@ -37,18 +37,18 @@ import org.broadinstitute.sting.commandline.Argument;
          *  

        * * - *

        Input

        + *

        Input

        *

        * BAM file(s) *

        * * - *

        Output

        + *

        Output

        *

        * BAM file(s) with all reads mapping qualities reassigned *

        * - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java
        index c894dd801..f31313a86 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java
        @@ -47,18 +47,18 @@ import org.broadinstitute.sting.commandline.Argument;
          *  

        * * - *

        Input

        + *

        Input

        *

        * BAM file(s) *

        * * - *

        Output

        + *

        Output

        *

        * BAM file(s) with one read mapping quality selectively reassigned as desired *

        * - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java
        index 73c31ef66..6e7bc9805 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java
        @@ -46,7 +46,7 @@ import java.util.Map;
         
         
         /**
        - * The allele balance (fraction of ref bases over ref + alt bases) across all bialleleic het-called samples
        + * The allele balance (fraction of ref bases over ref + alt bases) across all biallelic het-called samples
          */
         public class AlleleBalance extends InfoFieldAnnotation {
         
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
        index 826dc9f22..fa3ab885d 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
        @@ -55,17 +55,17 @@ import java.util.*;
          * VariantAnnotator is a GATK tool for annotating variant calls based on their context.
          * The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself.
          *
        - * 

        Input

        + *

        Input

        *

        * A variant set to annotate and optionally one or more BAM files. *

        * - *

        Output

        + *

        Output

        *

        * An annotated VCF. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        @@ -142,7 +142,8 @@ public class VariantAnnotator extends RodWalker implements Ann
             protected List annotationsToExclude = new ArrayList();
         
             /**
        -     * See the -list argument to view available groups.
        +     * If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups.
        +     * Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs.
              */
             @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
             protected List annotationGroupsToUse = new ArrayList();
        @@ -166,13 +167,13 @@ public class VariantAnnotator extends RodWalker implements Ann
             /**
              * Note that the --list argument requires a fully resolved and correct command-line to work.
              */
        -    @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
        +    @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit", required=false)
             protected Boolean LIST = false;
         
             /**
              * By default, the dbSNP ID is added only when the ID field in the variant VCF is empty.
              */
        -    @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated")
        +    @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="In conjunction with the dbSNP binding, append the dbSNP ID even when the variant VCF already has the ID field populated", required=false)
             protected Boolean ALWAYS_APPEND_DBSNP_ID = false;
             public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; }
         
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java
        index 2e85fe8f9..4b96dbffb 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java
        @@ -61,7 +61,7 @@ import static java.lang.Math.log10;
          * Note that this walker requires all input files produced by Beagle.
          *
          *
        - * 

        Example

        + *

        Example

        *
          *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
          *      -R reffile.fasta -T BeagleOutputToVCF \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java
        index 937c3abc0..618fda0df 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java
        @@ -57,7 +57,7 @@ import java.util.*;
          *  Converts the input VCF into a format accepted by the Beagle imputation/analysis program.
          * 

        * - *

        Input

        + *

        Input

        *

        * A VCF with variants to convert to Beagle format *

        @@ -70,7 +70,7 @@ import java.util.*; * Optional: A file with a list of markers *

        * - *

        Examples

        + *

        Examples

        *
          *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
          *      -R reffile.fasta -T ProduceBeagleInput \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java
        index 0681ebf1e..a2efa626c 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java
        @@ -70,12 +70,12 @@ import java.io.PrintStream;
          * 
          * 

        *

        - *

        Input

        + *

        Input

        *

        * A BAM file containing exactly one sample. *

        *

        - *

        Output

        + *

        Output

        *

        *

          *
        • -o: a OutputFormatted (recommended BED) file with the callable status covering each base
        • @@ -83,7 +83,7 @@ import java.io.PrintStream; *
        *

        *

        - *

        Examples

        + *

        Examples

        *
          *     -T CallableLociWalker \
          *     -I my.bam \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
        index 3bd114aa1..61574d947 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
        @@ -66,7 +66,7 @@ import java.util.*;
          * and/or percentage of bases covered to or beyond a threshold.
          * Additionally, reads and bases can be filtered by mapping or base quality score.
          *
        - * 

        Input

        + *

        Input

        *

        * One or more bam files (with proper headers) to be analyzed for coverage statistics *

        @@ -75,7 +75,7 @@ import java.util.*; *

        * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation) *

        - *

        Output

        + *

        Output

        *

        * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: *

        @@ -98,7 +98,7 @@ import java.util.*; * - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java
        index 9a6ef61d8..2975df4a5 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java
        @@ -44,21 +44,21 @@ import java.util.List;
          * Walks along reference and calculates the GC content for each interval.
          *
          *
        - * 

        Input

        + *

        Input

        *

        * A reference file *

        * - *

        Output

        + *

        Output

        *

        * GC content calculations per interval. *

        * - *

        Examples

        + *

        Example

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
        - *   -R ref.fasta \
          *   -T GCContentByInterval \
        + *   -R ref.fasta \
          *   -o output.txt \
          *   -L input.intervals
          * 
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java index a5a8edb0c..169c2708b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java @@ -50,17 +50,17 @@ import java.util.Collection; * CoveredByNSamplesSites is a GATK tool for filter out sites based on their coverage. * The sites that pass the filter are printed out to an intervals file. * - *

        Input

        + *

        Input

        *

        * A variant file and optionally min coverage and sample percentage values. *

        * - *

        Output

        + *

        Output

        *

        * An intervals file. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
        index 76f5478a4..86676ca54 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
        @@ -49,12 +49,12 @@ import java.io.PrintStream;
          * Emits a GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate for each read
          * group in the input BAMs FOR ONLY THE FIRST OF PAIR READS.
          *
        - * 

        Input

        + *

        Input

        *

        * Any number of BAM files *

        * - *

        Output

        + *

        Output

        *

        * GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate. * @@ -82,7 +82,7 @@ import java.io.PrintStream; *

        *

        * - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
        index de7ac3e41..0af1dbed5 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
        @@ -53,12 +53,12 @@ import java.util.Map;
          * the median statistics are well determined.  It is safe to run it WG and it'll finish in an appropriate
          * timeframe.
          *
        - * 

        Input

        + *

        Input

        *

        * Any number of BAM files *

        * - *

        Output

        + *

        Output

        *

        * GATKReport containing read group, sample, library, platform, center, median insert size and median read length. * @@ -86,7 +86,7 @@ import java.util.Map; *

        *

        * - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
        index ccad7f0b2..a269a94bc 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
        @@ -49,17 +49,17 @@ import java.util.List;
          *  

        * * - *

        Input

        + *

        Input

        *

        * A BAM file. *

        * - *

        Output

        + *

        Output

        *

        * A human/R readable table of tab separated values with one column per sample and one row per read. *

        * - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
        index 7ac59790c..c909eb2d5 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
        @@ -83,7 +83,7 @@ public class DiffEngine {
                     DiffElement masterElt = master.getElement(name);
                     DiffElement testElt = test.getElement(name);
                     if ( masterElt == null && testElt == null ) {
        -                throw new ReviewedStingException("BUG: unexceptedly got two null elements for field: " + name);
        +                throw new ReviewedStingException("BUG: unexpectedly got two null elements for field: " + name);
                     } else if ( masterElt == null || testElt == null ) { // if either is null, we are missing a value
                         // todo -- should one of these be a special MISSING item?
                         diffs.add(new Difference(masterElt, testElt));
        @@ -283,8 +283,7 @@ public class DiffEngine {
                 // now that we have a specific list of values we want to show, display them
                 GATKReport report = new GATKReport();
                 final String tableName = "differences";
        -        // TODO for Geraldine -- link needs to be updated below
        -        report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", 3);
        +        report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gatk/guide/article?id=1299 for more information", 3);
                 final GATKReportTable table = report.getTable(tableName);
                 table.addColumn("Difference");
                 table.addColumn("NumberOfOccurrences");
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java
        index d1903c2bb..6b5189dfd 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java
        @@ -68,12 +68,12 @@ import java.util.List;
          *      The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them.  This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others.
          * 

        * - *

        Input

        + *

        Input

        *

        * The DiffObjectsWalker works with BAM or VCF files. *

        * - *

        Output

        + *

        Output

        *

        * The DiffEngine system compares to two hierarchical data structures for specific differences in the values of named * nodes. Suppose I have two trees: @@ -132,6 +132,10 @@ import java.util.List; [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC 1

        * + *

        Caveat

        + *

        Because this is a walker, it requires that you pass a reference file. However the reference is not actually used, so it does not matter what you pass as reference.

        + * + * * @author Mark DePristo * @since 7/4/11 */ @@ -140,8 +144,7 @@ public class DiffObjects extends RodWalker { /** * Writes out a file of the DiffEngine format: * - * TODO for Geraldine -- link needs to be updated below (and also in SelectVariants and RefSeqCodec GATK docs) - * http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine + * See http://www.broadinstitute.org/gatk/guide/article?id=1299 for details. */ @Output(doc="File to which results should be written",required=true) protected PrintStream out; @@ -169,7 +172,7 @@ public class DiffObjects extends RodWalker { @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) int MAX_OBJECTS_TO_READ = -1; - @Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) + @Argument(fullName="maxRawDiffsToSummarize", shortName="maxRawDiffsToSummarize", doc="Max. number of differences to include in the summary. -1 [default] means unlimited", required=false) int maxRawDiffsToSummary = -1; @Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index e881315b9..d2f2e32b3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -60,17 +60,17 @@ import java.util.List; * 3) this tool works only for SNPs and for simple indels (but not for things like complex substitutions). 
* Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). * - *

        Input

        + *

        Input

        *

        * The reference, requested intervals, and any number of variant rod files. *

        * - *

        Output

        + *

        Output

        *

        * A fasta file representing the requested intervals. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java
        index f2f5fb5fe..fb7941fec 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java
        @@ -48,17 +48,17 @@ import java.io.PrintStream;
          * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
          * separate fasta sequence (named numerically in order).
          *
        - * 

        Input

        + *

        Input

        *

        * The reference and requested intervals. *

        * - *

        Output

        + *

        Output

        *

        * A fasta file representing the requested intervals. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java
        index 9fbaca14e..8883523d9 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java
        @@ -38,7 +38,27 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
         import java.io.PrintStream;
         
         /**
        - * Calculates basic statistics about the reference sequence itself
        + * Calculate basic statistics about the reference sequence itself
        + *
        + * 

        These are very basic statistics: total number of bases and number of "regular" bases (i.e. A, C, T or G).

        + * + *

        Input

        + *

        + * A FASTA reference file. + *

        + * + *

        Output

        + *

        + * Base counts are written to file if an output file name is given (with -o), otherwise output to stdout. + *

        + * + *

        Example

        + *
        + * java -Xmx2g -jar GenomeAnalysisTK.jar \
        + *   -T FastaStats \
        + *   -R ref.fasta \
        + *   [-o output.txt]
        + * 
        */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class FastaStats extends RefWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java index 61a847f4c..c59c61803 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java @@ -55,17 +55,17 @@ import java.util.*; * VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria. * Records are hard-filtered by changing the value in the FILTER field to something other than PASS. * - *

        Input

        + *

        Input

        *

        * A variant set to filter. *

        * - *

        Output

        + *

        Output

        *

        * A filtered VCF. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        @@ -114,7 +114,7 @@ public class VariantFiltration extends RodWalker {
              * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
              * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1").
              */
        -    @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false)
        +    @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)", required=false)
             protected ArrayList GENOTYPE_FILTER_EXPS = new ArrayList();
         
             /**
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java
        index 503cdb6d6..8b82e50a7 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java
        @@ -38,17 +38,17 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
         /**
          * Walks over the input data set, calculating the number of bases seen for diagnostic purposes.
          *
        - * 

        Input

        + *

        Input

        *

        * One or more BAM files. *

        * - *

        Output

        + *

        Output

        *

        * Number of bases seen. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java
        index 3b8eba398..e7b6df623 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java
        @@ -45,9 +45,42 @@ import java.util.Collections;
         import java.util.List;
         
         /**
        - * Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but
        - * very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to.
        - * This was its very first use.
        + * Count contiguous regions in an interval list.
        + *
        + * 

        When the GATK reads in intervals from an intervals list, any intervals that overlap each other get merged into + * a single interval spanning the original ones. For example, if you have the following intervals: + *

        • + * 20:1-2000 + *
        • + * 20:1500-3000 + *
        + * They will be merged into a single interval: + *
        • 20:1-3000
        + * + * This tool allows you to check, for a given list of intervals, how many separate intervals the GATK will actually + * distinguish at runtime. + *

        + * + *

        Input

        + *

        + * One or more rod files containing intervals to check. + *

        + * + *

        Output

        + *

        + * Number of separate intervals identified by GATK after merging overlapping intervals. + *

        + * + * You can use the -numOverlaps argument to find out how many cases you have of a specific number of overlaps. + * + *

        Example

        + *
        + * java -Xmx2g -jar GenomeAnalysisTK.jar \
        + *   -T CountIntervals \
        + *   -R ref.fasta \
        + *   -0 output.txt \
        + *   -check intervals.list
        + * 
        */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class CountIntervals extends RefWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java index f2bd791c1..d999dfebf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java @@ -42,33 +42,34 @@ import java.io.PrintStream; * Walks over the input data set, calculating the total number of covered loci for diagnostic purposes. * *

        - * Simplest example of a locus walker. + * This is the simplest example of a locus walker. + *

        * - * - *

        Input

        + *

        Input

        *

        * One or more BAM files. *

        * - *

        Output

        + *

        Output

        *

        - * Number of loci traversed. + * Number of loci traversed. If an output file name is provided, then the result will be written to that file. + * Otherwise it will be sent to standard console output. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
        - *   -R ref.fasta \
          *   -T CountLoci \
        - *   -o output.txt \
        + *   -R ref.fasta \
          *   -I input.bam \
        + *   -o output.txt \
          *   [-L input.intervals]
          * 
        * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class CountLoci extends LocusWalker implements TreeReducible, NanoSchedulable { - @Output(doc="Write count to this file instead of STDOUT") + @Output PrintStream out; public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java index 6fb4b84d6..7279a64a4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -37,12 +38,36 @@ import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; + /** * Walks over the input data set, calculating the number of reads seen from male samples for diagnostic purposes. + * + *

        Input

        + *

        + * One or more BAM files. + *

        + * + *

        Output

        + *

        + * Number of reads seen from male samples. + *

        + * + *

        Examples

        + *
        + * java -Xmx2g -jar GenomeAnalysisTK.jar \
        + *   -T CountMales \
        + *   -R ref.fasta \
        + *   -I samples.bam \
        + *   -o output.txt
        + * 
        */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountMales extends ReadWalker { + @Output + public PrintStream out; + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { Sample sample = getSampleDB().getSample(read); return sample.getGender() == Gender.MALE ? 1 : 0; @@ -53,4 +78,8 @@ public class CountMales extends ReadWalker { public Integer reduce(Integer value, Integer sum) { return value + sum; } + + public void onTraversalDone( Integer c ) { + out.println(c); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java index c01a1df89..65f82efe4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java @@ -53,22 +53,32 @@ import java.util.*; /** * Prints out counts of the number of reference ordered data objects encountered. * + *

        CountRods is a RODWalker, and so traverses the data by ROD. For example if the ROD passed to it is a VCF file, + * it will count the variants in the file.

        * - *

        Input

        + *

        Note that this tool is different from CountRodsByRef which is a RefWalker, and so traverses the data by + * position along the reference. CountRodsByRef can count ROD elements (such as, but not limited to, variants) found + * at each position or within specific intervals if you use the -L argument (see CommandLineGATK).

        + * + *

        Both these tools are different from CountVariants in that they are more generic (they can also count RODs that + * are not variants) and CountVariants is more detailed, in that it computes additional statistics (type of variants + * being indels vs. SNPs etc).

        + * + *

        Input

        *

        * One or more rod files. *

        * - *

        Output

        + *

        Output

        *

        * Number of rods seen. *

        * - *

        Examples

        + *

        Example

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
        - *   -R ref.fasta \
          *   -T CountRODs \
        + *   -R ref.fasta \
          *   -o output.txt \
          *   --rod input.vcf
          * 
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java index 303f1704f..594ca239d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRef.java @@ -43,24 +43,34 @@ import java.util.Collections; import java.util.List; /** - * Prints out counts of the number of reference ordered data objects encountered. + * Prints out counts of the number of reference ordered data objects encountered along the reference. * + *

        CountRodsByRef is a RefWalker, and so traverses the data by position along the reference. It counts ROD + * elements (such as, but not limited to, variants) found at each position or within specific intervals if you use + * the -L argument (see CommandLineGATK).

        * - *

        Input

        + *

        Note that this tool is different from the basic CountRods, which is a RODWalker, and so traverses the data by + * ROD. For example if the ROD passed to it is a VCF file, CountRods will simply count the variants in the file.

        + * + *

        Both these tools are different from CountVariants in that they are more generic (they can also count RODs that + * are not variants) and CountVariants is more detailed, in that it computes additional statistics (type of variants + * being indels vs. SNPs etc).

        + * + *

        Input

        *

        * One or more rod files. *

        * - *

        Output

        + *

        Output

        *

        * Number of rods seen. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
        - *   -R ref.fasta \
          *   -T CountRODsByRef \
        + *   -R ref.fasta \
          *   -o output.txt \
          *   --rod input.vcf
          * 
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java index 8b0646092..cfb7325a9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java @@ -47,22 +47,22 @@ import java.util.Map; /** * Walks over the input data set, counting the number of read events (from the CIGAR operator) * - *

        Input

        + *

        Input

        *

        * One or more BAM files. *

        * - *

        Output

        + *

        Output

        *

        - * Number of reads events for each category + * Number of read events for each category, formatted as a GATKReport table. * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
        - *   -R ref.fasta \
          *   -T CountReadEvents \
        - *   -o output.grp \
        + *   -R ref.fasta \
          *   -I input.bam \
        + *   -o output.grp \
          *   [-L input.intervals]
          * 
        */ @@ -70,7 +70,7 @@ import java.util.Map; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountReadEvents extends ReadWalker> , Map>> { - @Output (doc = "GATKReport table output") + @Output PrintStream out; public Map> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 1a3984014..825fcac90 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -44,17 +44,17 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * --read-filter command line argument). Simplest example of a read-backed analysis. * * - *

        Input

        + *

        Input

        *

        * One or more BAM files. *

        * - *

        Output

        + *

        Output

        *

        * Number of reads seen. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java
        index 40b78588f..54562aa43 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java
        @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.qc;
         
         import net.sf.samtools.CigarElement;
         import net.sf.samtools.CigarOperator;
        +import org.broadinstitute.sting.commandline.Output;
         import org.broadinstitute.sting.gatk.CommandLineGATK;
         import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
         import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
        @@ -39,22 +40,23 @@ import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
         import org.broadinstitute.sting.utils.help.HelpConstants;
         import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
         
        +import java.io.PrintStream;
         import java.util.List;
         
         /**
          * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips
          *
        - * 

        Input

        + *

        Input

        *

        * One or more BAM files. *

        * - *

        Output

        + *

        Output

        *

        * Number of reads ending in each category. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        @@ -67,6 +69,9 @@ import java.util.List;
         @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
         @Requires({DataSource.READS, DataSource.REFERENCE})
         public class CountTerminusEvent extends ReadWalker, Pair> {
        +    @Output
        +    public PrintStream out;
        +
             public Pair map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) {
                 List cigarElements = read.getCigar().getCigarElements();
         
        @@ -94,6 +99,6 @@ public class CountTerminusEvent extends ReadWalker, Pair result) {
        -        System.out.println(String.format("\tReads ending in indels : %d\n\tReads ending in soft-clips: %d\n", result.getFirst(), result.getSecond()));
        +        out.println(String.format("\tReads ending in indels : %d\n\tReads ending in soft-clips: %d\n", result.getFirst(), result.getSecond()));
             }
         }
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java
        index d0a3f3508..17fb4e322 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/FlagStat.java
        @@ -41,36 +41,31 @@ import java.io.PrintStream;
         import java.text.DecimalFormat;
         import java.text.NumberFormat;
         
        -
        -/*
        - * Copyright (c) 2009 The Broad Institute
        - *
        - * Permission is hereby granted, free of charge, to any person
        - * obtaining a copy of this software and associated documentation
        - * files (the "Software"), to deal in the Software without
        - * restriction, including without limitation the rights to use,
        - * copy, modify, merge, publish, distribute, sublicense, and/or sell
        - * copies of the Software, and to permit persons to whom the
        - * Software is furnished to do so, subject to the following
        - * conditions:
        - *
        - * The above copyright notice and this permission notice shall be
        - * included in all copies or substantial portions of the Software.
        - *
        - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
        - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
        - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
        - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
        - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
        - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
        - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
        - * OTHER DEALINGS IN THE SOFTWARE.
        - */
        -
         /**
        - * A reimplementation of the 'samtools flagstat' subcommand in the GATK.  Walks
        - * over all input data, accumulating statistics such as total number of reads,
        - * reads with QC failure flag set, number of duplicates, percentage mapped, etc.
        + * A reimplementation of the 'samtools flagstat' subcommand in the GATK
        + *
        + * 

        This tool walks over all input data, accumulating statistics such as total number of reads, + * reads with QC failure flag set, number of duplicates, percentage mapped, etc.

        + * + *

        Input

        + *

        + * A BAM file containing the sequence data. + *

        + * + *

        Output

        + *

        + * Resulting stats are written to file if an output file name is given (with -o), otherwise output to stdout. + *

        + * + *

        Example

        + *
        + * java -Xmx2g -jar GenomeAnalysisTK.jar \
        + *   -T FlagStat \
        + *   -R ref.fasta \
        + *   -I reads.bam \
        + *   [-o output.txt]
        + * 
        + * * @author aaron */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java index 0790f2ced..bc98c670a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java @@ -49,11 +49,33 @@ import java.util.Collections; import java.util.List; /** - * Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, + * Emulates the samtools pileup command to print aligned reads + * + *

        Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. * - * Associated command: + * Emulated command: * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] + * + *

        Input

        + *

        + * A BAM file and the interval to print. + *

        + * + *

        Output

        + *

        + * Formatted pileup-style alignment of reads. + *

        + * + *

        Example

        + *
        + * java -Xmx2g -jar GenomeAnalysisTK.jar \
        + *   -T Pileup \
        + *   -R ref.fasta \
        + *   -I aligned_reads.bam \
        + *   -o output.txt
        + * 
        + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { @@ -68,7 +90,7 @@ public class Pileup extends LocusWalker implements TreeReducibl * and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per * read items are delimited with an '@' character. */ - @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output") + @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false) public boolean SHOW_VERBOSE = false; @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java index 395945f03..48bd6feba 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRef.java @@ -44,17 +44,17 @@ import java.io.PrintStream; * Quality control for the reference fasta * * - *

        Input

        + *

        Input

        *

        * One reference file only. And optionally -L intervals *

        * - *

        Output

        + *

        Output

        *

        * If ok, nothing, else will throw an exception at the site where there's been a problem *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java
        index f7b125828..739da5a98 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java
        @@ -88,12 +88,12 @@ import java.util.regex.Pattern;
          *
          * 

        * - *

        Input

        + *

        Input

        *

        * Any number of BAM files. *

        * - *

        Output

        + *

        Output

        *

        * A new BAM file containing all of the reads from the input BAMs with the user-specified clipping * operation applied to each read. @@ -145,7 +145,7 @@ import java.util.regex.Pattern; *

        *

        * - *

        Examples

        + *

        Examples

        *
          *     -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
          *     -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java
        index 8a1178574..475f7a25d 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java
        @@ -56,18 +56,24 @@ import java.util.*;
          * PrintReads can dynamically merge the contents of multiple input BAM files, resulting
          * in merged output sorted in coordinate order.  Can also optionally filter reads based on the
          * --read_filter command line argument.
        + * 

        * - *

        Input

        + *

        + * Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow, + * it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below. + *

        + * + *

        Input

        *

        * One or more bam files. *

        * - *

        Output

        + *

        Output

        *

        * A single processed bam file. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
        index 45c5fe090..c75997e67 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
        @@ -65,13 +65,13 @@ import java.util.List;
          * reasons why the site may fail validation (nearby variation, for example).
          * 

        * - *

        Input

        + *

        Input

        *

        * Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an * interval list defining the size of the amplicons around the sites to be validated *

        * - *

        Output

        + *

        Output

        *

        * Output is a FASTA-formatted file with some modifications at probe sites. For instance: *

        @@ -100,7 +100,7 @@ import java.util.List;
          * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
          * 

        * - *

        Examples

        + *

        Examples

        *
          *    java
          *      -jar GenomeAnalysisTK.jar
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java
        index a3e480bd0..06fa455be 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java
        @@ -77,12 +77,12 @@ import java.util.*;
          * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation
          * and stratification modules.
          *
        - * 

        Input

        + *

        Input

        *

        * One or more variant sets to evaluate plus any number of comparison sets. *

        * - *

        Output

        + *

        Output

        *

        * Evaluation tables detailing the results of the eval modules which were applied. * For example: @@ -103,7 +103,7 @@ import java.util.*; *

        *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
        index e5fe46a07..436a973df 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
        @@ -68,7 +68,7 @@ import java.util.*;
          * can be exacted using JEXL expressions on the set attribute using SelectVariants.  If you want to extract just
          * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single
          * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
        - * in the detailed example on the wiki.
        + * in the detailed example in the documentation guide.
          *
          * Note that CombineVariants supports multi-threaded parallelism (8/15/12).  This is particularly useful
          * when converting from VCF to BCF2, which can be expensive.  In this case each thread spends CPU time
        @@ -83,17 +83,17 @@ import java.util.*;
          *      max QUAL, which resulted in sometime strange downstream confusion
      2. * * - *

        Input

        + *

        Input

        *

        * One or more variant sets to combine. *

        * - *

        Output

        + *

        Output

        *

        * A combined VCF. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java
        index 65ec7a4f0..e6d3e6e94 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java
        @@ -60,17 +60,17 @@ import java.util.*;
          * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
          * Note that this tool cannot handle anything other than bi-allelic, simple indels.  Complex events are written out unchanged.
          *
        - * 

        Input

        + *

        Input

        *

        * A variant set to left-align. *

        * - *

        Output

        + *

        Output

        *

        * A left-aligned VCF. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java
        index 17aaa7513..9bbf728e1 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java
        @@ -58,17 +58,17 @@ import java.util.*;
          * SelectHeaders can be used for this purpose. Given a single VCF file, one or more headers can be extracted from the
          * file (based on a complete header name or a pattern match).
          * 

        - *

        Input

        + *

        Input

        *

        * A set of VCFs. *

        *

        - *

        Output

        + *

        Output

        *

        * A header selected VCF. *

        *

        - *

        Examples

        + *

        Examples

        *
          * Select only the FILTER, FORMAT, and INFO headers:
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
        index 9c209ae2c..f72ce3bd6 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
        @@ -62,20 +62,20 @@ import java.util.*;
          * Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a
          * pattern match).  Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of
          * coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25).  These JEXL expressions are
        - * documented in the Using JEXL expressions section (http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions).
        + * documented in the Using JEXL expressions section (http://www.broadinstitute.org/gatk/guide/article?id=1255).
          * One can optionally include concordance or discordance tracks for use in selecting overlapping variants.
          *
        - * 

        Input

        + *

        Input

        *

        * A variant set to select from. *

        * - *

        Output

        + *

        Output

        *

        * A selected VCF. *

        * - *

        Examples

        + *

        Examples

        *
          * Select two samples out of a VCF with many samples:
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
        index a242f9310..d11cf5aee 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java
        @@ -60,12 +60,12 @@ import java.util.Set;
          *
          * If you are looking simply to test the adherence to the VCF specification, use --validationType NONE.
          *
        - * 

        Input

        + *

        Input

        *

        * A variant set to validate. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
        index 02089eb6c..0e2a04bf2 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
        @@ -55,12 +55,12 @@ import java.util.*;
          * default is soft-filtered by high no-call rate or low Hardy-Weinberg probability.
          * If you have .ped files, please first convert them to VCF format.
          *
        - * 

        Input

        + *

        Input

        *

        * A validation VCF to annotate. *

        * - *

        Output

        + *

        Output

        *

        * An annotated VCF. Additionally, a table like the following will be output: *

        @@ -74,7 +74,7 @@ import java.util.*;
          * 
        *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
        index b12f51a1e..444eb745c 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
        @@ -62,14 +62,13 @@ import java.util.*;
          * genotypes), NO-CALL (count of no-call genotypes), TYPE (the type of event), VAR (count of
          * non-reference genotypes), NSAMPLES (number of samples), NCALLED (number of called samples),
          * GQ (from the genotype field; works only for a file with a single sample), and MULTI-ALLELIC
        - * (is the record from a multi-allelic site).  Note that this tool does not support capturing any
        - * GENOTYPE field values.  If a VCF record is missing a value, then the tool by
        + * (is the record from a multi-allelic site).  Note that if a VCF record is missing a value, then the tool by
          * default throws an error, but the special value NA can be emitted instead with
          * appropriate tool arguments.
          *
          * 

        * - *

        Input

        + *

        Input

        *

        *

          *
        • A VCF file
        • @@ -77,12 +76,12 @@ import java.util.*; *
        *

        * - *

        Output

        + *

        Output

        *

        * A tab-delimited file containing the values of the requested fields in the VCF file *

        * - *

        Examples

        + *

        Examples

        *
          *     java -jar GenomeAnalysisTK.jar \
          *     -R reference.fasta
        diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
        index ffe61f76d..7c7f52803 100644
        --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
        +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java
        @@ -62,17 +62,17 @@ import java.util.*;
          * 

        * Note that there must be a Tribble feature/codec for the file format as well as an adaptor. * - *

        Input

        + *

        Input

        *

        * A variant file to filter. *

        * - *

        Output

        + *

        Output

        *

        * A VCF file. *

        * - *

        Examples

        + *

        Examples

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        diff --git a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java
        index 10fb606f9..e1dd2c255 100644
        --- a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java
        +++ b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java
        @@ -35,6 +35,9 @@ import org.broadinstitute.sting.commandline.Argument;
         import org.broadinstitute.sting.commandline.Input;
         import org.broadinstitute.sting.commandline.Output;
         import org.broadinstitute.sting.commandline.CommandLineProgram;
        +import org.broadinstitute.sting.gatk.CommandLineGATK;
        +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
        +import org.broadinstitute.sting.utils.help.HelpConstants;
         import org.broadinstitute.variant.bcf2.BCF2Codec;
         import org.broadinstitute.sting.utils.collections.Pair;
         import org.broadinstitute.variant.vcf.VCFCodec;
        @@ -51,12 +54,48 @@ import java.util.*;
         
         /**
          *
        - * Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants    [sorted (optional)]");
        - * The input files can be of type: VCF (ends in .vcf or .VCF)");
        - *                                 BCF2 (ends in .bcf or .BCF)");
        - * Output file must be vcf or bcf file (.vcf or .bcf)");
        - * If the input files are already sorted, the last argument can indicate that");
        + * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples.
        + *
        + * 

        + * The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. + * This tool concatenates the scattered output VCF files. It assumes that: + * - All the input VCFs (or BCFs) contain the same samples in the same order. + * - The variants in each input file are from non-overlapping (scattered) intervals. + * + * When the input files are already sorted based on the intervals start positions, use -assumeSorted. + * + * Note: Currently the tool is more efficient when working with VCFs; we will work to make it as efficient for BCFs. + * + *

        + * + *

        Input

        + *

        + * One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order). + * The input files should be 'name.vcf' or 'name.VCF' or 'name.bcf' or 'name.BCF'. + * If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag. + *

        + * + *

        Output

        + *

        + * A combined VCF. The output file should be 'name.vcf' or 'name.VCF'. + * </p> + * + * + *

        Examples

        + *
        + * java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants \
        + *    -R ref.fasta \
        + *    -V input1.vcf \
        + *    -V input2.vcf \
        + *    -out output.vcf \
        + *    -assumeSorted
        + * 
        + * + * @author Ami Levy Moonshine + * @since Jan 2012 */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class CatVariants extends CommandLineProgram { // setup the logging system, used by some codecs private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); @@ -64,6 +103,14 @@ public class CatVariants extends CommandLineProgram { @Input(fullName = "reference", shortName = "R", doc = "genome reference file .fasta", required = true) private File refFile = null; + /** + * The VCF or BCF files to merge together + * + * CatVariants can take any number of -V arguments on the command line. Each -V argument + * will be included in the final merged output VCF. The order of arguments does not matter, but it runs more + * efficiently if they are sorted based on the intervals and the assumeSorted argument is used. + * + */ @Input(fullName="variant", shortName="V", doc="Input VCF file/s named .vcf or .bcf", required = true) private List variant = null; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index fb26f6c37..82ee76a81 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -45,8 +45,8 @@ import java.util.ArrayList; *

        * *

        - * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here - * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq + * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the documentation guide here + * http://www.broadinstitute.org/gatk/guide/article?id=1329 *

        *

        Usage

        * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 8a8c76806..e20d285e1 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -171,7 +171,7 @@ class GATKResourcesBundle extends QScript { "CEUTrio.HiSeq.WGS.b37.bestPractices.phased",b37,true,false)) // - // example call set for wiki tutorial + // example call set for documentation guide tutorial // addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf", "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala index 344f5fe5b..529615c24 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/snpeff/SnpEff.scala @@ -31,7 +31,7 @@ import org.broadinstitute.sting.commandline.{Argument, Output, Input} /** * Basic snpEff support. - * See: http://www.broadinstitute.org/gsa/wiki/index.php/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator + * See: http://www.broadinstitute.org/gatk/guide/article?id=50 */ class SnpEff extends JavaCommandLineFunction { javaMainClass = "ca.mcgill.mcb.pcingola.snpEffect.commandLine.SnpEff" diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html index 587828d1e..b05ad65c0 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -130,7 +130,7 @@
        -

        Introduction

        +

        Overview

        ${description} <#-- Create references to additional capabilities if appropriate --> From cdb1fa110547a23182005c98787d9bf6c861a526 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 12 Mar 2013 13:41:29 -0400 Subject: [PATCH 047/226] Fix more tests that fail when run in parallel on the farm -Allow the default S3 put timeout of 30 seconds for GATKRunReports to be overridden via a constructor argument, and use a timeout of 300 seconds for tests. The timeout remains 30 seconds in all other cases. -Change integration tests that themselves dispatch farm jobs into pipeline tests. Necessary because some farm nodes are not set up as submit hosts. Pipeline tests are still run directly on gsa4. -Bump up the timeout for the MaxRuntimeIntegrationTest even more (was still occasionally failing on the farm!) --- .../sting/gatk/phonehome/GATKRunReport.java | 43 ++++++++++++++----- .../sting/gatk/MaxRuntimeIntegrationTest.java | 5 ++- ...nTest.java => JnaSessionPipelineTest.java} | 2 +- ...ionTest.java => LibDrmaaPipelineTest.java} | 2 +- ...ationTest.java => LibBatPipelineTest.java} | 2 +- 5 files changed, 39 insertions(+), 15 deletions(-) rename public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/{JnaSessionIntegrationTest.java => JnaSessionPipelineTest.java} (99%) rename public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/{LibDrmaaIntegrationTest.java => LibDrmaaPipelineTest.java} (99%) rename public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/{LibBatIntegrationTest.java => LibBatPipelineTest.java} (99%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 02f2f9f02..de84809bd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -78,17 +78,11 @@ public class GATKRunReport { private static final DateFormat DATE_FORMAT = 
new SimpleDateFormat("yyyy/MM/dd HH.mm.ss"); - /** - * number of milliseconds before the S3 put operation is timed-out: - */ - private static final long S3_PUT_TIME_OUT = 30 * 1000; - /** * The root file system directory where we keep common report data */ private final static File REPORT_DIR = new File("/humgen/gsa-hpprojects/GATK/reports"); - /** * The full path to the direct where submitted (and uncharacterized) report files are written */ @@ -105,6 +99,17 @@ public class GATKRunReport { */ protected static final Logger logger = Logger.getLogger(GATKRunReport.class); + /** + * Default value for the number of milliseconds before an S3 put operation is timed-out. + * Can be overridden via a constructor argument. + */ + private static final long S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS = 30 * 1000; + + /** + * Number of milliseconds before an S3 put operation is timed-out. + */ + private long s3PutTimeOutInMilliseconds = S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS; + // ----------------------------------------------------------------- // elements captured for the report // ----------------------------------------------------------------- @@ -230,13 +235,31 @@ public class GATKRunReport { } /** - * Create a new RunReport and population all of the fields with values from the walker and engine + * Create a new RunReport and population all of the fields with values from the walker and engine. + * Allows the S3 put timeout to be explicitly set. 
* * @param walker the GATK walker that we ran * @param e the exception caused by running this walker, or null if we completed successfully * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc + * @param type the GATK phone home setting + * @param s3PutTimeOutInMilliseconds number of milliseconds to wait before timing out an S3 put operation */ - public GATKRunReport(Walker walker, Exception e, GenomeAnalysisEngine engine, PhoneHomeOption type) { + public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type, + final long s3PutTimeOutInMilliseconds) { + this(walker, e, engine, type); + this.s3PutTimeOutInMilliseconds = s3PutTimeOutInMilliseconds; + } + + /** + * Create a new RunReport and population all of the fields with values from the walker and engine. + * Leaves the S3 put timeout set to the default value of S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS. + * + * @param walker the GATK walker that we ran + * @param e the exception caused by running this walker, or null if we completed successfully + * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc + * @param type the GATK phone home setting + */ + public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type) { if ( type == PhoneHomeOption.NO_ET ) throw new ReviewedStingException("Trying to create a run report when type is NO_ET!"); @@ -563,7 +586,7 @@ public class GATKRunReport { throw new IllegalStateException("We are throwing an exception for testing purposes"); case TIMEOUT: try { - Thread.sleep(S3_PUT_TIME_OUT * 100); + Thread.sleep(s3PutTimeOutInMilliseconds * 100); } catch ( InterruptedException e ) { // supposed to be empty } @@ -625,7 +648,7 @@ public class GATKRunReport { s3thread.setName("S3Put-Thread"); s3thread.start(); - s3thread.join(S3_PUT_TIME_OUT); + s3thread.join(s3PutTimeOutInMilliseconds); 
if(s3thread.isAlive()){ s3thread.interrupt(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java index 25ee9ff09..9df768e70 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java @@ -39,7 +39,8 @@ import java.util.concurrent.TimeUnit; * */ public class MaxRuntimeIntegrationTest extends WalkerTest { - private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(120, TimeUnit.SECONDS); + // Assume a ridiculous amount of startup overhead to allow for running these tests on slow farm nodes + private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(300, TimeUnit.SECONDS); private class MaxRuntimeTestProvider extends TestDataProvider { final long maxRuntime; @@ -68,7 +69,7 @@ public class MaxRuntimeIntegrationTest extends WalkerTest { // // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type // - @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 300 * 1000) + @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 600 * 1000) public void testMaxRuntime(final MaxRuntimeTestProvider cfg) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T PrintReads -R " + hg18Reference diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java index 677f87cac..d2da0e228 100644 --- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionPipelineTest.java @@ -34,7 +34,7 @@ import org.testng.annotations.Test; import java.io.File; import java.util.*; -public class JnaSessionIntegrationTest extends BaseTest { +public class JnaSessionPipelineTest extends BaseTest { private String implementation = null; private static final SessionFactory factory = new JnaSessionFactory(); diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java index 038bfd85d..efeeb3640 100644 --- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaPipelineTest.java @@ -40,7 +40,7 @@ import java.io.File; import java.util.Arrays; import java.util.List; -public class LibDrmaaIntegrationTest extends BaseTest { +public class LibDrmaaPipelineTest extends BaseTest { private String implementation = null; @Test diff --git a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java index 4898f17c3..af8d0e7b1 100644 --- a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatPipelineTest.java @@ -40,7 +40,7 @@ import java.io.File; /** * Really unit tests, but these test will only run on systems with LSF setup. 
*/ -public class LibBatIntegrationTest extends BaseTest { +public class LibBatPipelineTest extends BaseTest { @BeforeClass public void initLibBat() { Assert.assertFalse(LibBat.lsb_init("LibBatIntegrationTest") < 0, LibBat.lsb_sperror("lsb_init() failed")); From 8ed78b453f1a2c3e07a9efc703df057c0fa27c0c Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 12 Mar 2013 23:53:26 -0400 Subject: [PATCH 050/226] Increase timeout for a test in the EngineFeaturesIntegrationTest -This test was intermittently failing when run on the farm --- .../sting/gatk/EngineFeaturesIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java index 8d0874ea1..2a9bbeb09 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -117,7 +117,7 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { // // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type // - @Test(enabled = true, dataProvider = "EngineErrorHandlingTestProvider", timeOut = 60 * 1000 ) + @Test(enabled = true, dataProvider = "EngineErrorHandlingTestProvider", timeOut = 300 * 1000 ) public void testEngineErrorHandlingTestProvider(final EngineErrorHandlingTestProvider cfg) { for ( int i = 0; i < cfg.iterationsToTest; i++ ) { final String root = "-T ErrorThrowing -R " + exampleFASTA; From 925846c65f9a64ac16daecc2e2f33b901fe1cd8d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 10 Feb 2013 19:21:26 -0800 Subject: [PATCH 051/226] Cleanup of FragmentUtils -- Code was undocumented, big, and not well tested. All three things fixed. -- Currently not passing, but the framework works well for testing -- Added concat(byte[] ... 
arrays) to utils --- .../org/broadinstitute/sting/utils/Utils.java | 18 +++ .../sting/utils/fragments/FragmentUtils.java | 109 +++++++++++++----- .../sting/utils/UtilsUnitTest.java | 13 +++ .../fragments/FragmentUtilsUnitTest.java | 87 +++++++++++++- 4 files changed, 196 insertions(+), 31 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 45a2fa58d..ff64133a7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -415,6 +415,24 @@ public class Utils { return C; } + /** + * Concatenates byte arrays + * @return a concat of all bytes in allBytes in order + */ + public static byte[] concat(final byte[] ... allBytes) { + int size = 0; + for ( final byte[] bytes : allBytes ) size += bytes.length; + + final byte[] c = new byte[size]; + int offset = 0; + for ( final byte[] bytes : allBytes ) { + System.arraycopy(bytes, 0, c, offset, bytes.length); + offset += bytes.length; + } + + return c; + } + /** * Appends String(s) B to array A. * @param A First array. 
diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 76ccede62..fa0187728 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.fragments; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; @@ -56,7 +58,8 @@ import java.util.*; * Date: 3/26/11 * Time: 10:09 PM */ -public class FragmentUtils { +public final class FragmentUtils { + protected final static byte MIN_QUAL_BAD_OVERLAP = 16; private FragmentUtils() {} // private constructor /** @@ -65,18 +68,28 @@ public class FragmentUtils { * Allows us to write a generic T -> Fragment algorithm that works with any object containing * a read. 
* - * @param + * @param The type of the object that contains a GATKSAMRecord */ public interface ReadGetter { + /** + * Get the GATKSAMRecord associated with object + * + * @param object the thing that contains the read + * @return a non-null GATKSAMRecord read + */ public GATKSAMRecord get(T object); } - /** Identify getter for SAMRecords themselves */ + /** + * Identify getter for SAMRecords themselves + */ private final static ReadGetter SamRecordGetter = new ReadGetter() { @Override public GATKSAMRecord get(final GATKSAMRecord object) { return object; } }; - /** Gets the SAMRecord in a PileupElement */ + /** + * Gets the SAMRecord in a PileupElement + */ private final static ReadGetter PileupElementGetter = new ReadGetter() { @Override public GATKSAMRecord get(final PileupElement object) { return object.getRead(); } }; @@ -87,13 +100,20 @@ public class FragmentUtils { * and returns a FragmentCollection that contains the T objects whose underlying reads either overlap (or * not) with their mate pairs. 
* - * @param readContainingObjects - * @param nElements - * @param getter + * @param readContainingObjects An iterator of objects that contain GATKSAMRecords + * @param nElements the number of elements to be provided by the iterator, which is usually known upfront and + * greatly improves the efficiency of the fragment calculation + * @param getter a helper function that takes an object of type T and returns is associated GATKSAMRecord * @param - * @return + * @return a fragment collection */ - private final static FragmentCollection create(Iterable readContainingObjects, int nElements, ReadGetter getter) { + @Requires({ + "readContainingObjects != null", + "nElements >= 0", + "getter != null" + }) + @Ensures("result != null") + private static FragmentCollection create(final Iterable readContainingObjects, final int nElements, final ReadGetter getter) { Collection singletons = null; Collection> overlapping = null; Map nameMap = null; @@ -145,30 +165,69 @@ public class FragmentUtils { return new FragmentCollection(singletons, overlapping); } - public final static FragmentCollection create(ReadBackedPileup rbp) { + /** + * Create a FragmentCollection containing PileupElements from the ReadBackedPileup rbp + * @param rbp a non-null read-backed pileup. 
The elements in this ReadBackedPileup must be ordered + * @return a non-null FragmentCollection + */ + @Ensures("result != null") + public static FragmentCollection create(final ReadBackedPileup rbp) { + if ( rbp == null ) throw new IllegalArgumentException("Pileup cannot be null"); return create(rbp, rbp.getNumberOfElements(), PileupElementGetter); } - public final static FragmentCollection create(List reads) { + /** + * Create a FragmentCollection containing GATKSAMRecords from a list of reads + * + * @param reads a non-null list of reads, ordered by their start location + * @return a non-null FragmentCollection + */ + @Ensures("result != null") + public static FragmentCollection create(final List reads) { + if ( reads == null ) throw new IllegalArgumentException("Pileup cannot be null"); return create(reads, reads.size(), SamRecordGetter); } - public final static List mergeOverlappingPairedFragments( final List overlappingPair ) { - final byte MIN_QUAL_BAD_OVERLAP = 16; + public static List mergeOverlappingPairedFragments( final List overlappingPair ) { if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } - GATKSAMRecord firstRead = overlappingPair.get(0); - GATKSAMRecord secondRead = overlappingPair.get(1); + final GATKSAMRecord firstRead = overlappingPair.get(0); + final GATKSAMRecord secondRead = overlappingPair.get(1); + + final GATKSAMRecord merged; + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { + merged = mergeOverlappingPairedFragments(secondRead, firstRead); + } else { + merged = mergeOverlappingPairedFragments(firstRead, secondRead); + } + + return merged == null ? 
overlappingPair : Collections.singletonList(merged); + } + + /** + * Merge two overlapping reads from the same fragment into a single super read, if possible + * + * firstRead and secondRead must be part of the same fragment (though this isn't checked). Looks + * at the bases and alignment, and tries its best to create a meaningful synthetic single super read + * that represents the entire sequenced fragment. + * + * Assumes that firstRead starts before secondRead (according to their soft clipped starts) + * + * @param firstRead the left most read + * @param firstRead the right most read + * + * @return a strandless merged read of first and second, or null if the algorithm cannot create a meaningful one + */ + public static GATKSAMRecord mergeOverlappingPairedFragments(final GATKSAMRecord firstRead, final GATKSAMRecord secondRead) { + if ( firstRead == null ) throw new IllegalArgumentException("firstRead cannot be null"); + if ( secondRead == null ) throw new IllegalArgumentException("secondRead cannot be null"); + if ( ! 
firstRead.getReadName().equals(secondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + firstRead + " and " + secondRead); if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { - firstRead = overlappingPair.get(1); // swap them - secondRead = overlappingPair.get(0); - } - if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { - return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A + return null; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A } if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { - return overlappingPair; // fragments contain indels so don't merge them + return null; // fragments contain indels so don't merge them } final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); @@ -190,10 +249,10 @@ public class FragmentUtils { } for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { - return overlappingPair; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + return null; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them } if( firstReadQuals[iii] < MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] < MIN_QUAL_BAD_OVERLAP ) { - return overlappingPair; // both reads 
have low qual bases in the overlap region so don't merge them because don't know what is going on + return null; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on } bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] ); quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] ); @@ -237,8 +296,6 @@ public class FragmentUtils { returnRead.setBaseQualities( deletionQuals, EventType.BASE_DELETION ); } - final ArrayList returnList = new ArrayList(); - returnList.add(returnRead); - return returnList; + return returnRead; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java index 705db6f85..154b000ce 100644 --- a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java @@ -112,6 +112,19 @@ public class UtilsUnitTest extends BaseTest { Assert.assertTrue("one-1;two-2;three-1;four-2;five-1;six-2".equals(joined)); } + @Test + public void testConcat() { + final String s1 = "A"; + final String s2 = "CC"; + final String s3 = "TTT"; + final String s4 = "GGGG"; + Assert.assertEquals(new String(Utils.concat()), ""); + Assert.assertEquals(new String(Utils.concat(s1.getBytes())), s1); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes())), s1 + s2); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes())), s1 + s2 + s3); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes(), s4.getBytes())), s1 + s2 + s3 + s4); + } + @Test public void testEscapeExpressions() { String[] expected, actual; diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java index 15d69c400..89d192f9e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -27,23 +27,30 @@ package org.broadinstitute.sting.utils.fragments; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; /** * Test routines for read-backed pileup. 
*/ public class FragmentUtilsUnitTest extends BaseTest { private static SAMFileHeader header; + private static GATKSAMReadGroupRecord rgForMerged; + private final static boolean DEBUG = false; private class FragmentUtilsTest extends TestDataProvider { List statesForPileup = new ArrayList(); @@ -119,7 +126,7 @@ public class FragmentUtilsUnitTest extends BaseTest { return FragmentUtilsTest.getTests(FragmentUtilsTest.class); } - @Test(enabled = true, dataProvider = "fragmentUtilsTest") + @Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest") public void testAsPileup(FragmentUtilsTest test) { for ( TestState testState : test.statesForPileup ) { ReadBackedPileup rbp = testState.pileup; @@ -129,7 +136,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } } - @Test(enabled = true, dataProvider = "fragmentUtilsTest") + @Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest") public void testAsListOfReadsFromPileup(FragmentUtilsTest test) { for ( TestState testState : test.statesForPileup ) { FragmentCollection fp = FragmentUtils.create(testState.pileup.getReads()); @@ -138,7 +145,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } } - @Test(enabled = true, dataProvider = "fragmentUtilsTest") + @Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest") public void testAsListOfReads(FragmentUtilsTest test) { for ( TestState testState : test.statesForReads ) { FragmentCollection fp = FragmentUtils.create(testState.rawReads); @@ -147,7 +154,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } } - @Test(enabled = true, expectedExceptions = IllegalArgumentException.class) + @Test(enabled = !DEBUG, expectedExceptions = IllegalArgumentException.class) public void testOutOfOrder() { final List pair = ArtificialSAMUtils.createPair(header, "readpair", 100, 1, 50, true, true); final GATKSAMRecord left = pair.get(0); @@ -161,5 +168,75 @@ public class FragmentUtilsUnitTest extends BaseTest { @BeforeTest public void setup() { header = 
ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); + rgForMerged = new GATKSAMReadGroupRecord("RG1"); + } + + @DataProvider(name = "MergeFragmentsTest") + public Object[][] createMergeFragmentsTest() throws Exception { + List tests = new ArrayList(); + + final String leftFlank = "CCC"; + final String rightFlank = "AAA"; + final String allOverlappingBases = "ACGTACGTGGAACCTTAG"; + for ( int overlapSize = 1; overlapSize < allOverlappingBases.length(); overlapSize++ ) { + final String overlappingBases = allOverlappingBases.substring(0, overlapSize); + final byte[] overlappingBaseQuals = new byte[overlapSize]; + for ( int i = 0; i < overlapSize; i++ ) overlappingBaseQuals[i] = (byte)(i + 30); + final GATKSAMRecord read1 = makeOverlappingRead(leftFlank, 20, overlappingBases, overlappingBaseQuals, "", 30, 1); + final GATKSAMRecord read2 = makeOverlappingRead("", 20, overlappingBases, overlappingBaseQuals, rightFlank, 30, leftFlank.length() + 1); + final GATKSAMRecord merged = makeOverlappingRead(leftFlank, 20, overlappingBases, overlappingBaseQuals, rightFlank, 30, 1); + tests.add(new Object[]{"equalQuals", read1, read2, merged}); + + // test that the merged read base quality is the + tests.add(new Object[]{"lowQualLeft", modifyBaseQualities(read1, leftFlank.length(), overlapSize), read2, merged}); + tests.add(new Object[]{"lowQualRight", read1, modifyBaseQualities(read2, 0, overlapSize), merged}); + } + + return tests.toArray(new Object[][]{}); + } + + private GATKSAMRecord modifyBaseQualities(final GATKSAMRecord read, final int startOffset, final int length) throws Exception { + final GATKSAMRecord readWithLowQuals = (GATKSAMRecord)read.clone(); + final byte[] withLowQuals = Arrays.copyOf(read.getBaseQualities(), read.getBaseQualities().length); + for ( int i = startOffset; i < startOffset + length; i++ ) + withLowQuals[i] = (byte)(read.getBaseQualities()[i] + (i % 2 == 0 ? 
-1 : 0)); + readWithLowQuals.setBaseQualities(withLowQuals); + return readWithLowQuals; + } + + private GATKSAMRecord makeOverlappingRead(final String leftFlank, final int leftQual, final String overlapBases, + final byte[] overlapQuals, final String rightFlank, final int rightQual, + final int alignmentStart) { + final String bases = leftFlank + overlapBases + rightFlank; + final int readLength = bases.length(); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, alignmentStart, readLength); + final byte[] leftQuals = Utils.dupBytes((byte) leftQual, leftFlank.length()); + final byte[] rightQuals = Utils.dupBytes((byte) rightQual, rightFlank.length()); + final byte[] quals = Utils.concat(leftQuals, overlapQuals, rightQuals); + read.setCigarString(readLength + "M"); + read.setReadBases(bases.getBytes()); + for ( final EventType type : EventType.values() ) + read.setBaseQualities(quals, type); + read.setReadGroup(rgForMerged); + read.setMappingQuality(60); + return read; + } + + @Test(enabled = true, dataProvider = "MergeFragmentsTest") + public void testMergingTwoReads(final String name, final GATKSAMRecord read1, GATKSAMRecord read2, final GATKSAMRecord expectedMerged) { + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + + if ( expectedMerged == null ) { + Assert.assertNull(actual, "Expected reads not to merge, but got non-null result from merging"); + } else { + Assert.assertNotNull(actual, "Expected reads to merge, but got null result from merging"); + // I really care about the bases, the quals, the CIGAR, and the read group tag + Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString()); + Assert.assertEquals(actual.getReadBases(), expectedMerged.getReadBases()); + Assert.assertEquals(actual.getReadGroup(), expectedMerged.getReadGroup()); + Assert.assertEquals(actual.getMappingQuality(), expectedMerged.getMappingQuality()); + for ( final EventType type : 
EventType.values() ) + Assert.assertEquals(actual.getBaseQualities(type), expectedMerged.getBaseQualities(type), "Failed base qualities for event type " + type); + } } } From b5b63eaac708ecc1c3b08725d1c66611d58a9be1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 11 Mar 2013 14:54:20 -0400 Subject: [PATCH 052/226] New GATKSAMRecord concept of a strandless read, update to FS -- Strandless GATK reads are ones where they don't really have a meaningful strand value, such as Reduced Reads or fragment merged reads. Added GATKSAMRecord support for such reads, along with unit tests -- The merge overlapping fragments code in FragmentUtils now produces strandless merged fragments -- FisherStrand annotation generalized to treat strandless as providing 1/2 the representative count for both strands. This means that that merged fragments are properly handled from the HC, so we don't hallucinate fake strand-bias just because we managed to merge a lot of reads together. -- The previous getReducedCount() wouldn't work if a read was made into a reduced read after getReducedCount() had been called. Added new GATKSAMRecord method setReducedCounts() that does the right thing. Updated SlidingWindow and SyntheticRead to explicitly call this function, and so the readTag parameter is now gone. -- Update MD5s for change to FS calculation. 
Differences are just minor updates to the FS --- .../gatk/walkers/annotator/FisherStrand.java | 30 +++++++---- .../reducereads/SlidingWindow.java | 6 +-- .../reducereads/SyntheticRead.java | 12 ++--- .../reducereads/SyntheticReadUnitTest.java | 2 +- ...lexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 12 ++--- .../sting/utils/fragments/FragmentUtils.java | 1 + .../sting/utils/sam/GATKSAMRecord.java | 50 +++++++++++++++++++ .../fragments/FragmentUtilsUnitTest.java | 1 + .../utils/sam/GATKSAMRecordUnitTest.java | 31 +++++++++++- 10 files changed, 117 insertions(+), 30 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 14c785678..39fdcb707 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import cern.jet.math.Arithmetic; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -74,6 +75,8 @@ import java.util.*; * calculated for certain complex indel cases or for multi-allelic sites. 
*/ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + private final static Logger logger = Logger.getLogger(FisherStrand.class); + private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; @@ -95,6 +98,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat else if (stratifiedPerReadAlleleLikelihoodMap != null) { // either SNP with no alignment context, or indels: per-read likelihood map needed final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); +// logger.info("VC " + vc); +// printTable(table, 0.0); return pValueForBestTable(table, null); } else @@ -131,9 +136,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat private Map annotationForOneTable(final double pValue) { final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs return Collections.singletonMap(FS, value); -// Map map = new HashMap(); -// map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue))); -// return map; } public List getKeyNames() { @@ -192,7 +194,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat private static void printTable(int[][] table, double pValue) { - System.out.printf("%d %d; %d %d : %f\n", table[0][0], table[0][1], table[1][0], table[1][1], pValue); + logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue)); } private static boolean rotateTable(int[][] table) { @@ -315,13 +317,21 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat final boolean matchesAlt = allele.equals(alt, true); if ( matchesRef || matchesAlt ) { + final int row = matchesRef ? 
0 : 1; - final boolean isFW = !read.getReadNegativeStrandFlag(); - - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; - - table[row][column] += representativeCount; + if ( read.isStrandless() ) { + // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 + // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even + // if the read is only seen once, because it's a merged read or other) + final int toAdd = Math.max(representativeCount / 2, 1); + table[row][0] += toAdd; + table[row][1] += toAdd; + } else { + // a normal read with an actual strand + final boolean isFW = !read.getReadNegativeStrandFlag(); + final int column = isFW ? 0 : 1; + table[row][column] += representativeCount; + } } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 6c063110e..11e023b9b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -567,7 +567,7 @@ public class SlidingWindow { ObjectArrayList result = new ObjectArrayList(); if (filteredDataConsensus == null) - filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand); + filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand); ListIterator headerElementIterator = header.listIterator(start); for (int index = start; index < end; index++) { @@ -583,7 +583,7 @@ 
public class SlidingWindow { if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) { result.add(finalizeFilteredDataConsensus()); - filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand); + filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, isNegativeStrand); } genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS()); @@ -606,7 +606,7 @@ public class SlidingWindow { @Requires({"start >= 0 && (end >= start || end == 0)"}) private void addToRunningConsensus(LinkedList header, int start, int end, boolean isNegativeStrand) { if (runningConsensus == null) - runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand); + runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand); Iterator headerElementIterator = header.listIterator(start); for (int index = start; index < end; index++) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java index 72fd52ebe..451e50286 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -124,8 +124,7 @@ public class SyntheticRead { private final ObjectArrayList basesCountsQuals; - private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus - private String readTag; + private double mappingQuality; // Information to produce a GATKSAMRecord private SAMFileHeader header; @@ -147,14 +146,12 @@ public class SyntheticRead { * @param contigIndex the read's contig index * @param readName the read's name * @param refStart the alignment start (reference based) - * @param readTag the reduce reads tag for the synthetic read */ - public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { final int initialCapacity = 10000; basesCountsQuals = new ObjectArrayList(initialCapacity); mappingQuality = 0.0; - this.readTag = readTag; this.header = header; this.readGroupRecord = readGroupRecord; this.contig = contig; @@ -165,13 +162,12 @@ public class SyntheticRead { this.isNegativeStrand = isNegativeRead; } - public SyntheticRead(ObjectArrayList bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(ObjectArrayList bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, 
SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { basesCountsQuals = new ObjectArrayList(bases.size()); for (int i = 0; i < bases.size(); ++i) { basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i))); } this.mappingQuality = mappingQuality; - this.readTag = readTag; this.header = header; this.readGroupRecord = readGroupRecord; this.contig = contig; @@ -228,7 +224,7 @@ public class SyntheticRead { read.setReadBases(convertReadBases()); read.setMappingQuality((int) Math.ceil(mappingQuality / basesCountsQuals.size())); read.setReadGroup(readGroupRecord); - read.setAttribute(readTag, convertBaseCounts()); + read.setReducedReadCounts(convertBaseCounts()); if (hasIndelQualities) { read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java index 1ed28dec2..570b797ca 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java @@ -77,7 +77,7 @@ public void testBaseCounts() { new TestRead(bases, quals, new byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})}; for (TestRead testRead : testReads) { - SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, 
artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false); + SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false); Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts()); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 2e3e45247..fcf9168b3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2232995ca9bec143e664748845a0045"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "b83b53741edb07218045d6f25f20a18b"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index bf2ddea12..8ed589c63 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "8f33e40686443b9a72de45d5a9da1861"); + HCTest(CEUTRIO_BAM, "", "4a2880f0753e6e813b9e0c35209b3708"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "8f2b047cdace0ef122d6ad162e7bc5b9"); + HCTest(NA12878_BAM, "", "588892934f2e81247bf32e457db88449"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "9d4be26a2c956ba4b7b4044820eab030"); + "fa1b92373c89d2238542a319ad25c257"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("03557376242bdf78c5237703b762573b")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("9296f1af6cf1f1cc4b79494eb366e976")); executeTest("HCTestStructuralIndels: ", spec); } @@ -134,7 +134,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + 
b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("adb08cb25e902cfe0129404a682b2169")); + Arrays.asList("cf0a1bfded656153578df6cf68aa68a2")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -142,7 +142,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("a43c595a617589388ff3d7e2ddc661e7")); + Arrays.asList("addceb63f5bfa9f11e15335d5bf641e9")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index fa0187728..99f1d99c7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -263,6 +263,7 @@ public final class FragmentUtils { } final GATKSAMRecord returnRead = new GATKSAMRecord( firstRead.getHeader() ); + returnRead.setIsStrandless(true); returnRead.setAlignmentStart( firstRead.getSoftStart() ); returnRead.setReadBases( bases ); returnRead.setBaseQualities( quals ); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 01a8c1996..c5f9f606b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -74,6 +74,8 @@ public class GATKSAMRecord extends BAMRecord { private int softEnd = 
UNINITIALIZED; private Integer adapterBoundary = null; + private boolean isStrandlessRead = false; + // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; private boolean retrievedReduceReadCounts = false; @@ -141,6 +143,45 @@ public class GATKSAMRecord extends BAMRecord { return ArtificialSAMUtils.createArtificialRead(cigar); } + /////////////////////////////////////////////////////////////////////////////// + // *** support for reads without meaningful strand information ***// + /////////////////////////////////////////////////////////////////////////////// + + /** + * Does this read have a meaningful strandedness value? + * + * Some advanced types of reads, such as reads coming from merged fragments, + * don't have meaningful strandedness values, as they are composites of multiple + * other reads. Strandless reads need to be handled specially by code that cares about + * stranded information, such as FS. + * + * @return true if this read doesn't have meaningful strand information + */ + public boolean isStrandless() { + return isStrandlessRead; + } + + /** + * Set the strandless state of this read to isStrandless + * @param isStrandless true if this read doesn't have a meaningful strandedness value + */ + public void setIsStrandless(final boolean isStrandless) { + this.isStrandlessRead = isStrandless; + } + + @Override + public boolean getReadNegativeStrandFlag() { + return ! 
isStrandless() && super.getReadNegativeStrandFlag(); + } + + @Override + public void setReadNegativeStrandFlag(boolean flag) { + if ( isStrandless() ) + throw new IllegalStateException("Cannot set the strand of a strandless read"); + super.setReadNegativeStrandFlag(flag); + } + + /////////////////////////////////////////////////////////////////////////////// // *** The following methods are overloaded to cache the appropriate data ***// /////////////////////////////////////////////////////////////////////////////// @@ -313,6 +354,15 @@ public class GATKSAMRecord extends BAMRecord { return getReducedReadCounts() != null; } + /** + * Set the reduced read counts for this record to counts + * @param counts the count array + */ + public void setReducedReadCounts(final byte[] counts) { + retrievedReduceReadCounts = false; + setAttribute(REDUCED_READ_CONSENSUS_TAG, counts); + } + /** * The number of bases corresponding the i'th base of the reduced read. * diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java index 89d192f9e..4f49eb933 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -229,6 +229,7 @@ public class FragmentUtilsUnitTest extends BaseTest { if ( expectedMerged == null ) { Assert.assertNull(actual, "Expected reads not to merge, but got non-null result from merging"); } else { + Assert.assertTrue(actual.isStrandless(), "Merged reads should be strandless"); Assert.assertNotNull(actual, "Expected reads to merge, but got null result from merging"); // I really care about the bases, the quals, the CIGAR, and the read group tag Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString()); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index baf4bfbb0..38840fab1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -64,6 +64,7 @@ public class GATKSAMRecordUnitTest extends BaseTest { for (int i = 0; i < reducedRead.getReadLength(); i++) { Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); } + Assert.assertEquals(reducedRead.isStrandless(), false, "Reduced reads don't have meaningful strandedness information"); } @Test @@ -103,7 +104,35 @@ public class GATKSAMRecordUnitTest extends BaseTest { read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, null); Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); - } + @Test + public void testStrandlessReads() { + final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); + Assert.assertEquals(read.isStrandless(), false); + + read.setReadNegativeStrandFlag(false); + Assert.assertEquals(read.isStrandless(), false); + Assert.assertEquals(read.getReadNegativeStrandFlag(), false); + + read.setReadNegativeStrandFlag(true); + Assert.assertEquals(read.isStrandless(), false); + Assert.assertEquals(read.getReadNegativeStrandFlag(), true); + + read.setReadNegativeStrandFlag(true); + read.setIsStrandless(true); + Assert.assertEquals(read.isStrandless(), true); + Assert.assertEquals(read.getReadNegativeStrandFlag(), false, "negative strand flag should return false even through its set for a strandless read"); + } + + @Test(expectedExceptions = IllegalStateException.class) + public void 
testStrandlessReadsFailSetStrand() { + final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); + read.setIsStrandless(true); + read.setReadNegativeStrandFlag(true); + } } From ff87b62fe3e711775c4995facd2c31718d029f79 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 12 Mar 2013 13:58:20 -0400 Subject: [PATCH 053/226] Fixed bug in SelectVariants where maxIndelSize argument wasn't getting applied to deletions. Added unit tests and docs. --- .../walkers/variantutils/SelectVariants.java | 20 +++-- .../variantutils/SelectVariantsUnitTest.java | 88 +++++++++++++++++++ 2 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index f72ce3bd6..b64c64d11 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -507,7 +507,7 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; - if ( badIndelSize(vc) ) + if ( containsIndelLargerThan(vc, maxIndelSize) ) continue; VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS); @@ -531,12 +531,20 @@ public class SelectVariants extends RodWalker implements TreeR return 1; } - private boolean badIndelSize(final VariantContext vc) { - List lengths = vc.getIndelLengths(); + /* + * Determines if any of the alternate alleles are greater than the max indel size + * + * @param vc the variant context to check + * @param maxIndelSize the maximum size of allowed indels + * @return true if 
the VC contains an indel larger than maxIndelSize and false otherwise + */ + protected static boolean containsIndelLargerThan(final VariantContext vc, final int maxIndelSize) { + final List lengths = vc.getIndelLengths(); if ( lengths == null ) - return false; // VC does not harbor indel - for ( Integer indelLength : vc.getIndelLengths() ) { - if ( indelLength > maxIndelSize ) + return false; + + for ( Integer indelLength : lengths ) { + if ( Math.abs(indelLength) > maxIndelSize ) return true; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java new file mode 100644 index 000000000..ca60c6cfe --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsUnitTest.java @@ -0,0 +1,88 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class SelectVariantsUnitTest extends BaseTest { + + ////////////////////////////////////////// + // Tests for maxIndelSize functionality // + ////////////////////////////////////////// + + @DataProvider(name = "MaxIndelSize") + public Object[][] MaxIndelSizeTestData() { + + List tests = new ArrayList(); + + for ( final int size : Arrays.asList(1, 3, 10, 100) ) { + for ( final int otherSize : Arrays.asList(0, 1) ) { + for ( final int max : Arrays.asList(0, 1, 5, 50, 100000) ) { + for ( final String op : Arrays.asList("D", "I") ) { + tests.add(new Object[]{size, otherSize, max, op}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MaxIndelSize") + public void maxIndelSizeTest(final int size, final int otherSize, final int max, final String op) { + + final byte[] largerAllele = Utils.dupBytes((byte) 'A', size+1); + final byte[] smallerAllele = Utils.dupBytes((byte) 'A', 1); + + final List alleles = new ArrayList(2); + final Allele ref = Allele.create(op.equals("I") ? smallerAllele : largerAllele, true); + final Allele alt = Allele.create(op.equals("D") ? 
smallerAllele : largerAllele, false); + alleles.add(ref); + alleles.add(alt); + if ( otherSize > 0 && otherSize != size ) { + final Allele otherAlt = Allele.create(op.equals("D") ? Utils.dupBytes((byte) 'A', size-otherSize+1) : Utils.dupBytes((byte) 'A', otherSize+1), false); + alleles.add(otherAlt); + } + + final VariantContext vc = new VariantContextBuilder("test", "1", 10, 10 + ref.length() - 1, alleles).make(); + + boolean hasTooLargeIndel = SelectVariants.containsIndelLargerThan(vc, max); + Assert.assertEquals(hasTooLargeIndel, size > max); + } + +} \ No newline at end of file From 573ed07ad06ec022b4bf9896384cd452162ca116 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 14 Mar 2013 11:06:45 -0400 Subject: [PATCH 056/226] Fixed reported bug in BQSR for RNA seq alignments with Ns. * ClippingOp updated to incorporate Ns in the hard clips. * ReadUtils.getReadCoordinateForReferenceCoordinate() updated to account for Ns. * Added test that covers the BQSR case we saw. * Created GSA-856 (for Mauricio) to add lots of tests to ReadUtils. * It will require refactoring code and not in the scope of what I was willing to do to fix this. 
--- .../sting/utils/clipping/ClippingOp.java | 4 ++-- .../sting/utils/sam/ReadUtils.java | 2 +- .../sting/utils/sam/ReadUtilsUnitTest.java | 22 +++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index fe1a386fb..ad6f05563 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -581,8 +581,8 @@ public class ClippingOp { if (cigarElement.getOperator() == CigarOperator.INSERTION) return -clippedLength; - // Deletions should be added to the total hard clip count - else if (cigarElement.getOperator() == CigarOperator.DELETION) + // Deletions and Ns should be added to the total hard clip count (because we want to maintain the original alignment start) + else if (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) return cigarElement.getLength(); // There is no shift if we are not clipping an indel diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 95e0d55f3..c84e4245d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -524,7 +524,7 @@ public class ReadUtils { // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need // to add the shift of the current cigar element but go back to it's last element to return the last // base before the deletion (see warning in function contracts) - else if (fallsInsideDeletion && !endsWithinCigar) + else if (fallsInsideDeletion && !endsWithinCigar && cigarElement.getOperator().consumesReadBases()) readBases += shift - 1; // If we reached our goal 
inside a deletion then we must backtrack to the last base before the deletion diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index baad67d53..331121c55 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -25,13 +25,19 @@ package org.broadinstitute.sting.utils.sam; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileNotFoundException; import java.util.*; @@ -179,4 +185,20 @@ public class ReadUtilsUnitTest extends BaseTest { final List reads = new LinkedList(); Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); } + + @Test (enabled = true) + public void testReadWithNs() throws FileNotFoundException { + + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final int readLength = 76; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setCigarString("3M414N1D73M"); + + final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, 
ReadUtils.ClippingTail.LEFT_TAIL); + Assert.assertEquals(result, 3); + } } From 7cab709a88c86145d3be601c5ec2ea6476aa02a3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 13 Mar 2013 14:57:28 -0400 Subject: [PATCH 057/226] Fixed the logic of the @Output annotation and its interaction with 'required'. ALL GATK DEVELOPERS PLEASE READ NOTES BELOW: I have updated the @Output annotation to behave differently and to include a 'defaultToStdout' tag. * The 'defaultToStdout' tags lets walkers specify whether to default to stdout if -o is not provided. * The logic for @Output is now: * if required==true then -o MUST be provided or a User Error is generated. * if required==false and defaultToStdout==true then the output is assigned to stdout if no -o is provided. * this is the default behavior (i.e. @Output with no modifiers). * if required==false and defaultToStdout==false then the output object is null. * use this combination for truly optional outputs (e.g. the -badSites option in AssessNA12878). * I have updated walkers so that previous behavior has been maintained (as best I could). * In general, all @Outputs with default long/short names have required=false. * Walkers with nWayOut options must have required==false and defaultToStdout==false (I added checks for this) * I added unit tests for @Output changes with David's help (thanks!). 
* #resolve GSA-837 --- .../bqsr/RecalibrationArgumentCollection.java | 4 +- .../bqsr/RecalibrationPerformance.java | 2 +- .../compression/reducereads/ReduceReads.java | 11 +- .../targets/BaseCoverageDistribution.java | 2 +- .../diagnostics/targets/DiagnoseTargets.java | 2 +- .../targets/FindCoveredIntervals.java | 2 +- .../walkers/genotyper/UnifiedGenotyper.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 8 +- .../haplotypecaller/HaplotypeResolver.java | 2 +- .../gatk/walkers/indels/IndelRealigner.java | 8 +- .../walkers/phasing/ReadBackedPhasing.java | 2 +- .../ValidationSiteSelector.java | 2 +- .../ApplyRecalibration.java | 2 +- .../VariantRecalibrator.java | 2 +- .../variantutils/RegenotypeVariants.java | 2 +- .../sting/commandline/ArgumentSource.java | 8 + .../sting/commandline/Output.java | 7 + .../OutputStreamArgumentTypeDescriptor.java | 6 +- .../SAMFileWriterArgumentTypeDescriptor.java | 6 +- .../VCFWriterArgumentTypeDescriptor.java | 8 +- .../gatk/walkers/ActiveRegionWalker.java | 4 +- .../walkers/annotator/VariantAnnotator.java | 2 +- .../walkers/beagle/BeagleOutputToVCF.java | 2 +- .../walkers/beagle/ProduceBeagleInput.java | 4 +- .../beagle/VariantsToBeagleUnphased.java | 2 +- .../diagnostics/CoveredByNSamplesSites.java | 2 +- .../gatk/walkers/diffengine/DiffObjects.java | 2 +- .../walkers/filters/VariantFiltration.java | 2 +- .../gatk/walkers/qc/DocumentationTest.java | 2 +- .../gatk/walkers/readutils/ClipReads.java | 4 +- .../gatk/walkers/readutils/PrintReads.java | 2 +- .../walkers/variantutils/CombineVariants.java | 2 +- .../variantutils/FilterLiftedVariants.java | 2 +- .../variantutils/LeftAlignVariants.java | 2 +- .../variantutils/LiftoverVariants.java | 2 +- .../walkers/variantutils/SelectHeaders.java | 2 +- .../walkers/variantutils/SelectVariants.java | 2 +- .../VariantValidationAssessor.java | 2 +- .../VariantsToAllelicPrimitives.java | 2 +- .../walkers/variantutils/VariantsToTable.java | 2 +- .../walkers/variantutils/VariantsToVCF.java | 
2 +- .../ArgumentTypeDescriptorUnitTest.java | 183 ++++++++++++++++++ 42 files changed, 262 insertions(+), 57 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index ee2edee5a..447569643 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -91,7 +91,7 @@ public class RecalibrationArgumentCollection { * If not provided, then no plots will be generated (useful for queue scatter/gathering). * However, we *highly* recommend that users generate these plots whenever possible for QC checking. */ - @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false) + @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false, defaultToStdout = false) public File RECAL_PDF_FILE = null; /** @@ -220,7 +220,7 @@ public class RecalibrationArgumentCollection { public String FORCE_PLATFORM = null; @Hidden - @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only") + @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. 
For debugging/testing purposes only", defaultToStdout = false) public PrintStream RECAL_TABLE_UPDATE_LOG = null; /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java index fb11f6249..d0af08d90 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java @@ -66,7 +66,7 @@ import java.io.*; @PartitionBy(PartitionType.READ) public class RecalibrationPerformance extends RodWalker implements NanoSchedulable { - @Output(doc="Write output to this file", required = true) + @Output(doc="Write output to this file") public PrintStream out; @Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file") diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index bc582fd49..da9bc1b37 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -69,6 +69,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter; @@ -112,7 +113,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; @Downsample(by=DownsampleType.BY_SAMPLE, 
toCoverage=40) public class ReduceReads extends ReadWalker, ReduceReadsStash> { - @Output(required=true) + @Output(required = false, defaultToStdout = false) private StingSAMFileWriter out = null; private SAMFileWriter writerToUse = null; @@ -259,6 +260,13 @@ public class ReduceReads extends ReadWalker, Redu @Override public void initialize() { super.initialize(); + + if ( !nwayout && out == null ) + throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes"); + + if ( nwayout && out != null ) + throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other"); + GenomeAnalysisEngine toolkit = getToolkit(); readNameHash = new Object2LongOpenHashMap(100000); // prepare the read name hash to keep track of what reads have had their read names compressed intervalList = new ObjectAVLTreeSet(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode @@ -266,7 +274,6 @@ public class ReduceReads extends ReadWalker, Redu if (toolkit.getIntervals() != null) intervalList.addAll(toolkit.getIntervals()); - final boolean preSorted = true; final boolean indexOnTheFly = true; final boolean keep_records = true; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java index 9bd08a020..b70581dd3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java @@ -99,7 +99,7 @@ public class BaseCoverageDistribution extends LocusWalker, Ma /** * The output GATK Report table */ - @Output(required = true, doc = "The output GATK Report table") + @Output(doc = "The output GATK 
Report table") private PrintStream out; /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index e4310588e..b302a967c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -110,7 +110,7 @@ import java.util.*; @PartitionBy(PartitionType.INTERVAL) public class DiagnoseTargets extends LocusWalker { - @Output(doc = "File to which variants should be written", required = true) + @Output(doc = "File to which variants should be written") private VariantContextWriter vcfWriter = null; @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index 6b4d1f7a8..eef581160 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -92,7 +92,7 @@ import java.io.PrintStream; @PartitionBy(PartitionType.CONTIG) @ActiveRegionTraversalParameters(extension = 0, maxRegion = 50000) public class FindCoveredIntervals extends ActiveRegionWalker { - @Output(required = true) + @Output private PrintStream out; @Argument(fullName = "uncovered", shortName = "u", required = false, doc = "output intervals that fail the coverage threshold instead") diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 4347a1a84..54fcad1df 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -180,7 +180,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif * A raw, unfiltered, highly sensitive callset in VCF format. */ //@Gather(className = "org.broadinstitute.sting.queue.extensions.gatk.CatVariantsGatherer") - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter writer = null; @Hidden diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 7948b93a9..4bf09ad2d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -139,10 +139,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem /** * A raw, unfiltered, highly sensitive callset in VCF format. 
*/ - @Output(doc="File to which variants should be written", required = true) + @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; - @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) + @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false) protected PrintStream graphWriter = null; /** @@ -170,14 +170,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png * */ - @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false) + @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) protected StingSAMFileWriter bamWriter = null; private HaplotypeBAMWriter haplotypeBAMWriter; /** * The type of BAM output we want to see. 
*/ - @Output(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) + @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index 4de9488e9..facc929cd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -125,7 +125,7 @@ public class HaplotypeResolver extends RodWalker { @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) public List> variants; - @Output(doc="File to which variants should be written", required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter baseWriter = null; private VariantContextWriter writer; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index d3a13df29..7d8243c98 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -189,7 +189,7 @@ public class IndelRealigner extends ReadWalker { /** * The realigned bam file. 
*/ - @Output(required=false, doc="Output bam") + @Output(required=false, doc="Output bam", defaultToStdout=false) protected StingSAMFileWriter writer = null; protected ConstrainedMateFixingManager manager = null; protected SAMFileWriter writerToUse = null; @@ -295,15 +295,15 @@ public class IndelRealigner extends ReadWalker { protected boolean KEEP_ALL_PG_RECORDS = false; @Hidden - @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") + @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, defaultToStdout=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") protected String OUT_INDELS = null; @Hidden - @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false) + @Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) protected String OUT_STATS = null; @Hidden - @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false) + @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false) protected String OUT_SNPS = null; // fasta reference reader to supplement the edges of the reference sequence diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index c1b484542..a297b38cf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -131,7 +131,7 @@ public class ReadBackedPhasing extends RodWalker { /** * The output VCF file */ - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 22425e62e..7de0c7e60 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -128,7 +128,7 @@ public class ApplyRecalibration extends RodWalker implements T ///////////////////////////// // Outputs ///////////////////////////// - @Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true) + @Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value") private VariantContextWriter vcfWriter = null; ///////////////////////////// diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 99d926ea5..320328ab1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -194,7 +194,7 @@ public class VariantRecalibrator extends RodWalker implements T @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new 
StandardVariantContextInputArgumentCollection(); - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; private UnifiedGenotyperEngine UG_engine = null; diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java index b9c785879..efacde231 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java @@ -175,6 +175,14 @@ public class ArgumentSource { return field.isAnnotationPresent(Deprecated.class); } + /** + * Returns whether the field should default to stdout if not provided explicitly on the command-line. + * @return True if field should default to stdout. + */ + public boolean defaultsToStdout() { + return field.isAnnotationPresent(Output.class) && (Boolean)CommandLineUtils.getValue(ArgumentTypeDescriptor.getArgumentAnnotation(this),"defaultToStdout"); + } + /** * Returns false if a type-specific default can be employed. * @return True to throw in a type specific default. False otherwise. diff --git a/public/java/src/org/broadinstitute/sting/commandline/Output.java b/public/java/src/org/broadinstitute/sting/commandline/Output.java index 47a47602a..0db870f2e 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/Output.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Output.java @@ -66,6 +66,13 @@ public @interface Output { */ boolean required() default false; + /** + * If this argument is not required, should it default to use stdout if no + * output file is explicitly provided on the command-line? + * @return True if the argument should default to stdout. False otherwise. + */ + boolean defaultToStdout() default true; + /** * Should this command-line argument be exclusive of others. 
Should be * a comma-separated list of names of arguments of which this should be diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java index fbcc32d78..18185f12e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java @@ -66,7 +66,7 @@ public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public boolean createsTypeDefault(ArgumentSource source) { - return source.isRequired(); + return !source.isRequired() && source.defaultsToStdout(); } @Override @@ -76,7 +76,7 @@ public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { - if(!source.isRequired()) + if(source.isRequired() || !source.defaultsToStdout()) throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default."); OutputStreamStub stub = new OutputStreamStub(defaultOutputStream); engine.addOutput(stub); @@ -90,7 +90,7 @@ public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor { // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object; // therefore, the user must have failed to specify a type default - if(fileName == null && !source.isRequired()) + if(fileName == null && source.isRequired()) throw new MissingArgumentValueException(definition); OutputStreamStub stub = new OutputStreamStub(new File(fileName)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java 
b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java index 34a7f967f..458846db0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java @@ -89,7 +89,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor @Override public boolean createsTypeDefault(ArgumentSource source) { - return source.isRequired(); + return !source.isRequired() && source.defaultsToStdout(); } @Override @@ -99,7 +99,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor @Override public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { - if(!source.isRequired()) + if(source.isRequired() || !source.defaultsToStdout()) throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default."); SAMFileWriterStub stub = new SAMFileWriterStub(engine,defaultOutputStream); engine.addOutput(stub); @@ -162,7 +162,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor DEFAULT_ARGUMENT_FULLNAME, DEFAULT_ARGUMENT_SHORTNAME, ArgumentDefinition.getDoc(annotation), - false, + source.isRequired(), false, source.isMultiValued(), source.isHidden(), diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java index 5b03859f5..91013673f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java @@ -110,7 +110,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { */ @Override public boolean createsTypeDefault(ArgumentSource source) 
{ - return source.isRequired(); + return !source.isRequired() && source.defaultsToStdout(); } @Override @@ -119,8 +119,8 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { } @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { - if(!source.isRequired()) + public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { + if(source.isRequired() || !source.defaultsToStdout()) throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default."); VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); engine.addOutput(stub); @@ -143,7 +143,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object; // therefore, the user must have failed to specify a type default - if(writerFile == null && !source.isRequired()) + if(writerFile == null && source.isRequired()) throw new MissingArgumentValueException(defaultArgumentDefinition); // Create a stub for the given object. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index e14e50b1a..ebfc52d3f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -67,7 +67,7 @@ public abstract class ActiveRegionWalker extends Walker extends Walker implements Ann public List> resources = Collections.emptyList(); public List> getResourceRodBindings() { return resources; } - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java index 4b96dbffb..15bd79586 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java @@ -106,7 +106,7 @@ public class BeagleOutputToVCF extends RodWalker { @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true) public RodBinding beaglePhased; - @Output(doc="VCF File to which variants should be written",required=true) + @Output(doc="VCF File to which variants should be written") protected VariantContextWriter vcfWriter = null; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java index 618fda0df..6e5aa250f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInput.java @@ -89,11 +89,11 @@ public class ProduceBeagleInput extends RodWalker { public RodBinding validation; - @Output(doc="File to which BEAGLE input should be written",required=true) + @Output(doc="File to which BEAGLE input should be written") protected PrintStream beagleWriter = null; @Hidden - @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false) + @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false, defaultToStdout = false) protected PrintStream markers = null; int markerCounter = 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java index ab0ce79fd..646c57a2b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphased.java @@ -62,7 +62,7 @@ public class VariantsToBeagleUnphased extends RodWalker { @Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true) public RodBinding variants; - @Output(doc="File to which BEAGLE unphased genotypes should be written",required=true) + @Output(doc="File to which BEAGLE unphased genotypes should be written") protected PrintStream beagleWriter = null; @Argument(fullName = "bootstrap_fraction", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java index 169c2708b..bff2ace63 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java @@ -75,7 +75,7 @@ import java.util.Collection; @By(DataSource.REFERENCE_ORDERED_DATA) public class CoveredByNSamplesSites extends RodWalker implements TreeReducible { - @Output(fullName = "OutputIntervals", shortName = "out", doc = "Name of file for output intervals", required = true) + @Output(fullName = "OutputIntervals", shortName = "out", doc = "Name of file for output intervals") PrintStream outputStream; @ArgumentCollection diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java index 6b5189dfd..524f5c250 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java @@ -146,7 +146,7 @@ public class DiffObjects extends RodWalker { * * See http://www.broadinstitute.org/gatk/guide/article?id=1299 for details. 
*/ - @Output(doc="File to which results should be written",required=true) + @Output(doc="File to which results should be written") protected PrintStream out; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java index c59c61803..8feb9101c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java @@ -92,7 +92,7 @@ public class VariantFiltration extends RodWalker { @Input(fullName="mask", doc="Input ROD mask", required=false) public RodBinding mask; - @Output(doc="File to which variants should be written", required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter writer = null; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java index 8902773f7..5db67a7f0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java @@ -73,7 +73,7 @@ public class DocumentationTest extends RodWalker { @Input(fullName="featureArg", shortName = "featureArg", doc="A RodBinding of feature", required=false) private RodBinding featureArg = null; - @Output(doc="VCFWriter",required=true) + @Output(doc="VCFWriter") protected VariantContextWriter vcfWriter = null; @Advanced diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java index 739da5a98..879022299 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java @@ 
-161,13 +161,13 @@ public class ClipReads extends ReadWalker implements NanoSchedulable { - @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) + @Output(doc="Write output to this BAM filename instead of STDOUT") StingSAMFileWriter out; @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 436a973df..45dbc937d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -133,7 +133,7 @@ public class CombineVariants extends RodWalker implements Tree @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) public List> variants; - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java index f285fb797..e61cda765 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -56,7 +56,7 @@ public class FilterLiftedVariants extends RodWalker { private static final int MAX_VARIANT_SIZE = 100; - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which 
variants should be written") protected VariantContextWriter writer = null; private long failedLocs = 0, totalLocs = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index e6d3e6e94..700b34b38 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -87,7 +87,7 @@ public class LeftAlignVariants extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter baseWriter = null; private VariantContextWriter writer; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 0a7ad5b7b..17d50f101 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -62,7 +62,7 @@ public class LiftoverVariants extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected File file = null; protected VariantContextWriter writer = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java 
index 9bbf728e1..478bba846 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -110,7 +110,7 @@ public class SelectHeaders extends RodWalker implements TreeRe @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Output(doc = "File to which variants should be written", required = true) + @Output(doc = "File to which variants should be written") protected VariantContextWriter vcfWriter; @Argument(fullName = "header_name", shortName = "hn", doc = "Include header. Can be specified multiple times", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index f72ce3bd6..1c5e9d1ba 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -199,7 +199,7 @@ public class SelectVariants extends RodWalker implements TreeR @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false) protected RodBinding concordanceTrack; - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. 
Can be specified multiple times", required=false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index 0e2a04bf2..d189459c0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -91,7 +91,7 @@ public class VariantValidationAssessor extends RodWalker @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter vcfwriter = null; @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid", required=false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java index 319183f28..e25f158f2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToAllelicPrimitives.java @@ -84,7 +84,7 @@ public class VariantsToAllelicPrimitives extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter baseWriter = null; private VariantContextWriter vcfWriter; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 444eb745c..f1f93f1f5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -111,7 +111,7 @@ public class VariantsToTable extends RodWalker { @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) public List> variants; - @Output(doc="File to which results should be written",required=true) + @Output(doc="File to which results should be written") protected PrintStream out; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 7c7f52803..96b66a0e3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -87,7 +87,7 @@ import java.util.*; @Reference(window=@Window(start=-40,stop=40)) public class VariantsToVCF extends RodWalker { - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to which variants should be written") protected VariantContextWriter baseWriter = null; private VariantContextWriter vcfwriter; // needed because hapmap/dbsnp indel records move diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java new file mode 100644 index 000000000..85ad5d575 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/ArgumentTypeDescriptorUnitTest.java @@ -0,0 +1,183 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free 
of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.commandline; + +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import net.sf.samtools.SAMFileWriter; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.io.stubs.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Collection; + + +public class ArgumentTypeDescriptorUnitTest extends BaseTest { + + //////////////////////////////////////////////////////////////////// + // This section tests the functionality of the @Output annotation // + //////////////////////////////////////////////////////////////////// + + private class ATDTestCommandLineProgram extends CommandLineProgram { + public int execute() { return 0; } + + @Override + public Collection getArgumentTypeDescriptors() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + return Arrays.asList( new SAMFileWriterArgumentTypeDescriptor(engine, System.out), + new OutputStreamArgumentTypeDescriptor(engine, System.out), + new VCFWriterArgumentTypeDescriptor(engine, System.out, null)); + } + + protected abstract class ATDTestOutputArgumentSource { + public abstract Object getOut(); + } + + protected class OutputRequiredSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputRequiredVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputRequiredStreamArgumentSource 
extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public PrintStream out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public PrintStream out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public PrintStream out; + public Object getOut() { return out; } + } + } + + @DataProvider(name = "OutputProvider") + public Object[][] OutputProvider() { + + ObjectArrayList tests = new ObjectArrayList(); + + final ATDTestCommandLineProgram clp = new ATDTestCommandLineProgram(); + + for ( final Object obj : Arrays.asList(clp.new OutputRequiredSamArgumentSource(), clp.new OutputRequiredVcfArgumentSource(), 
clp.new OutputRequiredStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, true, true, provided}); + } + } + + for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredSamArgumentSource(), clp.new OutputNotRequiredVcfArgumentSource(), clp.new OutputNotRequiredStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, false, true, provided}); + } + } + + for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredNoDefaultSamArgumentSource(), clp.new OutputNotRequiredNoDefaultVcfArgumentSource(), clp.new OutputNotRequiredNoDefaultStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, false, false, provided}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "OutputProvider") + public void testOutput(final ATDTestCommandLineProgram.ATDTestOutputArgumentSource argumentSource, final boolean required, final boolean hasDefault, final boolean provided) { + + final ParsingEngine parser = new ParsingEngine(new ATDTestCommandLineProgram()); + parser.addArgumentSource(argumentSource.getClass()); + parser.parse(provided ? 
new String[] {"out", "foo"} : new String[] {}); + + try { + parser.loadArgumentsIntoObject(argumentSource); + + if ( !provided && (required || !hasDefault) ) + Assert.assertEquals(argumentSource.getOut(), null); + else if ( !provided ) + Assert.assertNotEquals(argumentSource.getOut(), null); + else if ( argumentSource.getOut() == null || !(argumentSource.getOut() instanceof SAMFileWriterStub) ) // can't test this one case + Assert.assertEquals(!provided, outputIsStdout(argumentSource.getOut())); + + } catch (Exception e) { + throw new ReviewedStingException(e.getMessage()); + } + } + + private static boolean outputIsStdout(final Object out) { + if ( out == null ) { + return false; + } else if ( out instanceof SAMFileWriterStub ) { + return ((SAMFileWriterStub)out).getOutputStream() != System.out; + } else if ( out instanceof VariantContextWriterStub ) { + return ((VariantContextWriterStub)out).getOutputStream() == System.out; + } else if ( out instanceof OutputStreamStub ) { + return ((OutputStreamStub)out).getOutputStream() == System.out; + } + return false; + } + +} \ No newline at end of file From 61349ecefa9e6bdc979b85f9bc943affcfff0dcd Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Tue, 12 Mar 2013 16:32:59 -0400 Subject: [PATCH 058/226] Cleaned up annotations - Moved AverageAltAlleleLength, MappingQualityZeroFraction and TechnologyComposition to Private - VariantType, TransmissionDisequilibriumTest, MVLikelihoodRatio and GCContent are no longer Experimental - AlleleBalanceBySample, HardyWeinberg and HomopolymerRun are Experimental and available to users with a big bold caveat message - Refactored getMeanAltAlleleLength() out of AverageAltAlleleLength into GATKVariantContextUtils in order to make QualByDepth independent of where AverageAltAlleleLength lives - Unrelated change, bundled in for convenience: made HC argument includeUnmappedreads @Hidden - Removed unnecessary check in AverageAltAlleleLength --- 
.../annotator/BaseQualityRankSumTest.java | 8 +- .../walkers/annotator/ChromosomeCounts.java | 12 +- .../annotator/ClippingRankSumTest.java | 17 +-- .../gatk/walkers/annotator/GCContent.java | 13 +- .../gatk/walkers/annotator/HardyWeinberg.java | 12 +- .../walkers/annotator/HomopolymerRun.java | 12 +- .../walkers/annotator/MVLikelihoodRatio.java | 19 +-- .../annotator/MappingQualityRankSumTest.java | 8 +- .../gatk/walkers/annotator/QualByDepth.java | 3 +- .../walkers/annotator/ReadPosRankSumTest.java | 8 +- .../annotator/TandemRepeatAnnotator.java | 6 +- .../TransmissionDisequilibriumTest.java | 13 +- .../gatk/walkers/annotator/VariantType.java | 6 +- .../haplotypecaller/HaplotypeCaller.java | 1 + .../annotator/AlleleBalanceBySample.java | 8 +- .../annotator/AverageAltAlleleLength.java | 117 ------------------ .../annotator/MappingQualityZeroFraction.java | 85 ------------- .../sting/gatk/walkers/annotator/SnpEff.java | 7 +- .../annotator/TechnologyComposition.java | 101 --------------- .../variant/GATKVariantContextUtils.java | 45 ++++++- 20 files changed, 148 insertions(+), 353 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 43e929ac0..04f9e87c7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -58,8 +58,12 @@ import java.util.*; /** - * The u-based z-approximation from the Mann-Whitney Rank Sum Test 
for base qualities (ref bases vs. bases of the alternate allele). - * Note that the base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + * U-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities + * + *

        This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities(ref bases vs. bases of the alternate allele).

        + * + *

        Caveat

        + *

        The base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

        */ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation { public List getKeyNames() { return Arrays.asList("BaseQRankSum"); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index eb3dc6959..64d45df02 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -65,9 +65,15 @@ import java.util.*; /** - * Allele count in genotypes, for each ALT allele, in the same order as listed; - * allele Frequency, for each ALT allele, in the same order as listed; total number - * of alleles in called genotypes. + * Allele counts and frequency for each ALT allele and total number of alleles in called genotypes + * + *

        This annotation tool outputs the following: + * + *

          + *
        • Allele count in genotypes, for each ALT allele, in the same order as listed
        • + *
        • Allele Frequency, for each ALT allele, in the same order as listed
        • + *
        • Total number of alleles in called genotypes
        • + *

        */ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index dbb977ebf..90ca5c667 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -57,14 +57,15 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.*; /** - * Created with IntelliJ IDEA. - * User: rpoplin - * Date: 6/28/12 - */ - -/** - * The u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele) - * Note that the clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + * U-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases + * + *

        This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele).

        + * + *

        Caveat

        + *

        The clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

        + * + * @author rpoplin + * @since 6/28/12 */ public class ClippingRankSumTest extends RankSumTest { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index aa5b779da..a4b1b1b49 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -68,9 +68,16 @@ import java.util.Map; /** - * The GC content (# GC bases / # all bases) of the reference within 50 bp +/- this site + * GC content of the reference around this site + * + *

        The GC content is the number of GC bases relative to the total number of bases (# GC bases / # all bases) around this site on the reference.

        + * + *

        Caveat

        + *

        The window size used to calculate the GC content around the site is set by the tool used for annotation + * (currently UnifiedGenotyper, HaplotypeCaller or VariantAnnotator). See the Technical Document for each tool + * to find out what window size they use.

        */ -public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class GCContent extends InfoFieldAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -86,7 +93,7 @@ public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnota public List getKeyNames() { return Arrays.asList("GC"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("GC", 1, VCFHeaderLineType.Integer, "GC content within 20 bp +/- the variant")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("GC", 1, VCFHeaderLineType.Integer, "GC content around the variant (see docs for window size details)")); } public boolean useZeroQualityReads() { return false; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index b349be285..43ec537a4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -51,6 +51,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; @@ -68,11 +69,16 @@ import java.util.Map; /** - * Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for 
disequilibrium + * Hardy-Weinberg test for disequilibrium * - *

        Requires at least 10 samples in order to run. Only genotypes with sufficient quality (>10) will be taken into account.

        + *

        This annotation calculates the Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium.

        + * + *

        Caveats

        + *

        This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.

        + *

        Right now we just ignore genotypes that are not confident, but this throws off our HW ratios. + * More analysis is needed to determine the right thing to do when the genotyper cannot decide whether a given sample is het or hom var.

        */ -public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation { +public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAnnotation { private static final int MIN_SAMPLES = 10; private static final int MIN_GENOTYPE_QUALITY = 10; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index f9663d33e..4039241ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -50,6 +50,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.GenomeLoc; @@ -63,13 +64,16 @@ import java.util.List; import java.util.Map; /** - * Largest contiguous homopolymer run of the variant allele in either direction on the reference. + * Largest contiguous homopolymer run of the variant allele * - *

        Computed only for bi-allelic sites.

        + *

        Calculates the length of the largest contiguous homopolymer run of the variant allele in either direction on the reference.

        * - *

        Note that this annotation is no longer supported, as we have found that it does not give satisfactory results. Use at your own risk!

        + *

        Caveats

        + *

        This can only be computed for bi-allelic sites.

        + *

        This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.

        + *

        This needs to be computed in a more accurate manner. We currently look only at direct runs of the alternate allele adjacent to this position.

        */ -public class HomopolymerRun extends InfoFieldAnnotation { +public class HomopolymerRun extends InfoFieldAnnotation implements ExperimentalAnnotation { private boolean ANNOTATE_INDELS = true; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index 58d720899..ad974a083 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -65,19 +65,20 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; /** - * Likelihood of the site being a mendelian violation versus the likelihood of the site transmitting according to mendelian rules. + * Likelihood of being a Mendelian Violation * - *

        - * Given a variant context, uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation - * versus the likelihood of the site transmitting according to mendelian rules. This assumes that the organism is - * diploid. When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than - * the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios. - *

        + *

        Given a variant context, this tool uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation + * versus the likelihood of the site transmitting according to mendelian rules.

        * - *

        Note that this annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).

        + *

        Note that this annotation requires a valid ped file.

        + * + *

        Caveat

        + *

        This tool assumes that the organism is diploid. When multiple trios are present, the annotation is simply the maximum + * of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain + * sites and many trios.

        */ -public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { +public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; public static final String MVLR_KEY = "MVLR"; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 8c401eecd..b30df04a8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -59,8 +59,12 @@ import java.util.*; /** - * The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele) - * Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + * U-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities + * + *

        This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele).

        + * + *

        Caveat

        + *

        The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

        */ public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 80bbfc2e4..6f875b23c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -54,6 +54,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.variant.variantcontext.Genotype; @@ -113,7 +114,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( depth == 0 ) return null; - double altAlleleLength = AverageAltAlleleLength.getMeanAltAlleleLength(vc); + double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index ae0d2a87b..182a9226f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -65,8 +65,12 @@ import 
org.broadinstitute.variant.variantcontext.Allele; import java.util.*; /** - * The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error). - * Note that the read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + * U-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele + * + *

        This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error.

        + * + *

        Caveat

        + *

        The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

        */ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java index d976592cb..332d18341 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java @@ -65,11 +65,13 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - /** * Annotates variants that are composed of tandem repeats * - *

        Note that this annotation is currently not compatible with HaplotypeCaller.

        + *

        This tool outputs the number of times the tandem repeat unit is repeated, for each allele (including reference).

        + * + *

        Caveat

        + *

        This annotation is currently not compatible with HaplotypeCaller.

        */ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardAnnotation { private static final String STR_PRESENT = "STR"; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index f29899f7f..f8efd7c3f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -67,10 +67,19 @@ import java.util.*; /** * Wittkowski transmission disequilibrium test * - *

        Note that this annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).

        + *

        Test statistic from Wittkowski transmission disequilibrium test. + * The calculation is based on the following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT

        + * + *

        Note that this annotation requires a valid ped file.

        + * + *

        Caveat

        + *

        This annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).

        + * + * @author rpoplin, lfran, ebanks + * @since 11/14/11 */ -public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { +public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements RodRequiringAnnotation { private Set trios = null; private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java index 89b0bcf96..555c75deb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java @@ -50,7 +50,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.IndelUtils; @@ -62,8 +61,11 @@ import java.util.*; /** * Assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.) + * + *

        This tool assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.). + * It also specifies whether the variant is multiallelic (>2 alleles).

        */ -public class VariantType extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class VariantType extends InfoFieldAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 7948b93a9..3f3d7123a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -212,6 +212,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, * and may make use of them in assembly and calling, where possible. */ + @Hidden @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) protected boolean includeUnmappedReads = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index fbba6722e..608257b54 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -46,7 +46,13 @@ import java.util.List; /** - * The allele balance (fraction of ref bases over ref + alt bases) separately for each bialleleic het-called sample + * Allele balance per sample + * + *

        The allele balance is the fraction of ref bases over ref + alt bases.

        + * + *

        Caveats

        + *

        Note that this annotation will only work properly for biallelic het-called samples.

        + *

        This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.

        */ public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java deleted file mode 100644 index 17a33bdca..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java +++ /dev/null @@ -1,117 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypesContext; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 1/3/13 - * Time: 11:36 AM - * To change this template use File | Settings | File Templates. 
- */ -public class AverageAltAlleleLength extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation, ExperimentalAnnotation { - - public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Average Allele Length")); - } - - public List getKeyNames() { return Arrays.asList("AAL"); } - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map perReadAlleleLikelihoodMap ) { - if ( !vc.hasLog10PError() ) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) - return null; - - Map map = new HashMap(); - - double length = getMeanAltAlleleLength(vc); - map.put(getKeyNames().get(0),String.format("%.2f",length)); - return map; - } - - public static double getMeanAltAlleleLength(VariantContext vc) { - double averageLength = 1.0; - if ( ! vc.isSNP() && ! 
vc.isSymbolic() ) { - // adjust for the event length - int averageLengthNum = 0; - int averageLengthDenom = 0; - int refLength = vc.getReference().length(); - for ( Allele a : vc.getAlternateAlleles() ) { - int numAllele = vc.getCalledChrCount(a); - int alleleSize; - if ( a.length() == refLength ) { - // SNP or MNP - byte[] a_bases = a.getBases(); - byte[] ref_bases = vc.getReference().getBases(); - int n_mismatch = 0; - for ( int idx = 0; idx < a_bases.length; idx++ ) { - if ( a_bases[idx] != ref_bases[idx] ) - n_mismatch++; - } - alleleSize = n_mismatch; - } - else if ( a.isSymbolic() ) { - alleleSize = 1; - } else { - alleleSize = Math.abs(refLength-a.length()); - } - averageLengthNum += alleleSize*numAllele; - averageLengthDenom += numAllele; - } - averageLength = ( (double) averageLengthNum )/averageLengthDenom; - } - - return averageLength; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java deleted file mode 100644 index 65d2f0757..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java +++ /dev/null @@ -1,85 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Fraction of all reads across samples that have mapping quality zero - */ -public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation { - - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( stratifiedContexts.size() == 0 ) - return null; - - int mq0 = 0; 
- int depth = 0; - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - depth += context.size(); - final ReadBackedPileup pileup = context.getBasePileup(); - for (PileupElement p : pileup ) { - if ( p.getMappingQual() == 0 ) - mq0++; - } - } - if (depth > 0) { - double mq0f = (double)mq0 / (double )depth; - - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%1.4f", mq0f)); - return map; - } - else - return null; - } - - public List getKeyNames() { return Arrays.asList("MQ0Fraction"); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Fraction of Mapping Quality Zero Reads")); } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 17002ba39..bc365c59c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -45,11 +45,12 @@ import java.util.*; /** * A set of genomic annotations based on the output of the SnpEff variant effect predictor tool - * (http://snpeff.sourceforge.net/). * - * For each variant, chooses one of the effects of highest biological impact from the SnpEff + *

        See http://snpeff.sourceforge.net/ for more information on the SnpEff tool

        . + * + *

        For each variant, this tool chooses one of the effects of highest biological impact from the SnpEff * output file (which must be provided on the command line via --snpEffFile filename.vcf), - * and adds annotations on that effect. + * and adds annotations on that effect.

        * * @author David Roazen */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java deleted file mode 100644 index dbaafb1ed..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java +++ /dev/null @@ -1,101 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Counts of bases from Illumina, 454, and SOLiD at this site - */ -@Hidden -public class TechnologyComposition extends InfoFieldAnnotation implements ExperimentalAnnotation { - private String nIllumina = "NumIllumina"; - private String n454 ="Num454"; - private String nSolid = "NumSOLiD"; - private String nOther = "NumOther"; - public Map annotate(final RefMetaDataTracker tracker, - final AnnotatorCompatible walker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( stratifiedContexts.size() == 0 ) - return null; - - int readsIllumina = 0; - int readsSolid = 0; - int reads454 = 0; - int readsOther = 0; - - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - final ReadBackedPileup pileup = 
context.getBasePileup(); - for ( PileupElement p : pileup ) { - if(ReadUtils.is454Read(p.getRead())) - reads454++; - else if (ReadUtils.isSOLiDRead(p.getRead())) - readsSolid++; - else if (ReadUtils.isIlluminaRead(p.getRead())) - readsIllumina++; - else - readsOther++; - } - } - - Map map = new HashMap(); - map.put(nIllumina, String.format("%d", readsIllumina)); - map.put(n454, String.format("%d", reads454)); - map.put(nSolid, String.format("%d", readsSolid)); - map.put(nOther, String.format("%d", readsOther)); - return map; - } - - public List getKeyNames() { return Arrays.asList(nIllumina,n454,nSolid,nOther); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(nIllumina, 1, VCFHeaderLineType.Integer, "Number of Illumina reads"), - new VCFInfoHeaderLine(n454, 1, VCFHeaderLineType.Integer, "Number of 454 reads"), - new VCFInfoHeaderLine(nSolid, 1, VCFHeaderLineType.Integer, "Number of SOLiD reads"), - new VCFInfoHeaderLine(nOther, 1, VCFHeaderLineType.Integer, "Number of Other technology reads")); } - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 398b32669..627bee3ea 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -51,7 +51,6 @@ public class GATKVariantContextUtils { public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_INTERSECTION = "Intersection"; - public enum GenotypeMergeType { /** * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. 
@@ -97,6 +96,46 @@ public class GATKVariantContextUtils { MIX_TYPES } + /** + * Refactored out of the AverageAltAlleleLength annotation class + * @param vc the variant context + * @return the average length of the alt allele (a double) + */ + public static double getMeanAltAlleleLength(VariantContext vc) { + double averageLength = 1.0; + if ( ! vc.isSNP() && ! vc.isSymbolic() ) { + // adjust for the event length + int averageLengthNum = 0; + int averageLengthDenom = 0; + int refLength = vc.getReference().length(); + for ( Allele a : vc.getAlternateAlleles() ) { + int numAllele = vc.getCalledChrCount(a); + int alleleSize; + if ( a.length() == refLength ) { + // SNP or MNP + byte[] a_bases = a.getBases(); + byte[] ref_bases = vc.getReference().getBases(); + int n_mismatch = 0; + for ( int idx = 0; idx < a_bases.length; idx++ ) { + if ( a_bases[idx] != ref_bases[idx] ) + n_mismatch++; + } + alleleSize = n_mismatch; + } + else if ( a.isSymbolic() ) { + alleleSize = 1; + } else { + alleleSize = Math.abs(refLength-a.length()); + } + averageLengthNum += alleleSize*numAllele; + averageLengthDenom += numAllele; + } + averageLength = ( (double) averageLengthNum )/averageLengthDenom; + } + + return averageLength; + } + /** * create a genome location, given a variant context * @param genomeLocParser parser @@ -114,14 +153,14 @@ public class GATKVariantContextUtils { } /** - * If this is a BiAlleic SNP, is it a transition? + * If this is a BiAllelic SNP, is it a transition? */ public static boolean isTransition(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; } /** - * If this is a BiAlleic SNP, is it a transversion? + * If this is a BiAllelic SNP, is it a transversion? 
*/ public static boolean isTransversion(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; From 38914384d1a1523b3ed9d3b8d5cf9b0106d42bd0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 12 Mar 2013 15:41:28 -0400 Subject: [PATCH 059/226] Changing CALLED_IN_DB_UNKNOWN_STATUS to count as TRUE_POSITIVEs in the simplified stats for AssessNA12878. --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 2 +- .../sting/utils/activeregion/ActivityProfile.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 3f3d7123a..1f72fd82f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -149,7 +149,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. * - * Turning on this mode may result in serious performance cost for the HC. It's really only approprate to + * Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to * use in specific areas where you want to better understand why the HC is making specific calls. 
* * The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 25948a857..39509e9df 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -259,7 +259,7 @@ public class ActivityProfile { * Can be overridden by subclasses to transform states in any way * * There's no particular contract for the output states, except that they can never refer to states - * beyond the current end of the stateList unless the explictly include preceding states before + * beyond the current end of the stateList unless the explicitly include preceding states before * the reference. So for example if the current state list is [1, 2, 3] this function could return * [1,2,3,4,5] but not [1,2,3,5]. * From 2d350652387ae4751ac68cb621f5da5c5e0f8c95 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Mar 2013 13:44:48 -0500 Subject: [PATCH 061/226] QualityByDepth remaps QD values > 40 to a gaussian around 30 -- This is a temporarily fix / hack to deal with the very high QD values that are generated by the haplotype caller when nearby events occur within reads. In that case, the QUAL field can be many fold higher than normal, and results in an inflated QD value. This hack projects such high QD values back into the good range (as these are good variants in general) so they aren't filtered away by VQSR. 
-- The long-term solution to this problem is to move the HaplotypeCaller to the full bubble calling algorithm -- Update md5s --- .../gatk/walkers/annotator/QualByDepth.java | 33 ++++++++++++++++--- ...dGenotyperIndelCallingIntegrationTest.java | 4 +-- .../UnifiedGenotyperIntegrationTest.java | 4 +-- ...GenotyperNormalCallingIntegrationTest.java | 4 +-- ...dGenotyperReducedReadsIntegrationTest.java | 6 ++-- .../HaplotypeCallerIntegrationTest.java | 6 ++-- 6 files changed, 40 insertions(+), 17 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 6f875b23c..a3fbcc439 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -61,10 +62,7 @@ import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length. @@ -73,6 +71,7 @@ import java.util.Map; * reads associated with the samples with polymorphic genotypes. 
*/ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { +// private final static Logger logger = Logger.getLogger(QualByDepth.class); public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -114,13 +113,37 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( depth == 0 ) return null; - double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); + final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength); + QD = fixTooHighQD(QD); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); return map; } + /** + * The haplotype caller generates very high quality scores when multiple events are on the + * same haplotype. This causes some very good variants to have unusually high QD values, + * and VQSR will filter these out. 
This code looks at the QD value, and if it is above + * threshold we map it down to the mean high QD value, with some jittering + * + * // TODO -- remove me when HaplotypeCaller bubble caller is live + * + * @param QD the raw QD score + * @return a QD value + */ + private double fixTooHighQD(final double QD) { + if ( QD < MAX_QD_BEFORE_FIXING ) { + return QD; + } else { + return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA; + } + } + + private final static double MAX_QD_BEFORE_FIXING = 35; + private final static double IDEAL_HIGH_QD = 30; + private final static double JITTER_SIGMA = 3; + public List getKeyNames() { return Arrays.asList("QD"); } public List getDescriptions() { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 670666fe2..8d0c1f04f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("4bebbe4ed4a7554285a3b4bb7311101c")); + Arrays.asList("b6ad80cef63cab4f75fa4b1fb2517d1d")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - 
Arrays.asList("08b3a85be00c8f6a4fefd3c671463ecf")); + Arrays.asList("939da0bb73b706badd8a0def7446b384")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a0440aaed..15655622e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -232,7 +232,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("68961b19a29ae224059c33ef41cdcb58")); + Arrays.asList("3a805f5b823ccac19aaec01a3016100e")); executeTest(String.format("test multiple technologies"), spec); } @@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("9fcb234f7573209dec4dae86db091efd")); + Arrays.asList("25aa0259876692dc3c848a37369bac6a")); executeTest(String.format("test calling with BAQ"), spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 49083e45b..2512dd5c2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec 
spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("9fac00485419878749b03706ae6b852f")); + Arrays.asList("39ec0b48cd51d797af7ed09cb9ba607e")); executeTest("test Multiple SNP alleles", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("de2c5707c1805d17d70acaecd36b7372")); + Arrays.asList("6b77b8f1002ec577bf0482fbe03222a4")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index d65020dcc..b5fe79993 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8b9a9fc2e7150acbe2dac91b4620f304")); + Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void 
testReducedBamSNPs() { - testReducedCalling("SNP", "b5991dddbfb59366614ff8819062649f"); + testReducedCalling("SNP", "866c19ba60862ad1569d88784423ec8c"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "acde5694a74f867256a54a26cbebbf21"); + testReducedCalling("INDEL", "3e01f990c7a7c25fd9e42be559ca2942"); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8ed589c63..42eb09e6e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -73,7 +73,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "588892934f2e81247bf32e457db88449"); + HCTest(NA12878_BAM, "", "b3bffabb7aafd43e0339958395e6aa10"); } @Test(enabled = false) @@ -95,7 +95,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5af4782a0e1bc9b966b9e3ae76245919"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "852623c93feef5e62fcb555beedc8c53"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -134,7 +134,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("cf0a1bfded656153578df6cf68aa68a2")); + 
Arrays.asList("fd1b51b17f8f9c88abdf66a9372bce5a")); executeTest("HC calling on a ReducedRead BAM", spec); } From 232afdcbeaf0cf0f341c911e22f6e27286db08ee Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 14 Mar 2013 15:30:37 -0400 Subject: [PATCH 062/226] Added check in the MalformedReadFilter for reads without stored bases (i.e. that use '*'). * We now throw a User Error for such reads * User can override this to filter instead with --filter_bases_not_stored * Added appropriate unit test --- .../gatk/filters/MalformedReadFilter.java | 22 ++++++- .../filters/MalformedReadFilterUnitTest.java | 62 +++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java index 366e927dc..f7d1d0297 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java @@ -45,6 +45,9 @@ public class MalformedReadFilter extends ReadFilter { @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false) boolean filterMismatchingBaseAndQuals = false; + @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "if a read has no stored bases (i.e. 
a '*'), filter out the read instead of blowing up.", required = false) + boolean filterBasesNotStored = false; + @Override public void initialize(GenomeAnalysisEngine engine) { this.header = engine.getSAMFileHeader(); @@ -57,7 +60,8 @@ public class MalformedReadFilter extends ReadFilter { !checkAlignmentDisagreesWithHeader(this.header,read) || !checkHasReadGroup(read) || !checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) || - !checkCigarDisagreesWithAlignment(read); + !checkCigarDisagreesWithAlignment(read) || + !checkSeqStored(read, filterBasesNotStored); } private static boolean checkHasReadGroup(final SAMRecord read) { @@ -146,4 +150,20 @@ public class MalformedReadFilter extends ReadFilter { return result; } + + /** + * Check if the read has its base sequence stored + * @param read the read to validate + * @return true if the sequence is stored and false otherwise ("*" in the SEQ field). + */ + protected static boolean checkSeqStored(final SAMRecord read, final boolean filterBasesNotStored) { + + if ( read.getReadBases() != SAMRecord.NULL_SEQUENCE ) + return true; + + if ( filterBasesNotStored ) + return false; + + throw new UserException.MalformedBAM(read, String.format("the BAM file has a read with no stored bases (i.e. it uses '*') which is not supported in the GATK; see the --filter_bases_not_stored argument. 
Offender: %s", read.getReadName())); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java new file mode 100644 index 000000000..981d54d54 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/filters/MalformedReadFilterUnitTest.java @@ -0,0 +1,62 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.filters; + +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.Test; + + +/** + * Tests for the MalformedReadFilter + * + * @author Eric Banks + * @since 3/14/13 + */ +public class MalformedReadFilterUnitTest { + + ////////////////////////////////////// + // Test the checkSeqStored() method // + ////////////////////////////////////// + + @Test(enabled = true) + public void testcheckSeqStored () { + + final GATKSAMRecord goodRead = ArtificialSAMUtils.createArtificialRead(new byte[]{(byte)'A'}, new byte[]{(byte)'A'}, "1M"); + final GATKSAMRecord badRead = ArtificialSAMUtils.createArtificialRead(new byte[]{}, new byte[]{}, "1M"); + badRead.setReadString("*"); + + Assert.assertTrue(MalformedReadFilter.checkSeqStored(goodRead, true)); + Assert.assertFalse(MalformedReadFilter.checkSeqStored(badRead, true)); + + try { + MalformedReadFilter.checkSeqStored(badRead, false); + Assert.assertTrue(false, "We should have exceptioned out in the previous line"); + } catch (UserException e) { } + } +} From b8991f5e9899374549a97a3aa92038ab662ec025 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 15 Mar 2013 12:31:54 -0400 Subject: [PATCH 064/226] Fix for edge case bug of trying to create insertions/deletions on the edge of contigs. 
-- Added integration test using MT that previously failed --- .../haplotypecaller/DeBruijnAssembler.java | 2 +- .../haplotypecaller/GenotypingEngine.java | 54 ++++++++++--------- .../HaplotypeCallerIntegrationTest.java | 8 ++- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 0a552c0a1..1e447240b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -430,7 +430,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { * @param refWithPadding the full reference byte array with padding which encompasses the active region * @return a haplotype fully extended to encompass the active region */ - @Requires({"haplotype != null", "activeRegionStart > 0", "refWithPadding != null", "refWithPadding.length > 0"}) + @Requires({"haplotype != null", "activeRegionStart >= 0", "refWithPadding != null", "refWithPadding.length > 0"}) @Ensures({"result != null", "result.getCigar() != null"}) private Haplotype extendPartialHaplotype( final Haplotype haplotype, final int activeRegionStart, final byte[] refWithPadding ) { final Cigar cigar = haplotype.getCigar(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 34a6ddfa6..1cfc65581 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -710,24 +710,26 @@ public class GenotypingEngine { switch( ce.getOperator() ) { case I: { - final List 
insertionAlleles = new ArrayList(); - final int insertionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) ) { - insertionAlleles.add( Allele.create(refByte, true) ); - } - if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele - insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); - } else { - byte[] insertionBases = new byte[]{}; - insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base - insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength )); - if( BaseUtils.isAllRegularBases(insertionBases) ) { - insertionAlleles.add( Allele.create(insertionBases, false) ); + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final List insertionAlleles = new ArrayList(); + final int insertionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) ) { + insertionAlleles.add( Allele.create(refByte, true) ); + } + if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele + insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); + } else { + byte[] insertionBases = new byte[]{}; + insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base + insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength )); + if( BaseUtils.isAllRegularBases(insertionBases) ) { + insertionAlleles.add( Allele.create(insertionBases, false) ); + } + } + if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele + vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, 
refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); } - } - if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele - vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); } alignmentPos += elementLength; break; @@ -739,14 +741,16 @@ public class GenotypingEngine { } case D: { - final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base - final List deletionAlleles = new ArrayList(); - final int deletionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { - deletionAlleles.add( Allele.create(deletionBases, true) ); - deletionAlleles.add( Allele.create(refByte, false) ); - vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base + final List deletionAlleles = new ArrayList(); + final int deletionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { + deletionAlleles.add( Allele.create(deletionBases, true) ); + deletionAlleles.add( Allele.create(refByte, false) ); + vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); + } } refPos += elementLength; break; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 42eb09e6e..a9898b567 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -58,6 +58,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; + final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam"; final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; private void HCTest(String bam, String args, String md5) { @@ -76,7 +77,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { HCTest(NA12878_BAM, "", "b3bffabb7aafd43e0339958395e6aa10"); } - @Test(enabled = false) + @Test(enabled = false) // can't annotate the rsID's yet public void testHaplotypeCallerSingleSampleWithDbsnp() { HCTest(NA12878_BAM, "-D " + b37dbSNP132, ""); } @@ -98,6 +99,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "852623c93feef5e62fcb555beedc8c53"); } + @Test + public void testHaplotypeCallerInsertionOnEdgeOfContig() { + HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "e6f7bbab7cf96cbb25837b7a94bf0f82"); + } + // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper // was modifying the GATKSamRecord and that was screwing up the traversal engine from map call to // map call. So the test is there for consistency but not for correctness. 
I'm not sure we can trust From 0cf5d30dacbff0a4e69baa2192b68a68ddca44fe Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 13 Mar 2013 15:51:01 -0400 Subject: [PATCH 066/226] Bug fix in assembly for edge case in which the extendPartialHaplotype function was filling in deletions in the middle of haplotypes. --- .../haplotypecaller/DeBruijnAssembler.java | 24 ++++++++++--------- .../walkers/haplotypecaller/KBestPaths.java | 20 +--------------- ...lexAndSymbolicVariantsIntegrationTest.java | 4 ++-- .../HaplotypeCallerIntegrationTest.java | 12 +++++----- 4 files changed, 22 insertions(+), 38 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 1e447240b..566605a8c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -438,7 +438,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { byte[] newHaplotypeBases = haplotype.getBases(); int refPos = activeRegionStart; int hapPos = 0; - for( CigarElement ce : cigar.getCigarElements() ) { + for( int iii = 0; iii < cigar.getCigarElements().size(); iii++ ) { + final CigarElement ce = cigar.getCigarElement(iii); switch (ce.getOperator()) { case M: refPos += ce.getLength(); @@ -450,16 +451,17 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { newCigar.add(ce); break; case D: - refPos += ce.getLength(); - newCigar.add(ce); - break; - case X: - newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos), - ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()), - Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length))); - refPos += ce.getLength(); - hapPos += ce.getLength(); - newCigar.add(new 
CigarElement(ce.getLength(), CigarOperator.M)); + if( iii == 0 || iii == cigar.getCigarElements().size() - 1 ) { + newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos), + ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()), + Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length))); + hapPos += ce.getLength(); + refPos += ce.getLength(); + newCigar.add(new CigarElement(ce.getLength(), CigarOperator.M)); + } else { + refPos += ce.getLength(); + newCigar.add(ce); + } break; default: throw new IllegalStateException("Unsupported cigar operator detected: " + ce.getOperator()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java index 90c2e6a2a..e97fdb3cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -310,31 +310,13 @@ public class KBestPaths { if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference returnCigar.add(new CigarElement(1, CigarOperator.N)); } else { - int skipElement = -1; - if( fromVertex == null ) { - for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { - final CigarElement ce = swCigar.getCigarElement(iii); - if( ce.getOperator().equals(CigarOperator.D) ) { - skipElement = iii; - break; - } - } - } else if (toVertex == null ) { - for( int iii = swCigar.numCigarElements() - 1; iii >= 0; iii-- ) { - final CigarElement ce = swCigar.getCigarElement(iii); - if( ce.getOperator().equals(CigarOperator.D) ) { - skipElement = iii; - break; - } - } - } for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { // now we need to remove the padding from the cigar string int length = swCigar.getCigarElement(iii).getLength(); if( iii == 0 ) { length 
-= padding.length; } if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; } if( length > 0 ) { - returnCigar.add(new CigarElement(length, (skipElement == iii ? CigarOperator.X : swCigar.getCigarElement(iii).getOperator()))); + returnCigar.add(new CigarElement(length, swCigar.getCigarElement(iii).getOperator())); } } if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index fcf9168b3..72e06ddc6 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -87,12 +87,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "417174e043dbb8b86cc3871da9b50536"); + "fd3412030628fccf77effdb1ec03dce7"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "9563e3c1eee2ef46afc7822af0bb58a8"); + "633e8930a263e34def5e097889dd9805"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a9898b567..fb267297f 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "4a2880f0753e6e813b9e0c35209b3708"); + HCTest(CEUTRIO_BAM, "", "694d6ea7f0f305854d4108379d68de75"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "b3bffabb7aafd43e0339958395e6aa10"); + HCTest(NA12878_BAM, "", "995501d8af646af3b6eaa4109e2fb4a0"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "fa1b92373c89d2238542a319ad25c257"); + "627124af27dc4556d83df1a04e4b9f97"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "852623c93feef5e62fcb555beedc8c53"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "205fc8647b908c0dab7b5c6d6b78c0c2"); } @Test @@ -118,7 +118,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, 
Arrays.asList("9296f1af6cf1f1cc4b79494eb366e976")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("f1250a8ecd404443dcca20741a74ec4f")); executeTest("HCTestStructuralIndels: ", spec); } @@ -148,7 +148,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("addceb63f5bfa9f11e15335d5bf641e9")); + Arrays.asList("d3eb900eecdafafda3170f67adff42ae")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } From 742a7651e93a323703d28f26dfd3ce8f8441b2e9 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 15 Mar 2013 12:25:21 -0400 Subject: [PATCH 068/226] Further tweaking of test timeouts Increase one timeout, restore others that were only timing out due to the Java crypto lib bug to their original values. 
-DOUBLE timeout for NanoSchedulerUnitTest.testNanoSchedulerInLoop() -REDUCE timeout for EngineFeaturesIntegrationTest to its original value -REDUCE timeout for MaxRuntimeIntegrationTest to its original value -REDUCE timeout for GATKRunReportUnitTest to its original value --- .../sting/gatk/EngineFeaturesIntegrationTest.java | 2 +- .../broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java | 5 ++--- .../sting/utils/nanoScheduler/NanoSchedulerUnitTest.java | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java index 2a9bbeb09..8d0874ea1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -117,7 +117,7 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { // // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type // - @Test(enabled = true, dataProvider = "EngineErrorHandlingTestProvider", timeOut = 300 * 1000 ) + @Test(enabled = true, dataProvider = "EngineErrorHandlingTestProvider", timeOut = 60 * 1000 ) public void testEngineErrorHandlingTestProvider(final EngineErrorHandlingTestProvider cfg) { for ( int i = 0; i < cfg.iterationsToTest; i++ ) { final String root = "-T ErrorThrowing -R " + exampleFASTA; diff --git a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java index 9df768e70..55f9e1f7d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java @@ -39,8 +39,7 @@ import java.util.concurrent.TimeUnit; * */ public class MaxRuntimeIntegrationTest extends WalkerTest { - 
// Assume a ridiculous amount of startup overhead to allow for running these tests on slow farm nodes - private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(300, TimeUnit.SECONDS); + private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(20, TimeUnit.SECONDS); private class MaxRuntimeTestProvider extends TestDataProvider { final long maxRuntime; @@ -69,7 +68,7 @@ public class MaxRuntimeIntegrationTest extends WalkerTest { // // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type // - @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 600 * 1000) + @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 60 * 1000) public void testMaxRuntime(final MaxRuntimeTestProvider cfg) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T PrintReads -R " + hg18Reference diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 5587d32f8..b734ecc96 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -216,7 +216,7 @@ public class NanoSchedulerUnitTest extends BaseTest { nanoScheduler.shutdown(); } - @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! 
DEBUG, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = 2 * NANO_SCHEDULE_MAX_RUNTIME) public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { if ( test.bufferSize > 1) { logger.warn("Running " + test); From a67d8c8dd6f37281b5cf2b30111086436e9ea5ab Mon Sep 17 00:00:00 2001 From: David Roazen Date: Sun, 17 Mar 2013 16:17:29 -0400 Subject: [PATCH 071/226] Bump timeout for MaxRuntimeIntegrationTest Looks like returning this timeout to its original value was a bit too aggressive -- adding 40 seconds to the tolerance limit. --- .../broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java index 55f9e1f7d..e6176dbe8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java @@ -39,7 +39,7 @@ import java.util.concurrent.TimeUnit; * */ public class MaxRuntimeIntegrationTest extends WalkerTest { - private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(20, TimeUnit.SECONDS); + private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); private class MaxRuntimeTestProvider extends TestDataProvider { final long maxRuntime; @@ -68,7 +68,7 @@ public class MaxRuntimeIntegrationTest extends WalkerTest { // // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type // - @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 60 * 1000) + @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 120 * 1000) public void testMaxRuntime(final MaxRuntimeTestProvider cfg) { WalkerTest.WalkerTestSpec spec = new 
WalkerTest.WalkerTestSpec( "-T PrintReads -R " + hg18Reference From 0e9c1913ffd551bf26a8e257c5f0bf412bd15b70 Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Mon, 18 Mar 2013 10:46:27 -0400 Subject: [PATCH 072/226] fix typos in argument docs and in printed output in CoveredByNSamplesSites and rewrite an unaccurate comment --- .../walkers/diagnostics/CoveredByNSamplesSites.java | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java index bff2ace63..92034da70 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java @@ -81,10 +81,10 @@ public class CoveredByNSamplesSites extends RodWalker implem @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Argument(fullName = "minCoverage", shortName = "minCov",doc = "only samples that have covarage bigger then minCoverage will be counted",required = false) + @Argument(fullName = "minCoverage", shortName = "minCov",doc = "only samples that have coverage bigger than minCoverage will be counted",required = false) int minCoverage = 10; - @Argument(fullName = "precentageOfSamples", shortName = "percentage", doc = "only sites where at list percentageOfSamples of the samples have good coverage, will be emited", required = false) + @Argument(fullName = "percentageOfSamples", shortName = "percentage", doc = "only sites where at least percentageOfSamples of the samples have good coverage, will be emitted", required = false) double percentageOfSamples = 0.9; @Override @@ -95,8 +95,6 @@ public class CoveredByNSamplesSites extends RodWalker implem Collection VCs = 
tracker.getValues(variantCollection.variants, context.getLocation()); if ( VCs.size() == 0 ) return null; - if(VCs.size() != 1) - throw new RuntimeException("there are more then one vc: "+VCs.size()); boolean emitSite = false; for(VariantContext vc : VCs){ @@ -135,12 +133,11 @@ public class CoveredByNSamplesSites extends RodWalker implem } /** - * Tell the user the number of sites processed and how many passed. Close out the new intervals file. * - * @param result pair of *the number of sites seen and number of sites passed the filter. + * @param result the number of sites that passed the filter. */ public void onTraversalDone(Integer result) { - logger.info(result+" sites that have "+(percentageOfSamples*100)+"% of the samples with at list "+minCoverage+" coverage.\n"); + logger.info(result+" sites that have "+(percentageOfSamples*100)+"% of the samples with at least "+minCoverage+" coverage.\n"); } From d7bec9eb6e709bbd9234d7cc2ac079cb91d970b2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 18 Mar 2013 12:55:58 -0400 Subject: [PATCH 073/226] AssessNA12878 bugfixes -- @Output isn't required for AssessNA12878 -- Previous version would could non-variant sites in NA12878 that resulted from subsetting a multi-sample VC to NA12878 as CALLED_BUT_NOT_IN_DB sites. Now they are properly skipped -- Bugfix for subsetting samples to NA12878. Previous version wouldn't trim the alleles when subsetting down a multi-sample VCF, so we'd have false FN/FP sites at indels when the multi-sample VCF has alleles that result in the subset for NA12878 having non-trimmed alleles. Fixed and unit tested now. 
--- .../sting/utils/variant/GATKVariantContextUtils.java | 2 +- .../utils/variant/GATKVariantContextUtilsUnitTest.java | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 627bee3ea..dee282056 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -1000,7 +1000,7 @@ public class GATKVariantContextUtils { public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { if ( inputVC == null ) throw new IllegalArgumentException("inputVC cannot be null"); - if ( inputVC.getNAlleles() <= 1 ) + if ( inputVC.getNAlleles() <= 1 || inputVC.isSNP() ) return inputVC; // see whether we need to trim common reference base from all alleles diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index ff42abb23..fcc7c7998 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -694,6 +694,15 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { root.alleles(Arrays.asList(CAref, C)).stop(11).make(), root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); + final Allele threeCopies = Allele.create("GTTTTATTTTATTTTA", true); + final Allele twoCopies = Allele.create("GTTTTATTTTA", true); + final Allele zeroCopies = Allele.create("G", false); + final Allele oneCopies = Allele.create("GTTTTA", false); + tests.add(new Object[]{root.alleles(Arrays.asList(threeCopies, zeroCopies, 
oneCopies)).stop(25).make(), + Arrays.asList( + root.alleles(Arrays.asList(threeCopies, zeroCopies)).stop(25).make(), + root.alleles(Arrays.asList(twoCopies, zeroCopies)).stop(20).make())}); + return tests.toArray(new Object[][]{}); } From bccc9d79e5bb8c63dd6a0b1b08a76a5c944776b5 Mon Sep 17 00:00:00 2001 From: Alec Wysoker Date: Tue, 19 Mar 2013 14:03:40 -0400 Subject: [PATCH 076/226] Clear ReduceReads name cache after each set of reads produced by ReduceReadsStash. Name cache was filling up with names of all reads in entire file, which for large file eventually consumes all of memory. Only keep read name cache for the reads that are together in one variant region, so that a pair of reads within the same variant region will still be joined via read name. Otherwise the ability to connect a read to its mate is lost. Update MD5s in integration test to reflect altered output. Add new integration test that confirms that pair within variant region is joined by read name. --- .../compression/reducereads/ReduceReads.java | 7 ++++++ .../ReduceReadsIntegrationTest.java | 25 +++++++++++++------ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index da9bc1b37..62410d191 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -408,6 +408,13 @@ public class ReduceReads extends ReadWalker, Redu for (GATKSAMRecord compressedRead : stash.compress(readReady)) outputRead(compressedRead); + // We only care about maintaining the link between read pairs if they are in the same variant + // region. 
Since an entire variant region's worth of reads is returned in a single call to + // stash.compress(), the readNameHash can be cleared after the for() loop above. + // The advantage of clearing the hash is that otherwise it holds all reads that have been encountered, + // which can use a lot of memory and cause RR to slow to a crawl and/or run out of memory. + readNameHash.clear(); + } } else stash.add(read); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index adbc65037..0cbd537ed 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -64,6 +64,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam"; final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057"; final String OFFCONTIG_BAM = privateTestDir + "readOffb37contigMT.bam"; + final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam"; final String INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam"; private void RRTest(String testName, String args, String md5) { @@ -74,29 +75,29 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "17908e8515217c4693d303ed68108ccc"); + RRTest("testDefaultCompression ", L, "16d97a47b8dbfae4ea64fbdf522b693c"); } @Test(enabled = true) public void testInsertionsAtEdgeOfConsensus() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o 
%s "; - executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("3103667fc68c3136a8cfa8e22429f94e"))); + executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("f7a9a27c5eaf791b67a768fff960a9e1"))); } @Test(enabled = true) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "497c5e36c2beaad2fcdbd02a0b9c121b"); + RRTest("testMultipleIntervals ", intervals, "8886ba383e21883241b386882e8e5063"); } @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "0ff4142e4d7b6a9a9c76012246ad9e2d"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "54253f25d363852a1182aff33e500b92"); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7890a37444a0e05b902f63a83238ce37"); + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "1d7d2d28900db57dad65a8beef64b8cb"); } @Test(enabled = true) @@ -137,7 +138,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("13c44a9afa92ae728bf55b7075cc5de3"))); + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("81312c31b9910a42bff6acb5167592ab"))); } /** @@ -147,8 +148,18 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testReadOffContig() { String base = 
String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("c57cd191dc391983131be43f6cc2e381"))); + executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("b4dc66445ddf5f467f67860bed023ef8"))); } + + /** + * Confirm that if both ends of pair are in same variant region, compressed names of both ends of pair are the same. + */ + @Test(enabled = true) + public void testPairedReadsInVariantRegion() { + String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + + " -o %s --downsample_coverage 250 -dcov 50 "; + executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("9bed260b6245f5ff47db8541405504aa"))); + } } From 95a9ed853dc4ee1642a581decfe8c1a6e5a2ecdd Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Fri, 15 Mar 2013 13:26:05 -0400 Subject: [PATCH 077/226] Made some documentation updates & fixes --Mostly doc block tweaks --Added @DocumentedGATKFeature to some walkers that were undocumented because they were ending up in "uncategorized". Very important for GSA: if a walker is in public or protected, it HAS to be properly tagged-in. If it's not ready for the public, it should be in private. 
--- .../gatk/walkers/annotator/Coverage.java | 3 ++- .../gatk/walkers/annotator/FisherStrand.java | 13 ++++++++---- .../gatk/walkers/annotator/GCContent.java | 2 +- .../bqsr/RecalibrationArgumentCollection.java | 6 +++--- .../bqsr/RecalibrationPerformance.java | 14 +++++++++++-- .../targets/BaseCoverageDistribution.java | 14 ++++++++----- .../sting/utils/recalibration/RecalUtils.java | 2 +- .../walkers/coverage/DepthOfCoverage.java | 2 +- .../diagnostics/CoveredByNSamplesSites.java | 14 +++++++++---- .../variantutils/GenotypeConcordance.java | 20 +++++++++++-------- 10 files changed, 60 insertions(+), 30 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java index 5138ac9af..5c48417ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java @@ -70,10 +70,11 @@ import java.util.Map; /** * Total (unfiltered) depth over all samples. * - * While the sample-level (FORMAT) DP field describes the total depth of reads that passed the Unified Genotyper's + *

        While the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth * over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for * N samples with -dcov D is N * D + *

        */ public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 39fdcb707..7960a3ce2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -69,10 +69,15 @@ import java.util.*; /** - * Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation - * being seen on only the forward or only the reverse strand) in the reads? More bias is - * indicative of false positive calls. Note that the fisher strand test may not be - * calculated for certain complex indel cases or for multi-allelic sites. + * Phred-scaled p-value using Fisher's Exact Test to detect strand bias + * + *

        Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation + * being seen on only the forward or only the reverse strand) in the reads. More bias is + * indicative of false positive calls. + *

        + * + *

        Caveat

        + *

        The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.

        */ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private final static Logger logger = Logger.getLogger(FisherStrand.class); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index a4b1b1b49..827e39c11 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -68,7 +68,7 @@ import java.util.Map; /** - * GC content of the reference around this site + * GC content of the reference around the given site * *

        The GC content is the number of GC bases relative to the total number of bases (# GC bases / # all bases) around this site on the reference.

        * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index 447569643..0a4899f1c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -61,7 +61,7 @@ import java.util.List; * User: rpoplin * Date: Nov 27, 2009 * - * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker. + * A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker. * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated. */ @@ -131,14 +131,14 @@ public class RecalibrationArgumentCollection { public boolean RUN_WITHOUT_DBSNP = false; /** - * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the + * BaseRecalibrator accepts a --solid_recal_mode flag which governs how the recalibrator handles the * reads which have had the reference inserted because of color space inconsistencies. */ @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO; /** - * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles + * BaseRecalibrator accepts a --solid_nocall_strategy flag which governs how the recalibrator handles * no calls in the color space tag. 
Unfortunately because of the reference inserted bases mentioned above, reads with no calls in * their color space tag can not be recalibrated. */ diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java index d0af08d90..271617059 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.*; @@ -55,18 +56,27 @@ import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.recalibration.*; import java.io.*; /** + * Evaluate the performance of the base recalibration process + * + *

        This tool aims to evaluate the results of the Base Quality Score Recalibration (BQSR) process.

        + * + *

        Caveat

        + *

        This tool is currently experimental. We do not provide documentation nor support for its operation.

        + * */ - +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) @PartitionBy(PartitionType.READ) public class RecalibrationPerformance extends RodWalker implements NanoSchedulable { - @Output(doc="Write output to this file") + @Output public PrintStream out; @Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file") diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java index b70581dd3..53b7cebaa 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -55,6 +56,8 @@ import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; import java.io.PrintStream; import java.util.ArrayList; @@ -63,11 +66,11 @@ import 
java.util.LinkedList; import java.util.Map; /** - * Simple walker to plot the coverage distribution per base. + * Simple walker to plot the coverage distribution per base * *

        * Features of this walker: - *

      3. includes a smart counting of uncovered bases without visiting the uncovered loci.
      4. + *
      5. includes a smart counting of uncovered bases without visiting the uncovered loci
      6. *
      7. includes reads with deletions in the loci (optionally can be turned off)
      8. *

        * @@ -91,10 +94,11 @@ import java.util.Map; * -fd \ * -o report.grp *
        - * User: carneiro - * Date: 1/27/13 - * Time: 11:16 AM + * + * @author carneiro + * @since 1/27/13 */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class BaseCoverageDistribution extends LocusWalker, Map>> { /** * The output GATK Report table diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index ce2869e94..ae6b56e19 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -82,7 +82,7 @@ import java.util.*; * * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. - * This class holds the parsing methods that are shared between CountCovariates and TableRecalibration. + * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads. 
*/ public class RecalUtils { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 61574d947..29016af43 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -117,7 +117,7 @@ import java.util.*; // todo -- alter logarithmic scaling to spread out bins more // todo -- allow for user to set linear binning (default is logarithmic) // todo -- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @By(DataSource.REFERENCE) @PartitionBy(PartitionType.NONE) @Downsample(by= DownsampleType.NONE, toCoverage=Integer.MAX_VALUE) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java index 92034da70..506ef2c72 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java @@ -29,12 +29,15 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -44,12 +47,15 @@ import java.io.*; import java.util.Collection; /** - * print intervals file with all the variant sites that have "most" ( >= 90% by default) of the samples with "good" (>= 10 by default)coverage ("most" and "good" can be set in the command line). + * Print intervals file with all the variant sites for which most of the samples have good coverage * *

        - * CoveredByNSamplesSites is a GATK tool for filter out sites based on their coverage. + * CoveredByNSamplesSites is a GATK tool for filtering out sites based on their coverage. * The sites that pass the filter are printed out to an intervals file. * + * See argument defaults for what constitutes "most" samples and "good" coverage. These parameters can be modified from the command line. + *

        + * *

        Input

        *

        * A variant file and optionally min coverage and sample percentage values. @@ -60,7 +66,7 @@ import java.util.Collection; * An intervals file. *

        * - *

        Examples

        + *

        Example

        *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        @@ -71,7 +77,7 @@ import java.util.Collection;
          * 
        * */ - +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @By(DataSource.REFERENCE_ORDERED_DATA) public class CoveredByNSamplesSites extends RodWalker implements TreeReducible { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java index 048c7ef77..35213af34 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -33,6 +34,8 @@ import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFHeader; @@ -41,29 +44,30 @@ import java.io.PrintStream; import java.util.*; /** - * A simple walker for performing genotype concordance calculations between two callsets. Outputs a GATK table with - * per-sample and aggregate counts and frequencies, a summary table for NRD/NRS, and a table for site allele overlaps. 
+ * Genotype concordance (per-sample and aggregate counts and frequencies, NRD/NRS and site allele overlaps) between two callsets * *

        - * Genotype concordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, + * GenotypeConcordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles, * and for each sample, the genotype-by-genotype counts (for instance, the number of sites at which a sample was * called homozygous reference in the EVAL callset, but homozygous variant in the COMP callset). It outputs these * counts as well as convenient proportions (such as the proportion of het calls in the EVAL which were called REF in * the COMP) and metrics (such as NRD and NRS). * - *

        INPUT

        + *

        Input

        *

        * Genotype concordance requires two callsets (as it does a comparison): an EVAL and a COMP callset, specified via - * the -eval and -comp arguments - *

        + * the -eval and -comp arguments. + * * (Optional) Jexl expressions for genotype-level filtering of EVAL or COMP genotypes, specified via the -gfe and * -cfe arguments, respectively. + *

        * - *

        OUTPUT

        - * Genotype Concordance writes a GATK report to the specified (via -o) file, consisting of multiple tables of counts + *

        Output

        + * Genotype Concordance writes a GATK report to the specified file (via -o) , consisting of multiple tables of counts * and proportions. These tables may be optionally moltenized via the -moltenize argument. * */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> { /** From 6b4d88ebe96d3383a0778c6f8a3bbf6bd88ccaee Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Fri, 15 Mar 2013 16:34:29 -0400 Subject: [PATCH 078/226] Created ListAnnotations utility (extends CommandLineProgram) --Refactored listAnnotations basic method out of VA into HelpUtils --HelpUtils.listAnnotations() is now called by both VA and the new ListAnnotations utility (lives in sting.tools) --This way we keep the VA --list option but we also offer a way to list annotations without a full valid VA command-line, which was a pain users continually complained about --We could get rid of the VA --list option altogether ...? 
--- .../walkers/annotator/VariantAnnotator.java | 30 ++----- .../sting/tools/CatVariants.java | 15 ++-- .../sting/tools/ListAnnotations.java | 85 +++++++++++++++++++ .../sting/utils/help/HelpConstants.java | 1 + .../sting/utils/help/HelpUtils.java | 29 +++++++ 5 files changed, 129 insertions(+), 31 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 330d29c79..301baaba3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -36,10 +36,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.help.HelpUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -47,7 +47,6 @@ import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import java.util.*; - /** * Annotates variant calls with context information. * @@ -165,7 +164,7 @@ public class VariantAnnotator extends RodWalker implements Ann protected Boolean USE_ALL_ANNOTATIONS = false; /** - * Note that the --list argument requires a fully resolved and correct command-line to work. 
+ * Note that the --list argument requires a fully resolved and correct command-line to work. As a simpler alternative, you can use ListAnnotations (see Help Utilities). */ @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit", required=false) protected Boolean LIST = false; @@ -177,7 +176,7 @@ public class VariantAnnotator extends RodWalker implements Ann protected Boolean ALWAYS_APPEND_DBSNP_ID = false; public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; } - @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio") + @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality threshold in order to annotate mendelian violation ratio") public double minGenotypeQualityP = 0.0; @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp tracks that exactly match both reference and alternate alleles will be counted as concordant", required=false) @@ -185,33 +184,14 @@ public class VariantAnnotator extends RodWalker implements Ann private VariantAnnotatorEngine engine; - - private void listAnnotationsAndExit() { - System.out.println("\nStandard annotations in the list below are marked with a '*'."); - List> infoAnnotationClasses = new PluginManager(InfoFieldAnnotation.class).getPlugins(); - System.out.println("\nAvailable annotations for the VCF INFO field:"); - for (int i = 0; i < infoAnnotationClasses.size(); i++) - System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(infoAnnotationClasses.get(i)) ? 
"*" : "") + infoAnnotationClasses.get(i).getSimpleName()); - System.out.println(); - List> genotypeAnnotationClasses = new PluginManager(GenotypeAnnotation.class).getPlugins(); - System.out.println("\nAvailable annotations for the VCF FORMAT field:"); - for (int i = 0; i < genotypeAnnotationClasses.size(); i++) - System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(genotypeAnnotationClasses.get(i)) ? "*" : "") + genotypeAnnotationClasses.get(i).getSimpleName()); - System.out.println(); - System.out.println("\nAvailable classes/groups of annotations:"); - for ( Class c : new PluginManager(AnnotationType.class).getInterfaces() ) - System.out.println("\t" + c.getSimpleName()); - System.out.println(); - System.exit(0); - } - /** * Prepare the output file and the list of available features. */ public void initialize() { if ( LIST ) - listAnnotationsAndExit(); + HelpUtils.listAnnotations(); + System.exit(0); // get the list of all sample names from the variant VCF input rod, if applicable List rodName = Arrays.asList(variantCollection.variants.getName()); diff --git a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java index e1dd2c255..ad77b2548 100644 --- a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java +++ b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java @@ -35,7 +35,6 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.CommandLineProgram; -import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.variant.bcf2.BCF2Codec; @@ -54,7 +53,7 @@ import java.util.*; /** * - * Concatenates VCF files of non-overlapped genome intervals, all with the same set of 
samples. + * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples * *

        * The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. @@ -80,10 +79,14 @@ import java.util.*; * A combined VCF. The output file should be 'name.vcf' or 'name.VCF'. * <\p> * + *

        Important note

        + *

        This is a command-line utility that bypasses the GATK engine. As a result, the command-line you must use to + * invoke it is a little different from other GATK tools (see example below), and it does not accept any of the + * classic "CommandLineGATK" arguments.

        * - *

        Examples

        + *

        Example

        *
        - * java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants \
        + * java -cp GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants \
          *    -R ref.fasta \
          *    -V input1.vcf \
          *    -V input2.vcf \
        @@ -95,7 +98,7 @@ import java.util.*;
          * @since Jan 2012
          */
         
        -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
        +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP )
         public class CatVariants extends CommandLineProgram {
             // setup the logging system, used by some codecs
             private static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger();
        @@ -124,7 +127,7 @@ public class CatVariants extends CommandLineProgram {
              * print usage information
              */
             private static void printUsage() {
        -        System.err.println("Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.AppendVariants    [sorted (optional)]");
        +        System.err.println("Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.CatVariants    [sorted (optional)]");
                 System.err.println("    The input files can be of type: VCF (ends in .vcf or .VCF)");
                 System.err.println("                                    BCF2 (ends in .bcf or .BCF)");
                 System.err.println("    Output file must be vcf or bcf file (.vcf or .bcf)");
        diff --git a/public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java b/public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java
        new file mode 100644
        index 000000000..fabcf828a
        --- /dev/null
        +++ b/public/java/src/org/broadinstitute/sting/tools/ListAnnotations.java
        @@ -0,0 +1,85 @@
        +/*
        +* Copyright (c) 2012 The Broad Institute
        +* 
        +* Permission is hereby granted, free of charge, to any person
        +* obtaining a copy of this software and associated documentation
        +* files (the "Software"), to deal in the Software without
        +* restriction, including without limitation the rights to use,
        +* copy, modify, merge, publish, distribute, sublicense, and/or sell
        +* copies of the Software, and to permit persons to whom the
        +* Software is furnished to do so, subject to the following
        +* conditions:
        +* 
        +* The above copyright notice and this permission notice shall be
        +* included in all copies or substantial portions of the Software.
        +* 
        +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
        +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
        +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
        +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
        +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
        +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
        +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
        +* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
        +*/
        +
        +package org.broadinstitute.sting.tools;
        +
        +import org.broadinstitute.sting.commandline.CommandLineProgram;
        +import org.broadinstitute.sting.utils.exceptions.UserException;
        +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
        +import org.broadinstitute.sting.utils.help.HelpConstants;
        +import org.broadinstitute.sting.utils.help.HelpUtils;
        +
        +/**
        + * Utility program to print a list of available annotations
        + *
        + * 

        This is a very simple utility tool that retrieves available annotations for use with tools such as + * UnifiedGenotyper, HaplotypeCaller and VariantAnnotator.

        + * + *

        Important note

        + *

        This is a command-line utility that bypasses the GATK engine. As a result, the command-line you must use to + * invoke it is a little different from other GATK tools (see usage below), and it does not accept any of the + * classic "CommandLineGATK" arguments.

        + * + *

        Usage

        + *
        java -cp GenomeAnalysisTK.jar org.broadinstitute.sting.tools.ListAnnotations
        + * + * @author vdauwera + * @since 3/14/13 + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_HELPUTILS ) +public class ListAnnotations extends CommandLineProgram { + + /* + * Print usage information + * + * TODO: would be more convenient if we could just call the program by name instead of the full classpath + */ + private static void printUsage() { + System.err.println("Usage: java -cp dist/GenomeAnalysisTK.jar org.broadinstitute.sting.tools.ListAnnotations"); + System.err.println(" Prints a list of available annotations and exits."); + } + + // TODO: override CommandLineProgram bit that offers version, logging etc arguments. We don't need that stuff here and it makes the doc confusing. + + @Override + protected int execute() throws Exception { + + HelpUtils.listAnnotations(); + return 0; + } + + public static void main(String[] args){ + try { + ListAnnotations instance = new ListAnnotations(); + start(instance, args); + System.exit(CommandLineProgram.result); + } catch ( UserException e ) { + printUsage(); + exitSystemWithUserError(e); + } catch ( Exception e ) { + exitSystemWithError(e); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java index f99ff7538..2ed35d848 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java @@ -56,6 +56,7 @@ public class HelpConstants { public final static String DOCS_CAT_VARDISC = "Variant Discovery Tools"; public final static String DOCS_CAT_VARMANIP = "Variant Evaluation and Manipulation Tools"; public final static String DOCS_CAT_TEST = "Testing Tools"; + public final static String DOCS_CAT_HELPUTILS = "Help Utilities"; public static String forumPost(String post) { return GATK_FORUM_URL + post; diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java 
b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java index 81606d2f3..9a23fd022 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -28,9 +28,15 @@ package org.broadinstitute.sting.utils.help; import com.sun.javadoc.FieldDoc; import com.sun.javadoc.PackageDoc; import com.sun.javadoc.ProgramElementDoc; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; import java.lang.reflect.Field; +import java.util.List; public class HelpUtils { @@ -70,4 +76,27 @@ public class HelpUtils { String.format("%s", doc.name()); } + /** + * Simple method to print a list of available annotations. + */ + public static void listAnnotations() { + System.out.println("\nThis is a list of available Variant Annotations for use with tools such as UnifiedGenotyper, HaplotypeCaller and VariantAnnotator. Please see the Technical Documentation for more details about these annotations:"); + System.out.println("http://www.broadinstitute.org/gatk/gatkdocs/"); + System.out.println("\nStandard annotations in the list below are marked with a '*'."); + List> infoAnnotationClasses = new PluginManager(InfoFieldAnnotation.class).getPlugins(); + System.out.println("\nAvailable annotations for the VCF INFO field:"); + for (int i = 0; i < infoAnnotationClasses.size(); i++) + System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(infoAnnotationClasses.get(i)) ? 
"*" : "") + infoAnnotationClasses.get(i).getSimpleName()); + System.out.println(); + List> genotypeAnnotationClasses = new PluginManager(GenotypeAnnotation.class).getPlugins(); + System.out.println("\nAvailable annotations for the VCF FORMAT field:"); + for (int i = 0; i < genotypeAnnotationClasses.size(); i++) + System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(genotypeAnnotationClasses.get(i)) ? "*" : "") + genotypeAnnotationClasses.get(i).getSimpleName()); + System.out.println(); + System.out.println("\nAvailable classes/groups of annotations:"); + for ( Class c : new PluginManager(AnnotationType.class).getInterfaces() ) + System.out.println("\t" + c.getSimpleName()); + System.out.println(); + } + } \ No newline at end of file From d70bf647379cc8f1eb9a2018d539ec88a86f4702 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Fri, 15 Mar 2013 16:41:14 -0400 Subject: [PATCH 079/226] Created new DeprecatedToolChecks class --Based on existing code in GenomeAnalysisEngine --Hashmaps hold mapping of deprecated tool name to version number and recommended replacement (if any) --Using FastUtils for maps; specifically Object2ObjectMap but there could be a better type for Strings... 
--Added user exception for deprecated annotations --Added deprecation check to AnnotationInterfaceManager.validateAnnotations --Run when annotations are initialized --Made annotation sets instead of lists --- .../sting/gatk/GenomeAnalysisEngine.java | 41 ++------ .../walkers/annotator/VariantAnnotator.java | 6 +- .../annotator/VariantAnnotatorEngine.java | 2 +- .../AnnotationInterfaceManager.java | 12 ++- .../sting/utils/DeprecatedToolChecks.java | 95 +++++++++++++++++++ .../sting/utils/exceptions/UserException.java | 8 +- 6 files changed, 121 insertions(+), 43 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index e45a750ba..2d8b9cd9a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -67,6 +67,9 @@ import java.io.File; import java.util.*; import java.util.concurrent.TimeUnit; +import static org.broadinstitute.sting.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; +import static org.broadinstitute.sting.utils.DeprecatedToolChecks.isDeprecatedWalker; + /** * A GenomeAnalysisEngine that runs a specified walker. */ @@ -288,40 +291,6 @@ public class GenomeAnalysisEngine { //return result; } - // TODO -- Let's move this to a utility class in unstable - but which one? 
- // ************************************************************************************** - // * Handle Deprecated Walkers * - // ************************************************************************************** - - // Mapping from walker name to major version number where the walker first disappeared - private static Map deprecatedGATKWalkers = new HashMap(); - static { - deprecatedGATKWalkers.put("CountCovariates", "2.0"); - deprecatedGATKWalkers.put("TableRecalibration", "2.0"); - deprecatedGATKWalkers.put("AlignmentWalker", "2.2"); - deprecatedGATKWalkers.put("CountBestAlignments", "2.2"); - } - - /** - * Utility method to check whether a given walker has been deprecated in a previous GATK release - * - * @param walkerName the walker class name (not the full package) to check - */ - public static boolean isDeprecatedWalker(final String walkerName) { - return deprecatedGATKWalkers.containsKey(walkerName); - } - - /** - * Utility method to check whether a given walker has been deprecated in a previous GATK release - * - * @param walkerName the walker class name (not the full package) to check - */ - public static String getDeprecatedMajorVersionNumber(final String walkerName) { - return deprecatedGATKWalkers.get(walkerName); - } - - // ************************************************************************************** - /** * Retrieves an instance of the walker based on the walker name. * @@ -333,7 +302,7 @@ public class GenomeAnalysisEngine { return walkerManager.createByName(walkerName); } catch ( UserException e ) { if ( isDeprecatedWalker(walkerName) ) { - e = new UserException.DeprecatedWalker(walkerName, getDeprecatedMajorVersionNumber(walkerName)); + e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName)); } throw e; } @@ -565,6 +534,8 @@ public class GenomeAnalysisEngine { if ( intervals != null && intervals.isEmpty() ) { logger.warn("The given combination of -L and -XL options results in an empty set. 
No intervals to process."); } + + // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 301baaba3..f2bd6c14c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -44,6 +44,7 @@ import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import java.util.*; @@ -155,7 +156,7 @@ public class VariantAnnotator extends RodWalker implements Ann * If multiple records in the rod overlap the given position, one is chosen arbitrarily. */ @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false) - protected List expressionsToUse = new ArrayList(); + protected Set expressionsToUse = new ObjectOpenHashSet(); /** * Note that the -XL argument can be used along with this one to exclude annotations. 
@@ -189,9 +190,10 @@ public class VariantAnnotator extends RodWalker implements Ann */ public void initialize() { - if ( LIST ) + if ( LIST ) { HelpUtils.listAnnotations(); System.exit(0); + } // get the list of all sample names from the variant VCF input rod, if applicable List rodName = Arrays.asList(variantCollection.variants.getName()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index c5703afc8..695868bb1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -104,7 +104,7 @@ public class VariantAnnotatorEngine { } // select specific expressions to use - public void initializeExpressions(List expressionsToUse) { + public void initializeExpressions(Set expressionsToUse) { // set up the expressions for ( String expression : expressionsToUse ) requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java index 221887158..59b4b1b3b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotationInterfaceManager.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; +import org.broadinstitute.sting.utils.DeprecatedToolChecks; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -58,7 +59,7 @@ public class AnnotationInterfaceManager { if ( interfaceClass == 
null ) interfaceClass = classMap.get(group + "Annotation"); if ( interfaceClass == null ) - throw new UserException.BadArgumentValue("group", "Class " + group + " is not found; please check that you have specified the class name correctly"); + throw new UserException.BadArgumentValue("group", "Annotation group " + group + " was not found; please check that you have specified the group name correctly"); } } @@ -67,8 +68,13 @@ public class AnnotationInterfaceManager { Class annotationClass = classMap.get(annotation); if ( annotationClass == null ) annotationClass = classMap.get(annotation + "Annotation"); - if ( annotationClass == null ) - throw new UserException.BadArgumentValue("annotation", "Class " + annotation + " is not found; please check that you have specified the class name correctly"); + if ( annotationClass == null ) { + if (DeprecatedToolChecks.isDeprecatedAnnotation(annotation) ) { + throw new UserException.DeprecatedAnnotation(annotation, DeprecatedToolChecks.getAnnotationDeprecationInfo(annotation)); + } else { + throw new UserException.BadArgumentValue("annotation", "Annotation " + annotation + " was not found; please check that you have specified the annotation name correctly"); + } + } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java b/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java new file mode 100644 index 000000000..e20872c5b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java @@ -0,0 +1,95 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is 
furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils; + +import it.unimi.dsi.fastutil.objects.Object2ObjectMap; +import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; + +import java.util.*; + +/** + * Utility class for handling deprecated tools gracefully + * + * @author vdauwera + * @since 3/11/13 + */ +public class DeprecatedToolChecks { + + // Mapping from walker name to major version number where the walker first disappeared and optional replacement options + private static Object2ObjectMap deprecatedGATKWalkers = new Object2ObjectOpenHashMap(); + static { + // Indicate recommended replacement in parentheses if applicable + deprecatedGATKWalkers.put("CountCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)"); + deprecatedGATKWalkers.put("AnalyzeCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)"); + deprecatedGATKWalkers.put("TableRecalibration", "2.0 (use PrintReads with -BQSR instead; see documentation for usage)"); + deprecatedGATKWalkers.put("AlignmentWalker", "2.2 (no replacement)"); + deprecatedGATKWalkers.put("CountBestAlignments", "2.2 (no replacement)"); + } + + // Mapping from walker name to major version number where the walker first disappeared and optional replacement options + private static Object2ObjectMap 
deprecatedGATKAnnotations = new Object2ObjectOpenHashMap(); + static { + // Same comments as for walkers + deprecatedGATKAnnotations.put("DepthOfCoverage", "2.4 (renamed to Coverage)"); + } + + /** + * Utility method to check whether a given walker has been deprecated in a previous GATK release + * + * @param walkerName the walker class name (not the full package) to check + */ + public static boolean isDeprecatedWalker(final String walkerName) { + return deprecatedGATKWalkers.containsKey(walkerName); + } + + /** + * Utility method to check whether a given annotation has been deprecated in a previous GATK release + * + * @param annotationName the annotation class name (not the full package) to check + */ + public static boolean isDeprecatedAnnotation(final String annotationName) { + return deprecatedGATKAnnotations.containsKey(annotationName); + } + + /** + * Utility method to pull up the version number at which a walker was deprecated and the suggested replacement, if any + * + * @param walkerName the walker class name (not the full package) to check + */ + public static String getWalkerDeprecationInfo(final String walkerName) { + return deprecatedGATKWalkers.get(walkerName).toString(); + } + + /** + * Utility method to pull up the version number at which an annotation was deprecated and the suggested replacement, if any + * + * @param annotationName the annotation class name (not the full package) to check + */ + public static String getAnnotationDeprecationInfo(final String annotationName) { + return deprecatedGATKAnnotations.get(annotationName).toString(); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index b3c5bd2c7..fcc132ffe 100644 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -371,14 +371,18 @@ public class 
UserException extends ReviewedStingException { } } - - public static class DeprecatedWalker extends UserException { public DeprecatedWalker(String walkerName, String version) { super(String.format("Walker %s is no longer available in the GATK; it has been deprecated since version %s", walkerName, version)); } } + public static class DeprecatedAnnotation extends UserException { + public DeprecatedAnnotation(String annotationName, String version) { + super(String.format("Annotation %s is no longer available in the GATK; it has been deprecated since version %s", annotationName, version)); + } + } + public static class CannotExecuteQScript extends UserException { public CannotExecuteQScript(String message) { super(String.format("Unable to execute QScript: " + message)); From ea01dbf1309b56657477c6f0886b577fe0844be3 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 19 Mar 2013 15:26:50 -0400 Subject: [PATCH 080/226] Fix to issue encountered when running HaplotypeCaller in GGA mode with data from other 1000G callers. In particular, someone produced a tandem repeat site with 57 alt alleles (sic) which made the caller blow up. Inelegant fix is to detect if # of alleles is > our max cached capacity, and if so, emit an informative warning and skip site. -- Added unit test to UG engine to cover this case. -- Commit to posterity private scala script currently used for 1000G indel consensus (still very much subject to changes). 
GSA-878 #resolve --- .../genotyper/UnifiedGenotyperEngine.java | 29 +++++++++++++++++++ .../UnifiedGenotyperEngineUnitTest.java | 25 ++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 1d0c10795..4259dbdb6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -385,11 +385,23 @@ public class UnifiedGenotyperEngine { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; + // TODO TODO TODO TODO + // REFACTOR THIS FUNCTION, TOO UNWIELDY!! + // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); } + // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call + if (!canVCbeGenotyped(vc)) { + if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext) + return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext); + else + return null; + + } + // estimate our confidence in a reference call and return if ( vc.getNSamples() == 0 ) { if ( limitedContext ) @@ -544,6 +556,23 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); } + /** + * Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed. 
+ * @param vc Input VC + * @return Status check + */ + @Requires("vc != null") + protected boolean canVCbeGenotyped(final VariantContext vc) { + // protect against too many alternate alleles that we can't even run AF on: + if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) { + logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + + " alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart()); + return false; + } + else return true; + + } + private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { if ( !BaseUtils.isRegularBase(refContext.getBase()) ) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java index 23596db83..657cd9c0c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java @@ -50,10 +50,16 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; // the imports for unit testing. 
+import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; @@ -102,4 +108,23 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest { Assert.assertTrue(MathUtils.goodLog10Probability(ref), "Reference calculation wasn't a well formed log10 prob " + ref); Assert.assertEquals(ref, expected, TOLERANCE, "Failed reference confidence for single sample"); } + + @Test(enabled=true) + public void testTooManyAlleles() { + + for ( Integer numAltAlleles = 0; numAltAlleles < 100; numAltAlleles++ ) { + + Set alleles = new HashSet(); + alleles.add(Allele.create("A", true)); // ref allele + + for (int len = 1; len <=numAltAlleles; len++) { + // add alt allele of length len+1 + alleles.add(Allele.create(Utils.dupString('A', len + 1), false)); + } + final VariantContext vc = new VariantContextBuilder("test", "chr1", 1000, 1000, alleles).make(); + final boolean result = ugEngine.canVCbeGenotyped(vc); + Assert.assertTrue(result == (vc.getNAlleles()<= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)); + } + } + } \ No newline at end of file From a783f19ab12060084c9811902365d7629b1631ca Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Mar 2013 13:45:53 -0500 Subject: [PATCH 082/226] Fix for potential HaplotypeCaller bug in annotation ordering -- Annotations were being called on VariantContext that might needed to be trimmed. 
Simply inverted the order of operations so trimming occurs before the annotations are added. -- Minor cleanup of call to PairHMM in LikelihoodCalculationEngine --- .../walkers/haplotypecaller/GenotypingEngine.java | 13 ++++++++----- .../LikelihoodCalculationEngine.java | 9 ++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 1cfc65581..400de6485 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -273,16 +273,19 @@ public class GenotypingEngine { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) ); final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); - VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call); + + VariantContext annotatedCall = call; + // TODO -- should be before annotated call, so that QDL works correctly + if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! 
+ annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); + } + + annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, annotatedCall); // maintain the set of all called haplotypes for ( final Allele calledAllele : call.getAlleles() ) calledHaplotypes.addAll(alleleMapper.get(calledAllele)); - if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! - annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); - } - returnCalls.add( annotatedCall ); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a7d85b969..87b488b3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -151,9 +151,12 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 
0 : PairHMM.findFirstPositionWhereHaplotypesDiffer(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), - pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), - readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); + final boolean isFirstHaplotype = jjj == 0; + final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), + read.getReadBases(), readQuals, readInsQuals, readDelQuals, + overallGCP, haplotypeStart, isFirstHaplotype); + + perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); } } return perReadAlleleLikelihoodMap; From 752440707d6005104410ff67f79fe410723df964 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Mar 2013 13:52:53 -0500 Subject: [PATCH 083/226] AlignmentUtils.calcNumDifferentBases computes the number of bases that differ between a reference and read sequence given a cigar between the two. 
--- .../sting/utils/sam/AlignmentUtils.java | 39 +++++++++++++++++++ .../utils/sam/AlignmentUtilsUnitTest.java | 30 +++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index d59d0ef63..58f70d4b6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -48,6 +48,45 @@ public final class AlignmentUtils { // cannot be instantiated private AlignmentUtils() { } + /** + * Get the number of bases at which refSeq and readSeq differ, given their alignment + * + * @param cigar the alignment of readSeq to refSeq + * @param refSeq the bases of the reference sequence + * @param readSeq the bases of the read sequence + * @return the number of bases that differ between refSeq and readSeq + */ + public static int calcNumDifferentBases(final Cigar cigar, final byte[] refSeq, final byte[] readSeq) { + int refIndex = 0, readIdx = 0, delta = 0; + + for (final CigarElement ce : cigar.getCigarElements()) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case X:case EQ:case M: + for (int j = 0; j < elementLength; j++, refIndex++, readIdx++) + delta += refSeq[refIndex] != readSeq[readIdx] ? 
1 : 0; + break; + case I: + delta += elementLength; + case S: + readIdx += elementLength; + break; + case D: + delta += elementLength; + case N: + refIndex += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); + } + } + + return delta; + } + public static class MismatchCount { public int numMismatches = 0; public long mismatchQualities = 0; diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index ae01c6c63..660dadc00 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -37,7 +37,7 @@ import org.testng.annotations.Test; import java.util.*; public class AlignmentUtilsUnitTest { - private final static boolean DEBUG = false; + private final static boolean DEBUG = true; private SAMFileHeader header; /** Basic aligned and mapped read. 
*/ @@ -145,6 +145,34 @@ public class AlignmentUtilsUnitTest { } + @DataProvider(name = "CalcNumDifferentBasesData") + public Object[][] makeCalcNumDifferentBasesData() { + List tests = new ArrayList(); + + tests.add(new Object[]{"5M", "ACGTA", "ACGTA", 0}); + tests.add(new Object[]{"5M", "ACGTA", "ACGTT", 1}); + tests.add(new Object[]{"5M", "ACGTA", "TCGTT", 2}); + tests.add(new Object[]{"5M", "ACGTA", "TTGTT", 3}); + tests.add(new Object[]{"5M", "ACGTA", "TTTTT", 4}); + tests.add(new Object[]{"5M", "ACGTA", "TTTCT", 5}); + tests.add(new Object[]{"2M3I3M", "ACGTA", "ACNNNGTA", 3}); + tests.add(new Object[]{"2M3I3M", "ACGTA", "ACNNNGTT", 4}); + tests.add(new Object[]{"2M3I3M", "ACGTA", "TCNNNGTT", 5}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "ACA", 2}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "ACT", 3}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "TCT", 4}); + tests.add(new Object[]{"2M2D1M", "ACGTA", "TGT", 5}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "CalcNumDifferentBasesData") + public void testCalcNumDifferentBases(final String cigarString, final String ref, final String read, final int expectedDifferences) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + Assert.assertEquals(AlignmentUtils.calcNumDifferentBases(cigar, ref.getBytes(), read.getBytes()), expectedDifferences); + } + + @DataProvider(name = "NumAlignedBasesCountingSoftClips") public Object[][] makeNumAlignedBasesCountingSoftClips() { List tests = new ArrayList(); From a8fb26bf0167147bae2c3896e41be5049dd0bb48 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Mar 2013 21:39:18 -0500 Subject: [PATCH 084/226] A generic downsampler that reduces coverage for a bunch of reads -- Exposed the underlying minElementsPerStack parameter for LevelingDownsampler --- .../gatk/downsampling/DownsamplingUtils.java | 107 ++++++++++++++++++ .../downsampling/LevelingDownsampler.java | 26 ++++- 
.../walkers/readutils/DownsampleReadsQC.java | 105 +++++++++++++++++ 3 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java new file mode 100644 index 000000000..877083829 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Utilities for using the downsamplers for common tasks + * + * User: depristo + * Date: 3/6/13 + * Time: 4:26 PM + */ +public class DownsamplingUtils { + private DownsamplingUtils() { } + + /** + * Level the coverage of the reads in each sample to no more than downsampleTo reads, no reducing + * coverage at any read start to less than minReadsPerAlignmentStart + * + * This algorithm can be used to handle the situation where you have lots of coverage in some interval, and + * want to reduce the coverage of the big peak down without removing the many reads at the edge of this + * interval that are in fact good + * + * This algorithm separately operates on the reads for each sample independently. + * + * @param reads a sorted list of reads + * @param downsampleTo the targeted number of reads we want from reads per sample + * @param minReadsPerAlignmentStart don't reduce the number of reads starting at a specific alignment start + * to below this. 
That is, if this value is 2, we'll never reduce the number + * of reads starting at a specific start site to less than 2 + * @return a sorted list of reads + */ + public static List levelCoverageByPosition(final List reads, final int downsampleTo, final int minReadsPerAlignmentStart) { + if ( reads == null ) throw new IllegalArgumentException("reads must not be null"); + + final List downsampled = new ArrayList(reads.size()); + + final Map>> readsBySampleByStart = partitionReadsBySampleAndStart(reads); + for ( final Map> readsByPosMap : readsBySampleByStart.values() ) { + final LevelingDownsampler, GATKSAMRecord> downsampler = new LevelingDownsampler, GATKSAMRecord>(downsampleTo, minReadsPerAlignmentStart); + downsampler.submit(readsByPosMap.values()); + downsampler.signalEndOfInput(); + for ( final List downsampledReads : downsampler.consumeFinalizedItems()) + downsampled.addAll(downsampledReads); + } + + return ReadUtils.sortReadsByCoordinate(downsampled); + } + + /** + * Build the data structure mapping for each sample -> (position -> reads at position) + * + * Note that the map position -> reads isn't ordered in any meaningful way + * + * @param reads a list of sorted reads + * @return a map containing the list of reads at each start location, for each sample independently + */ + private static Map>> partitionReadsBySampleAndStart(final List reads) { + final Map>> readsBySampleByStart = new LinkedHashMap>>(); + + for ( final GATKSAMRecord read : reads ) { + Map> readsByStart = readsBySampleByStart.get(read.getReadGroup().getSample()); + + if ( readsByStart == null ) { + readsByStart = new LinkedHashMap>(); + readsBySampleByStart.put(read.getReadGroup().getSample(), readsByStart); + } + + List readsAtStart = readsByStart.get(read.getAlignmentStart()); + if ( readsAtStart == null ) { + readsAtStart = new LinkedList(); + readsByStart.put(read.getAlignmentStart(), readsAtStart); + } + + readsAtStart.add(read); + } + + return readsBySampleByStart; + } +} diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java index 9b4b2adcb..a8a808333 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -47,8 +47,8 @@ import java.util.*; * @author David Roazen */ public class LevelingDownsampler, E> implements Downsampler { - - private int targetSize; + private final int minElementsPerStack; + private final int targetSize; private List groups; @@ -59,12 +59,32 @@ public class LevelingDownsampler, E> implements Downsampler /** * Construct a LevelingDownsampler * + * Uses the default minElementsPerStack of 1 + * * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed * this value -- if it does, items are removed from Lists evenly until the total size * is <= this value */ public LevelingDownsampler( int targetSize ) { + this(targetSize, 1); + } + + /** + * Construct a LevelingDownsampler + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + * @param minElementsPerStack no stack will be reduced below this size during downsampling. That is, + * if a stack has only 3 elements and minElementsPerStack is 3, no matter what + * we'll not reduce this stack below 3. 
+ */ + public LevelingDownsampler(final int targetSize, final int minElementsPerStack) { + if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize); + if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack); + this.targetSize = targetSize; + this.minElementsPerStack = minElementsPerStack; clear(); reset(); } @@ -148,7 +168,7 @@ public class LevelingDownsampler, E> implements Downsampler // remove any more items without violating the constraint that all groups must // be left with at least one item while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { - if ( groupSizes[currentGroupIndex] > 1 ) { + if ( groupSizes[currentGroupIndex] > minElementsPerStack ) { groupSizes[currentGroupIndex]--; numItemsToRemove--; numConsecutiveUmodifiableGroups = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java new file mode 100644 index 000000000..1141a9164 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.readutils; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; + +/** + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class DownsampleReadsQC extends ReadWalker> implements NanoSchedulable { + @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) + StingSAMFileWriter out; + + @Argument(fullName = "minReadsPerAlignmentStart", shortName = "minReadsPerAlignmentStart", doc ="", required = false) + private int minReadsPerAlignmentStart = 5; + + @Argument(fullName = "downsampleTo", shortName = "downsampleTo", doc ="", required = false) + private int downsampleTo = 1000; + + /** + * The initialize function. 
+ */ + public void initialize() { +// final boolean preSorted = true; +// if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { +// Utils.setupWriter(out, getToolkit(), getToolkit().getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); +// } + } + + /** + * The reads map function. + * + * @param ref the reference bases that correspond to our read, if a reference was provided + * @param readIn the read itself, as a GATKSAMRecord + * @return the read itself + */ + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { + return readIn; + } + + /** + * reduceInit is called once before any calls to the map function. We use it here to setup the output + * bam file, if it was specified on the command line + * + * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise + */ + public Collection reduceInit() { + return new LinkedList(); + } + + /** + * given a read and a output location, reduce by emitting the read + * + * @param read the read itself + * @param output the output source + * @return the SAMFileWriter, so that the next reduce can emit to the same source + */ + public Collection reduce( GATKSAMRecord read, Collection output ) { + output.add(read); + return output; + } + + @Override + public void onTraversalDone(Collection result) { + for ( final GATKSAMRecord read : DownsamplingUtils.levelCoverageByPosition(new ArrayList(result), downsampleTo, minReadsPerAlignmentStart) ) + out.addAlignment(read); + } +} From ffea6dd95f34de0c979273c0783d6da75bbe16f0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 18 Mar 2013 17:06:32 -0400 Subject: [PATCH 085/226] HaplotypeCaller now has the ability to only consider the best N haplotypes for genotyping -- Added a -dontGenotype mode for testing assembly efficiency -- However, it looks like this has a very negative impact on the quality of the results, so the 
code should be deleted --- .../haplotypecaller/DeBruijnAssembler.java | 74 +++++++++++++------ .../haplotypecaller/HaplotypeCaller.java | 22 +++++- .../broadinstitute/sting/utils/Haplotype.java | 32 +++++++- 3 files changed, 101 insertions(+), 27 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 566605a8c..bf08d1526 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -52,6 +52,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -73,6 +74,7 @@ import java.util.*; */ public class DeBruijnAssembler extends LocalAssemblyEngine { + private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class); private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; @@ -85,18 +87,20 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final double SW_GAP = -22.0; //-1.0-1.0/3.0; private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; - private final boolean DEBUG; - private final PrintStream GRAPH_WRITER; + private final boolean debug; + private final PrintStream graphWriter; private final List graphs = new ArrayList(); - private final int MIN_KMER; + private final int minKmer; + private final int maxHaplotypesToConsider; private int PRUNE_FACTOR = 2; - public DeBruijnAssembler(final boolean debug, 
final PrintStream graphWriter, final int minKmer) { + public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) { super(); - DEBUG = debug; - GRAPH_WRITER = graphWriter; - MIN_KMER = minKmer; + this.debug = debug; + this.graphWriter = graphWriter; + this.minKmer = minKmer; + this.maxHaplotypesToConsider = maxHaplotypesToConsider; } /** @@ -123,7 +127,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); // print the graphs if the appropriate debug option has been turned on - if( GRAPH_WRITER != null ) { + if( graphWriter != null ) { printGraphs(); } @@ -136,11 +140,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { graphs.clear(); final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; - if( maxKmer < MIN_KMER ) { return; } // Reads are too small for assembly so don't try to create any assembly graphs + if( maxKmer < minKmer) { return; } // Reads are too small for assembly so don't try to create any assembly graphs // create the graph for each possible kmer - for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) { - final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG ); + for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) { + //if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); + final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready pruneGraph(graph, PRUNE_FACTOR); @@ -320,22 +325,22 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } protected void 
printGraphs() { - GRAPH_WRITER.println("digraph assemblyGraphs {"); + graphWriter.println("digraph assemblyGraphs {"); for( final DeBruijnAssemblyGraph graph : graphs ) { for( final DeBruijnEdge edge : graph.edgeSet() ) { if( edge.getMultiplicity() > PRUNE_FACTOR ) { - GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];"); + graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];"); } if( edge.isRef() ) { - GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); + graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); } if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } for( final DeBruijnVertex v : graph.vertexSet() ) { - GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); } } - GRAPH_WRITER.println("}"); + graphWriter.println("}"); } @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"}) @@ -343,6 +348,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private List findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { // add the reference haplotype separately from all the others to ensure that it is present in the list of 
haplotypes + // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm final List returnHaplotypes = new ArrayList(); refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart()); final Cigar c = new Cigar(); @@ -383,7 +389,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } if( !returnHaplotypes.contains(h) ) { h.setAlignmentStartHapwrtRef(activeRegionStart); - h.setCigar( leftAlignedCigar ); + h.setCigar(leftAlignedCigar); + h.setScore(path.getScore()); returnHaplotypes.add(h); // for GGA mode, add the desired allele into the haplotype if it isn't already present @@ -409,18 +416,39 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - if( DEBUG ) { - if( returnHaplotypes.size() > 1 ) { - System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against."); + final List finalHaplotypes = selectHighestScoringHaplotypes(returnHaplotypes); + if ( finalHaplotypes.size() < returnHaplotypes.size() ) + logger.info("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); + + if( debug ) { + if( finalHaplotypes.size() > 1 ) { + System.out.println("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); } else { System.out.println("Found only the reference haplotype in the assembly graph."); } - for( final Haplotype h : returnHaplotypes ) { + for( final Haplotype h : finalHaplotypes ) { System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() ); + System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); } } - return returnHaplotypes; + + return finalHaplotypes; + } + + /** + * Select 
the best scoring haplotypes among all present, returning no more than maxHaplotypesToConsider + * + * @param haplotypes a list of haplotypes to consider + * @return a sublist of the best haplotypes, with size() <= maxHaplotypesToConsider + */ + private List selectHighestScoringHaplotypes(final List haplotypes) { + if ( haplotypes.size() <= maxHaplotypesToConsider ) + return haplotypes; + else { + final List sorted = new ArrayList(haplotypes); + Collections.sort(sorted, new Haplotype.ScoreComparator()); + return sorted.subList(0, maxHaplotypesToConsider); + } } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 4fc075807..cff631802 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -205,6 +206,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) protected int minKmer = 11; + @Advanced + @Argument(fullName="maxHaplotypesToConsider", shortName="maxHaplotypesToConsider", doc="Maximum number of haplotypes to consider in the likelihood calculation. 
Setting this number too high can have dramatic performance implications", required = false) + protected int maxHaplotypesToConsider = 100000; + /** * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the @@ -227,6 +232,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false) protected boolean justDetermineActiveRegions = false; + @Hidden + @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) + protected boolean dontGenotype = false; + /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. * dbSNP is not used in any way for the calculations themselves. 
@@ -296,6 +305,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // reference base padding size private static final int REFERENCE_PADDING = 500; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument + private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument + // bases with quality less than or equal to this value are trimmed off the tails of the reads private static final byte MIN_TAIL_QUALITY = 20; @@ -374,7 +386,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer ); + assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer, maxHaplotypesToConsider ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); @@ -514,6 +526,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); + if (dontGenotype) + return 1; + // evaluate each sample's reads against all haplotypes final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); @@ -575,7 +590,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // //--------------------------------------------------------------------------------------------------------------- - private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + private void finalizeActiveRegion( final ActiveRegion activeRegion ) { if( 
DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } final List finalizedReadList = new ArrayList(); final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); @@ -599,7 +614,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } } - activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse)); + + activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart)); } private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 415cb73ac..070ae4f5d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -41,12 +41,12 @@ import java.io.Serializable; import java.util.*; public class Haplotype extends Allele { - private GenomeLoc genomeLocation = null; private Map eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; private Event artificialEvent = null; + private double score = 0; /** * Main constructor @@ -259,4 +259,34 @@ public class Haplotype extends Allele { this.pos = pos; } } + + /** + * Get the score (an estimate of the support) of this haplotype + * @return a double, where higher values are better + */ + public double getScore() { + return this.isReference() ? Double.MAX_VALUE : score; + } + + /** + * Set the score (an estimate of the support) of this haplotype. 
+ * + * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score + * + * @param score a double, where higher values are better + */ + public void setScore(double score) { + this.score = this.isReference() ? Double.MAX_VALUE : score; + } + + /** + * A comparator that sorts haplotypes in decreasing order of score, so that the best supported + * haplotypes are at the top + */ + public static class ScoreComparator implements Comparator { + @Override + public int compare(Haplotype o1, Haplotype o2) { + return -1 * Double.valueOf(o1.getScore()).compareTo(o2.getScore()); + } + } } From 53a904bcbd8ec63420a76e98e7dda6432d2907f8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 8 Mar 2013 11:28:22 -0500 Subject: [PATCH 086/226] Bugfix for HaplotypeCaller: GSA-822 for trimming softclipped reads -- Previous version would not trim down soft clip bases that extend beyond the active region, causing the assembly graph to go haywire. The new code explicitly reverts soft clips to M bases with the ever useful ReadClipper, and then trims. Note this isn't a 100% fix for the issue, as it's possible that the newly unclipped bases might in reality extend beyond the active region, should their true alignment include a deletion in the reference. Needs to be fixed. 
JIRA added -- See https://jira.broadinstitute.org/browse/GSA-822 -- #resolve #fix GSA-822 --- .../haplotypecaller/DeBruijnAssembler.java | 18 +++++++++++-- .../DeBruijnAssemblyGraph.java | 27 ++++++++++++++++--- .../haplotypecaller/HaplotypeCaller.java | 12 +++++++++ 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index bf08d1526..33198ce8c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -271,9 +271,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) protected static DeBruijnAssemblyGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); + final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(KMER_LENGTH); // First pull kmers from the reference haplotype and add them to the graph + //logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString()); final byte[] refSequence = refHaplotype.getBases(); if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; @@ -289,6 +290,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { + //if ( ! 
read.getReadName().equals("H06JUADXX130110:1:1213:15422:11590")) continue; + //logger.info("Adding read " + read + " with sequence " + read.getReadString()); final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced @@ -325,8 +328,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } protected void printGraphs() { + final boolean onlyWriteOneGraph = false; // debugging flag -- if true we'll only write a graph for a single kmer size + final int writeFirstGraphWithSizeSmallerThan = 50; + graphWriter.println("digraph assemblyGraphs {"); for( final DeBruijnAssemblyGraph graph : graphs ) { + if ( onlyWriteOneGraph && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { + logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); + continue; + } + for( final DeBruijnEdge edge : graph.edgeSet() ) { if( edge.getMultiplicity() > PRUNE_FACTOR ) { graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? 
"style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];"); @@ -337,8 +348,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } for( final DeBruijnVertex v : graph.vertexSet() ) { - graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\",shape=box]"); } + + if ( onlyWriteOneGraph ) + break; } graphWriter.println("}"); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java index 6a95049d1..d28f81b55 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java @@ -47,9 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.jgrapht.graph.DefaultDirectedGraph; import java.io.PrintStream; @@ -62,9 +60,32 @@ import java.util.Arrays; */ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph { + private final int kmerSize; - public DeBruijnAssemblyGraph() { + /** + * Construct a DeBruijnAssemblyGraph with kmerSize + * @param kmerSize + */ + public DeBruijnAssemblyGraph(final int kmerSize) { super(DeBruijnEdge.class); + + if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); + this.kmerSize = kmerSize; + } + + /** + * Test construct that makes DeBruijnAssemblyGraph assuming a 
kmerSize of 11 + */ + protected DeBruijnAssemblyGraph() { + this(11); + } + + /** + * How big of a kmer did we use to create this graph? + * @return + */ + public int getKmerSize() { + return kmerSize; } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index cff631802..affad6450 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -608,8 +608,20 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + + // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches + // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't + // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion + // TODO -- w.r.t. the reference. 
What really needs to happen is that kmers that occur before the + // TODO -- reference haplotype start must be removed + clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); + + // uncomment to remove hard clips from consideration at all + //clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); + clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { + //logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); readsToUse.add(clippedRead); } } From 0f4328f6fe0bdb08e0d82553a27bd2fd0d5668d5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 8 Mar 2013 13:10:15 -0500 Subject: [PATCH 087/226] Basic kmer error correction algorithm xfor the HaplotypeCaller -- Error correction algorithm for the assembler. Only error correct reads to others that are exactly 1 mismatch away -- The assembler logic is now: build initial graph, error correct*, merge nodes*, prune dead nodes, merge again, make haplotypes. The * elements are new -- Refactored the printing routines a bit so it's easy to write a single graph to disk for testing. 
-- Easier way to control the testing of the graph assembly algorithms -- Move graph printing function to DeBruijnAssemblyGraph from DeBruijnAssembler -- Simple protected parsing function for making DeBruijnAssemblyGraph -- Change the default prune factor for the graph to 1, from 2 -- debugging graph transformations are controllable from command line --- .../haplotypecaller/DeBruijnAssembler.java | 107 ++++++-- .../DeBruijnAssemblyGraph.java | 115 ++++++-- .../haplotypecaller/DeBruijnVertex.java | 12 + .../haplotypecaller/HaplotypeCaller.java | 7 +- .../haplotypecaller/KMerErrorCorrector.java | 253 ++++++++++++++++++ .../DeBruijnAssemblerUnitTest.java | 68 ++++- .../KMerErrorCorrectorUnitTest.java | 78 ++++++ 7 files changed, 594 insertions(+), 46 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 33198ce8c..0caebebee 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -64,6 +64,9 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.PrintStream; import java.util.*; @@ -88,16 +91,19 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; private final boolean debug; + private final int 
onlyBuildKmerGraphOfThisSite = -1; // 35; + private final boolean debugGraphTransformations; private final PrintStream graphWriter; private final List graphs = new ArrayList(); private final int minKmer; private final int maxHaplotypesToConsider; private int PRUNE_FACTOR = 2; - - public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) { + + public DeBruijnAssembler(final boolean debug, final boolean debugGraphTransformations, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) { super(); this.debug = debug; + this.debugGraphTransformations = debugGraphTransformations; this.graphWriter = graphWriter; this.minKmer = minKmer; this.maxHaplotypesToConsider = maxHaplotypesToConsider; @@ -144,13 +150,23 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // create the graph for each possible kmer for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) { - //if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); - final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); + if ( onlyBuildKmerGraphOfThisSite != -1 && kmer != onlyBuildKmerGraphOfThisSite ) + continue; + + if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); + DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready - pruneGraph(graph, PRUNE_FACTOR); + if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); + graph = graph.errorCorrect(); + if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), 
PRUNE_FACTOR); cleanNonRefPaths(graph); mergeNodes(graph); + if ( debugGraphTransformations ) graph.printGraph(new File("merged.dot"), PRUNE_FACTOR); + pruneGraph(graph, PRUNE_FACTOR); + if ( debugGraphTransformations ) graph.printGraph(new File("pruned.dot"), PRUNE_FACTOR); + mergeNodes(graph); + if ( debugGraphTransformations ) graph.printGraph(new File("merged2.dot"), PRUNE_FACTOR); if( graph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference sanityCheckReferenceGraph(graph, refHaplotype); graphs.add(graph); @@ -169,7 +185,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e); final DeBruijnVertex incomingVertex = graph.getEdgeSource(e); if( !outgoingVertex.equals(incomingVertex) && graph.outDegreeOf(incomingVertex) == 1 && graph.inDegreeOf(outgoingVertex) == 1 && - graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) { + graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) { final Set outEdges = graph.outgoingEdgesOf(outgoingVertex); final Set inEdges = graph.incomingEdgesOf(incomingVertex); if( inEdges.size() == 1 && outEdges.size() == 1 ) { @@ -199,6 +215,59 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } + // + // X -> ABC -> Y + // -> aBC -> Y + // + // becomes + // + // X -> A -> BCY + // -> a -> BCY + // +// @Requires({"graph != null"}) +// protected static void simplifyMergedGraph(final DeBruijnAssemblyGraph graph) { +// boolean foundNodesToMerge = true; +// while( foundNodesToMerge ) { +// foundNodesToMerge = false; +// +// for( final DeBruijnVertex v : graph.vertexSet() ) { +// if ( isRootOfComplexDiamond(v) ) { +// foundNodesToMerge = simplifyComplexDiamond(graph, v); +// if ( 
foundNodesToMerge ) +// break; +// } +// } +// } +// } +// +// private static boolean simplifyComplexDiamond(final DeBruijnAssemblyGraph graph, final DeBruijnVertex root) { +// final Set outEdges = graph.outgoingEdgesOf(root); +// final DeBruijnVertex diamondBottom = graph.getEdge(graph.getEdgeTarget(outEdges.iterator().next()); +// // all of the edges point to the same sink, so it's time to merge +// final byte[] commonSuffix = commonSuffixOfEdgeTargets(outEdges, targetSink); +// if ( commonSuffix != null ) { +// final DeBruijnVertex suffixVertex = new DeBruijnVertex(commonSuffix, graph.getKmerSize()); +// graph.addVertex(suffixVertex); +// graph.addEdge(suffixVertex, targetSink); +// +// for( final DeBruijnEdge edge : outEdges ) { +// final DeBruijnVertex target = graph.getEdgeTarget(edge); +// final DeBruijnVertex prefix = target.withoutSuffix(commonSuffix); +// graph.addEdge(prefix, suffixVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); +// graph.removeVertex(graph.getEdgeTarget(edge)); +// graph.removeAllEdges(root, target); +// graph.removeAllEdges(target, targetSink); +// } +// +// graph.removeAllEdges(outEdges); +// graph.removeVertex(targetSink); +// +// return true; +// } else { +// return false; +// } +// } + protected static void cleanNonRefPaths( final DeBruijnAssemblyGraph graph ) { if( graph.getReferenceSourceVertex() == null || graph.getReferenceSinkVertex() == null ) { return; @@ -279,7 +348,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true) ) { + if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + 
KMER_LENGTH), true, 1) ) { if( DEBUG ) { System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping"); } @@ -297,7 +366,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = sequence.length - KMER_LENGTH + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { // if the qualities of all the bases in the kmers are high enough boolean badKmer = false; for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) { @@ -318,42 +387,32 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH); for( int kkk=0; kkk < countNumber; kkk++ ) { - graph.addKmersToGraph(kmer1, kmer2, false); + graph.addKmersToGraph(kmer1, kmer2, false, 1); } } } } } + return graph; } protected void printGraphs() { - final boolean onlyWriteOneGraph = false; // debugging flag -- if true we'll only write a graph for a single kmer size final int writeFirstGraphWithSizeSmallerThan = 50; graphWriter.println("digraph assemblyGraphs {"); for( final DeBruijnAssemblyGraph graph : graphs ) { - if ( onlyWriteOneGraph && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { + if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); continue; } - for( final DeBruijnEdge edge : graph.edgeSet() ) { - if( edge.getMultiplicity() > PRUNE_FACTOR ) { - graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? 
"style=dotted,color=grey" : "label=\"" + edge.getMultiplicity() + "\"") + "];"); - } - if( edge.isRef() ) { - graphWriter.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];"); - } - if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } - } - for( final DeBruijnVertex v : graph.vertexSet() ) { - graphWriter.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\",shape=box]"); - } + graph.printGraph(graphWriter, false, PRUNE_FACTOR); - if ( onlyWriteOneGraph ) + if ( debugGraphTransformations ) break; } + graphWriter.println("}"); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java index d28f81b55..a78a5c627 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java @@ -48,8 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; import org.jgrapht.graph.DefaultDirectedGraph; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.PrintStream; import java.util.Arrays; @@ -60,6 +64,7 @@ import java.util.Arrays; */ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph { + private final static Logger logger = Logger.getLogger(DeBruijnAssemblyGraph.class); private final int kmerSize; /** @@ -73,6 +78,24 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph " + getEdgeTarget(edge).toString() + " [" + "label=\""+ edge.getMultiplicity() +"\"" + "];"); +// if( edge.getMultiplicity() > 
PRUNE_FACTOR ) { + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); +// } if( edge.isRef() ) { - GRAPH_WRITER.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); } + //if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } + for( final DeBruijnVertex v : vertexSet() ) { - final String label = ( inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() ); - GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]"); + graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + "\",shape=box]"); } - GRAPH_WRITER.println("}"); + + if ( writeHeader ) + graphWriter.println("}"); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index 1390b0ee9..aa8e24576 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -68,6 +68,18 @@ public class DeBruijnVertex { this.kmer = kmer; } + protected DeBruijnVertex( final String sequence, final int kmer ) { + this(sequence.getBytes(), kmer); + } + + protected DeBruijnVertex( final String sequence ) { + this(sequence.getBytes(), sequence.length()); + } + + public int getKmer() { + return kmer; + } + @Override public boolean equals( Object v ) { return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence); diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index affad6450..d5f283475 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -192,7 +192,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected String keepRG = null; @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) - protected int MIN_PRUNE_FACTOR = 2; + protected int MIN_PRUNE_FACTOR = 1; @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) @@ -284,6 +284,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; + @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler", required = false) + protected boolean debugGraphTransformations = false; + // the UG engines private UnifiedGenotyperEngine UG_engine = null; private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; @@ -386,7 +389,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer, maxHaplotypesToConsider ); + assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, maxHaplotypesToConsider 
); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java new file mode 100644 index 000000000..66ea8a078 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java @@ -0,0 +1,253 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import java.util.*; + +/** + * generic utility function that error corrects kmers based on counts + * + * This class provides a generic facility for remapping kmers (byte[] of constant size) + * that occur infrequently to those that occur frequently, based on their simple edit distance + * as measured by mismatches. + * + * The overall workflow of using this class is simple. 
First, you create the class with + * parameters determining how the error correction should proceed. Next, you provide all + * of the kmers you see in your data. Once all kmers have been added, you call computeErrorCorrectionMap + * to tell this class that all kmers have been added and its time to determine error correcting + * mapping from observed kmers to corrected kmers. This correction looks for low-count (as determined + * by maxCountToCorrect) kmers and chooses the best kmer (minimizing mismatches) among those + * with at least minCountOfKmerToBeCorrection occurrences to error correct the kmer to. If + * there is no kmer with less than maxMismatchesToCorrect then the kmer will be mapped to + * null, indicating the kmer should not be used. + * + * TODO -- for ease of implementation this class uses strings instead of byte[] as those cannot + * TODO -- be added to hashmaps (more specifically, those don't implement .equals). A more efficient + * TODO -- version would use the byte[] directly + * + * User: depristo + * Date: 3/8/13 + * Time: 1:16 PM + */ +public class KMerErrorCorrector { + /** + * A map of for each kmer to its num occurrences in addKmers + */ + Map countsByKMer = new HashMap(); + + /** + * A map from raw kmer -> error corrected kmer + */ + Map rawToErrorCorrectedMap = null; + + final int kmerLength; + final int maxCountToCorrect; + final int maxMismatchesToCorrect; + final int minCountOfKmerToBeCorrection; + + /** + * Create a new kmer corrector + * + * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1 + * @param maxCountToCorrect kmers with < maxCountToCorrect will try to be error corrected to another kmer, must be >= 0 + * @param maxMismatchesToCorrect the maximum number of mismatches between a to-be-corrected kmer and its + * best match that we attempt to error correct. If no sufficiently similar + * kmer exists, it will be remapped to null. 
Must be >= 1 + * @param minCountOfKmerToBeCorrection the minimum count of a kmer to be considered a target for correction. + * That is, kmers that need correction will only be matched with kmers + * with at least minCountOfKmerToBeCorrection occurrences. Must be >= 1 + */ + public KMerErrorCorrector(final int kmerLength, + final int maxCountToCorrect, + final int maxMismatchesToCorrect, + final int minCountOfKmerToBeCorrection) { + if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength); + if ( maxCountToCorrect < 0 ) throw new IllegalArgumentException("maxCountToCorrect must be >= 0 but got " + maxCountToCorrect); + if ( maxMismatchesToCorrect < 1 ) throw new IllegalArgumentException("maxMismatchesToCorrect must be >= 1 but got " + maxMismatchesToCorrect); + if ( minCountOfKmerToBeCorrection < 1 ) throw new IllegalArgumentException("minCountOfKmerToBeCorrection must be >= 1 but got " + minCountOfKmerToBeCorrection); + + this.kmerLength = kmerLength; + this.maxCountToCorrect = maxCountToCorrect; + this.maxMismatchesToCorrect = maxMismatchesToCorrect; + this.minCountOfKmerToBeCorrection = minCountOfKmerToBeCorrection; + } + + /** + * For testing purposes + * + * @param kmers + */ + protected void addKmers(final String ... kmers) { + for ( final String kmer : kmers ) + addKmer(kmer, 1); + computeErrorCorrectionMap(); + } + + /** + * Add a kmer that occurred kmerCount times + * + * @param rawKmer a kmer + * @param kmerCount the number of occurrences + */ + public void addKmer(final byte[] rawKmer, final int kmerCount) { + addKmer(new String(rawKmer), kmerCount); + } + + + /** + * Get the error corrected kmer for rawKmer + * + * @param rawKmer a kmer that was already added that we want to get an error corrected version for + * @return an error corrected kmer to use instead of rawKmer. May be == rawKmer if no error correction + * is not necessary. 
May be null, indicating the rawKmer shouldn't be used at all + */ + public byte[] getErrorCorrectedKmer(final byte[] rawKmer) { + final String result = getErrorCorrectedKmer(new String(rawKmer)); + return result == null ? null : result.getBytes(); + } + + /** + * Indicate that no more kmers will be added to the kmer error corrector, so that the + * error correction data structure should be computed from the added kmers. Enabled calls + * to getErrorCorrectedKmer, and disable calls to addKmer. + */ + public void computeErrorCorrectionMap() { + if ( countsByKMer == null ) + throw new IllegalStateException("computeErrorCorrectionMap can only be called once"); + + final LinkedList needsCorrection = new LinkedList(); + final LinkedList goodKmers = new LinkedList(); + + rawToErrorCorrectedMap = new HashMap(); + for ( Map.Entry kmerCounts: countsByKMer.entrySet() ) { + if ( kmerCounts.getValue() <= maxCountToCorrect ) + needsCorrection.add(kmerCounts.getKey()); + else { + // todo -- optimization could make not in map mean == + rawToErrorCorrectedMap.put(kmerCounts.getKey(), kmerCounts.getKey()); + + // only allow corrections to kmers with at least this count + if ( kmerCounts.getValue() >= minCountOfKmerToBeCorrection ) + goodKmers.add(kmerCounts.getKey()); + } + } + + for ( final String toCorrect : needsCorrection ) { + final String corrected = findClosestKMer(toCorrect, goodKmers); + rawToErrorCorrectedMap.put(toCorrect, corrected); + } + + // cleanup memory -- we don't need the counts for each kmer any longer + countsByKMer = null; + } + + protected void addKmer(final String rawKmer, final int kmerCount) { + if ( rawKmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + rawKmer + " expected size " + kmerLength); + if ( kmerCount < 0 ) throw new IllegalArgumentException("bad kmerCount " + kmerCount); + if ( countsByKMer == null ) throw new IllegalStateException("Cannot add kmers to an already finalized error corrector"); + + final Integer 
countFromMap = countsByKMer.get(rawKmer); + final int count = countFromMap == null ? 0 : countFromMap; + countsByKMer.put(rawKmer, count + kmerCount); + } + + protected String findClosestKMer(final String kmer, final Collection goodKmers) { + String bestMatch = null; + int minMismatches = Integer.MAX_VALUE; + + for ( final String goodKmer : goodKmers ) { + final int mismatches = countMismatches(kmer, goodKmer); + if ( mismatches < minMismatches ) { + minMismatches = mismatches; + bestMatch = goodKmer; + } + } + + return minMismatches > maxMismatchesToCorrect ? null : bestMatch; + } + + protected int countMismatches(final String one, final String two) { + int mismatches = 0; + for ( int i = 0; i < one.length(); i++ ) + mismatches += one.charAt(i) == two.charAt(i) ? 0 : 1; + return mismatches; + } + + protected String getErrorCorrectedKmer(final String rawKmer) { + if ( rawToErrorCorrectedMap == null ) throw new IllegalStateException("Cannot get error corrected kmers until after computeErrorCorrectionMap has been called"); + if ( rawKmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + rawKmer + " expected size " + kmerLength); + return rawToErrorCorrectedMap.get(rawKmer); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder("KMerErrorCorrector{"); + for ( Map.Entry toCorrect : rawToErrorCorrectedMap.entrySet() ) { + final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue()); + if ( correcting ) + b.append(String.format("%n\t%s / %d -> %s / %d [correcting? 
%b]", + toCorrect.getKey(), getCounts(toCorrect.getKey()), + toCorrect.getValue(), getCounts(toCorrect.getValue()), + correcting)); + } + b.append("\n}"); + return b.toString(); + } + + /** + * Get a simple count estimate for printing for kmer + * @param kmer the kmer + * @return an integer count for kmer + */ + private int getCounts(final String kmer) { + if ( kmer == null ) return 0; + final Integer count = countsByKMer == null ? -1 : countsByKMer.get(kmer); + if ( count == null ) + throw new IllegalArgumentException("kmer not found in counts -- bug " + kmer); + return count; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index f4a6d5494..2096b487e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -67,6 +67,7 @@ import org.testng.annotations.Test; import java.util.*; public class DeBruijnAssemblerUnitTest extends BaseTest { + private final static boolean DEBUG = true; private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { @@ -97,7 +98,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { final byte[] kmer2 = new byte[KMER_LENGTH]; System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH); - graph.addKmersToGraph(kmer1, kmer2, false); + graph.addKmersToGraph(kmer1, kmer2, false, 1); } DeBruijnAssembler.mergeNodes(graph); return graph; @@ -118,13 +119,70 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class); } - @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = true) + @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = !DEBUG) 
public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) { logger.warn(String.format("Test: %s", cfg.toString())); Assert.assertTrue(graphEquals(cfg.calcGraph(), cfg.expectedGraph())); } - @Test(enabled = true) +// @DataProvider(name = "SimpleMergeOperationsData") +// public Object[][] makeSimpleMergeOperationsData() { +// List tests = new ArrayList(); +// +// { +// DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); +// DeBruijnVertex v1 = new DeBruijnVertex("AT"); +// DeBruijnVertex v2 = new DeBruijnVertex("TC"); +// DeBruijnVertex v3 = new DeBruijnVertex("CT"); +// DeBruijnVertex v4 = new DeBruijnVertex("TG"); +// DeBruijnVertex v5 = new DeBruijnVertex("AG"); +// DeBruijnVertex v6 = new DeBruijnVertex("GG"); +// DeBruijnVertex v7 = new DeBruijnVertex("GA"); +// DeBruijnVertex v8 = new DeBruijnVertex("AA"); +// +// graph.addVertices(v1, v2, v3, v4, v5, v6, v7, v8); +// graph.addEdge(v1, v2, new DeBruijnEdge(false, 2)); +// graph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); +// graph.addEdge(v2, v4, new DeBruijnEdge(false, 5)); +// graph.addEdge(v3, v5, new DeBruijnEdge(false, 3)); +// graph.addEdge(v4, v6, new DeBruijnEdge(false, 3)); +// graph.addEdge(v5, v7, new DeBruijnEdge(false, 2)); +// graph.addEdge(v6, v7, new DeBruijnEdge(false, 6)); +// graph.addEdge(v7, v8, new DeBruijnEdge(false, 2)); +// +// graph.printGraph(new File("unittest.dot"), 1); +// +// DeBruijnAssemblyGraph expected = new DeBruijnAssemblyGraph(); +// DeBruijnVertex e1 = new DeBruijnVertex("ATC"); +// DeBruijnVertex e2 = new DeBruijnVertex("T"); +// DeBruijnVertex e3 = new DeBruijnVertex("G"); +// DeBruijnVertex e4 = new DeBruijnVertex("GAA"); +// +// expected.addVertices(e1,e2,e3,e4); +// expected.addEdge(e1, e2, new DeBruijnEdge(false, 3)); +// expected.addEdge(e1, e3, new DeBruijnEdge(false, 5)); +// expected.addEdge(e2, e4, new DeBruijnEdge(false, 2)); +// expected.addEdge(e3, e4, new DeBruijnEdge(false, 6)); +// +// expected.printGraph(new 
File("expected.dot"), 1); +// +// tests.add(new Object[]{graph.clone(), expected}); +// } +// +// return tests.toArray(new Object[][]{}); +// } +// +// @Test(dataProvider = "SimpleMergeOperationsData", enabled = true) +// public void testSimpleMergeOperations(final DeBruijnAssemblyGraph unmergedGraph, final DeBruijnAssemblyGraph expectedGraph) throws Exception { +// final DeBruijnAssemblyGraph mergedGraph = (DeBruijnAssemblyGraph)unmergedGraph.clone(); +// DeBruijnAssembler.mergeNodes(mergedGraph); +// mergedGraph.printGraph(new File("merged.dot"), 1); +// DeBruijnAssembler.simplifyMergedGraph(mergedGraph); +// mergedGraph.printGraph(new File("reduced.dot"), 1); +// Assert.assertTrue(graphEquals(mergedGraph, expectedGraph)); +// } + + @Test(enabled = !DEBUG) public void testPruneGraph() { DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); DeBruijnAssemblyGraph expectedGraph = new DeBruijnAssemblyGraph(); @@ -210,7 +268,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { return true; } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testReferenceCycleGraph() { String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC"; String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC"; @@ -221,7 +279,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation."); } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testLeftAlignCigarSequentially() { String preRefString = "GATCGATCGATC"; String postRefString = "TTT"; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java new file mode 100644 index 000000000..f88d7ee7f --- /dev/null +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java @@ -0,0 +1,78 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class KMerErrorCorrectorUnitTest extends BaseTest { + @Test + public void testMyData() { + final KMerErrorCorrector corrector = new KMerErrorCorrector(3, 1, 2, 2); + + corrector.addKmers( + "ATG", "ATG", "ATG", "ATG", + "ACC", "ACC", "ACC", + "AAA", "AAA", + "CTG", // -> ATG + "NNA", // -> AAA + "CCC", // => ACC + "NNN", // => null + "NNC" // => ACC [because of min count won't go to NNA] + ); + + Assert.assertEquals(corrector.getErrorCorrectedKmer("ATG"), "ATG"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("ACC"), "ACC"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("AAA"), "AAA"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("CTG"), "ATG"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("NNA"), "AAA"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("CCC"), "ACC"); + Assert.assertEquals(corrector.getErrorCorrectedKmer("NNN"), null); + 
Assert.assertEquals(corrector.getErrorCorrectedKmer("NNC"), "ACC"); + } +} From 98c4cd060d098323655e9b0899a8253ef1be4b25 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 14 Mar 2013 10:03:04 -0400 Subject: [PATCH 088/226] HaplotypeCaller now uses SeqGraph instead of kmer graph to build haplotypes. -- DeBruijnAssembler functions are no longer static. This isn't the right way to unit test your code -- An a HaplotypeCaller command line option to use low-quality bases in the assembly -- Refactored DeBruijnGraph and associated libraries into base class -- Refactored out BaseEdge, BaseGraph, and BaseVertex from DeBruijn equivalents. These DeBruijn versions now inherit from these base classes. Added some reasonable unit tests for the base and Debruijn edges and vertex classes. -- SeqVertex: allows multiple vertices in the sequence graph to have the same sequence and yet be distinct -- Further refactoring of DeBruijnAssembler in preparation for the full SeqGraph <-> DeBruijnGraph split -- Moved generic methods in DeBruijnAssembler into BaseGraph -- Created a simple SeqGraph that contains SeqVertex objects -- Simple chain zipper for SeqGraph that reproduces the results for the mergeNode function on DeBruijnGraphs -- A working version of the diamond remodeling algorithm in SeqGraph that converts graphs that look like A -> Xa, A -> Ya, Xa -> Z, Ya -> Z into A -> X -> a, A -Y -> a, a -> Z -- Allow SeqGraph zip merging of vertices where the in vertex has multiple incoming edges or the out vertex has multiple outgoing edges -- Fix all unit tests so they work with the new SeqGraph system. All tests passed without modification. 
-- Debugging makes it easier to tell which kmer graph contributes to a haplotype -- Better docs and unit tests for BaseVertex, SeqVertex, BaseEdge, and KMerErrorCorrector -- Remove unnecessary printing of cleaning info in BaseGraph -- Turn off kmer graph creation in DeBruijnAssembler.java -- Only print SeqGraphs when debugGraphTransformations is set to true -- Rename DeBruijnGraphUnitTest to SeqGraphUnitTest. Now builds DeBruijnGraph, converts to SeqGraph, uses SeqGraph.mergenodes and tests for equality. -- Update KBestPathsUnitTest to use SeqGraphs not DebruijnGraphs -- DebruijnVertex now longer takes kmer argument -- it's implicit that the kmer length is the sequence.length now --- .../{DeBruijnEdge.java => BaseEdge.java} | 70 ++-- ...ruijnAssemblyGraph.java => BaseGraph.java} | 318 ++++++++++-------- .../walkers/haplotypecaller/BaseVertex.java | 148 ++++++++ .../haplotypecaller/DeBruijnAssembler.java | 249 ++++---------- .../haplotypecaller/DeBruijnGraph.java | 179 ++++++++++ .../haplotypecaller/DeBruijnVertex.java | 63 ++-- .../haplotypecaller/HaplotypeCaller.java | 12 +- .../walkers/haplotypecaller/KBestPaths.java | 96 +++--- .../haplotypecaller/KMerErrorCorrector.java | 28 +- .../walkers/haplotypecaller/SeqGraph.java | 280 +++++++++++++++ .../walkers/haplotypecaller/SeqVertex.java | 153 +++++++++ .../haplotypecaller/BaseEdgeUnitTest.java | 105 ++++++ .../haplotypecaller/BaseGraphUnitTest.java | 192 +++++++++++ .../haplotypecaller/BaseVertexUnitTest.java | 91 +++++ .../DeBruijnAssemblerUnitTest.java | 205 +---------- .../DeBruijnAssemblyGraphUnitTest.java | 2 +- .../DeBruijnVertexUnitTest.java | 69 ++++ .../haplotypecaller/KBestPathsUnitTest.java | 183 ++++++---- .../KMerErrorCorrectorUnitTest.java | 25 +- .../haplotypecaller/SeqGraphUnitTest.java | 106 ++++++ .../haplotypecaller/SeqVertexUnitTest.java | 109 ++++++ .../org/broadinstitute/sting/utils/Utils.java | 13 + 22 files changed, 1964 insertions(+), 732 deletions(-) rename 
protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{DeBruijnEdge.java => BaseEdge.java} (83%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{DeBruijnAssemblyGraph.java => BaseGraph.java} (70%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java similarity index 83% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java index 28c735b5c..053f0e1a1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java @@ -46,68 +46,94 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import org.jgrapht.graph.DefaultDirectedGraph; - import java.io.Serializable; import java.util.Comparator; /** - * Created by IntelliJ IDEA. + * simple edge class for connecting nodes in the graph + * + * Works equally well for all graph types (kmer or sequence) + * * User: ebanks * Date: Mar 23, 2011 */ - -// simple edge class for connecting nodes in the graph -public class DeBruijnEdge { - +public class BaseEdge { private int multiplicity; private boolean isRef; - public DeBruijnEdge() { - multiplicity = 1; - isRef = false; - } + /** + * Create a new BaseEdge with weight multiplicity and, if isRef == true, indicates a path through the reference + * + * @param isRef indicates whether this edge is a path through the reference + * @param multiplicity the number of observations of this edge + */ + public BaseEdge(final boolean isRef, final int multiplicity) { + if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0"); - public DeBruijnEdge( final boolean isRef ) { - multiplicity = 1; - this.isRef = isRef; - } - - public DeBruijnEdge( final boolean isRef, final int multiplicity ) { this.multiplicity = multiplicity; this.isRef = isRef; } + /** + * Copy constructor + * + * @param toCopy + */ + public BaseEdge(final BaseEdge toCopy) { + this(toCopy.isRef(), toCopy.getMultiplicity()); + } + + /** + * Get the number of observations of paths connecting two vertices + * @return a positive integer >= 0 + */ public int getMultiplicity() { return multiplicity; } + /** + * Set the multiplicity of this edge to value + * @param value an integer >= 0 + */ public void setMultiplicity( final int value ) { + if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0"); multiplicity = value; } + /** + * Does this edge indicate a path through the reference 
graph? + * @return true if so + */ public boolean isRef() { return isRef; } + /** + * Indicate that this edge follows the reference sequence, or not + * @param isRef true if this is a reference edge + */ public void setIsRef( final boolean isRef ) { this.isRef = isRef; } // For use when comparing edges pulled from the same graph - public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge ) { + public boolean equals( final BaseGraph graph, final BaseEdge edge ) { return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge))); } // For use when comparing edges across graphs! - public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge, final DeBruijnAssemblyGraph graph2 ) { + public boolean equals( final BaseGraph graph, final BaseEdge edge, final BaseGraph graph2 ) { return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); } - public static class EdgeWeightComparator implements Comparator, Serializable { + /** + * Sorts a collection of BaseEdges in decreasing order of weight, so that the most + * heavily weighted is at the start of the list + */ + public static class EdgeWeightComparator implements Comparator, Serializable { @Override - public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) { - return edge1.multiplicity - edge2.multiplicity; + public int compare(final BaseEdge edge1, final BaseEdge edge2) { + return edge2.multiplicity - edge1.multiplicity; } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java similarity index 70% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java rename to 
protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index a78a5c627..6aa687312 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -49,13 +49,15 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; +import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.PrintStream; -import java.util.Arrays; +import java.util.*; /** * Created with IntelliJ IDEA. @@ -63,44 +65,37 @@ import java.util.Arrays; * Date: 2/6/13 */ -public class DeBruijnAssemblyGraph extends DefaultDirectedGraph { - private final static Logger logger = Logger.getLogger(DeBruijnAssemblyGraph.class); +public class BaseGraph extends DefaultDirectedGraph { + protected final static Logger logger = Logger.getLogger(BaseGraph.class); private final int kmerSize; /** - * Construct a DeBruijnAssemblyGraph with kmerSize - * @param kmerSize + * Construct an empty BaseGraph */ - public DeBruijnAssemblyGraph(final int kmerSize) { - super(DeBruijnEdge.class); - - if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); - this.kmerSize = kmerSize; - } - - public static DeBruijnAssemblyGraph parse(final int kmerSize, final int multiplicity, final String ... 
reads) { - final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(kmerSize); - - for ( final String read : reads ) { - final int kmersInSequence = read.length() - kmerSize + 1; - for (int i = 0; i < kmersInSequence - 1; i++) { - // get the kmers - final byte[] kmer1 = new byte[kmerSize]; - System.arraycopy(read.getBytes(), i, kmer1, 0, kmerSize); - final byte[] kmer2 = new byte[kmerSize]; - System.arraycopy(read.getBytes(), i+1, kmer2, 0, kmerSize); - graph.addKmersToGraph(kmer1, kmer2, false, multiplicity); - } - } - - return graph; + public BaseGraph() { + this(11); } /** - * Test construct that makes DeBruijnAssemblyGraph assuming a kmerSize of 11 + * Edge factory that creates non-reference multiplicity 1 edges + * @param the new of our vertices */ - protected DeBruijnAssemblyGraph() { - this(11); + private static class MyEdgeFactory implements EdgeFactory { + @Override + public BaseEdge createEdge(T sourceVertex, T targetVertex) { + return new BaseEdge(false, 1); + } + } + + /** + * Construct a DeBruijnGraph with kmerSize + * @param kmerSize + */ + public BaseGraph(final int kmerSize) { + super(new MyEdgeFactory()); + + if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize); + this.kmerSize = kmerSize; } /** @@ -115,9 +110,9 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph outgoingVerticesOf(final T v) { + final Set s = new HashSet(); + for ( final BaseEdge e : outgoingEdgesOf(v) ) { + s.add(getEdgeTarget(e)); + } + return s; + } + + /** + * Get the set of vertices connected to v by incoming edges + * @param v a non-null vertex + * @return a set of vertices {X} connected X -> v + */ + public Set incomingVerticesOf(final T v) { + final Set s = new HashSet(); + for ( final BaseEdge e : incomingEdgesOf(v) ) { + s.add(getEdgeSource(e)); + } + return s; + } + /** * Print out the graph in the dot language for visualization * @param destination File to write to @@ -403,11 +353,12 @@ public class 
DeBruijnAssemblyGraph extends DefaultDirectedGraph PRUNE_FACTOR ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); // } @@ -417,11 +368,114 @@ public class DeBruijnAssemblyGraph extends DefaultDirectedGraph edgesToCheck = new HashSet(); + edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex())); + while( !edgesToCheck.isEmpty() ) { + final BaseEdge e = edgesToCheck.iterator().next(); + if( !e.isRef() ) { + edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) ); + removeEdge(e); + } + edgesToCheck.remove(e); + } + + edgesToCheck.addAll(outgoingEdgesOf(getReferenceSinkVertex())); + while( !edgesToCheck.isEmpty() ) { + final BaseEdge e = edgesToCheck.iterator().next(); + if( !e.isRef() ) { + edgesToCheck.addAll( outgoingEdgesOf(getEdgeTarget(e)) ); + removeEdge(e); + } + edgesToCheck.remove(e); + } + + // Run through the graph and clean up singular orphaned nodes + final List verticesToRemove = new LinkedList(); + for( final T v : vertexSet() ) { + if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { + verticesToRemove.add(v); + } + } + removeAllVertices(verticesToRemove); + } + + protected void pruneGraph( final int pruneFactor ) { + final List edgesToRemove = new ArrayList(); + for( final BaseEdge e : edgeSet() ) { + if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor + edgesToRemove.add(e); + } + } + removeAllEdges(edgesToRemove); + + // Run through the graph and clean up singular orphaned nodes + final List verticesToRemove = new ArrayList(); + for( final T v : vertexSet() ) { + if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { + verticesToRemove.add(v); + } + } + + removeAllVertices(verticesToRemove); + } + + public void removeVerticesNotConnectedToRef() { + final HashSet toRemove = new 
HashSet(vertexSet()); + final HashSet visited = new HashSet(); + + final LinkedList toVisit = new LinkedList(); + final T refV = getReferenceSourceVertex(); + if ( refV != null ) { + toVisit.add(refV); + while ( ! toVisit.isEmpty() ) { + final T v = toVisit.pop(); + if ( ! visited.contains(v) ) { + toRemove.remove(v); + visited.add(v); + for ( final T prev : incomingVerticesOf(v) ) toVisit.add(prev); + for ( final T next : outgoingVerticesOf(v) ) toVisit.add(next); + } + } + +// for ( final T remove : toRemove ) +// logger.info("Cleaning up nodes not attached to any reference node: " + remove.toString()); + + removeAllVertices(toRemove); + } + } + + public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) { + if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) { + return false; + } + for( BaseEdge e1 : g1.edgeSet() ) { + boolean found = false; + for( BaseEdge e2 : g2.edgeSet() ) { + if( e1.equals(g1, e2, g2) ) { found = true; break; } + } + if( !found ) { return false; } + } + for( BaseEdge e2 : g2.edgeSet() ) { + boolean found = false; + for( BaseEdge e1 : g1.edgeSet() ) { + if( e2.equals(g2, e1, g1) ) { found = true; break; } + } + if( !found ) { return false; } + } + return true; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java new file mode 100644 index 000000000..fad7a51d1 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java @@ -0,0 +1,148 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; + +import java.util.Arrays; + +/** + * A graph vertex that holds some sequence information + * + * @author: depristo + * @since 03/2013 + */ +public class BaseVertex { + final byte[] sequence; + + /** + * Create a new sequence vertex with sequence + * @param sequence a non-null, non-empty sequence of bases contained in this vertex + */ + public BaseVertex(final byte[] sequence) { + if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); + if ( sequence.length == 0 ) throw new IllegalArgumentException("Sequence cannot be empty"); + + // TODO -- should we really be cloning here? 
+ this.sequence = sequence.clone(); + } + + /** + * Get the length of this sequence + * @return a positive integer >= 1 + */ + public int length() { + return sequence.length; + } + + /** + * For testing purposes only -- low performance + * @param sequence + */ + protected BaseVertex(final String sequence) { + this(sequence.getBytes()); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + BaseVertex that = (BaseVertex) o; + + if (!Arrays.equals(sequence, that.sequence)) return false; + + return true; + } + + @Override + public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect + return Arrays.hashCode(sequence); + } + + @Override + public String toString() { + return getSequenceString(); + } + + /** + * Get the sequence of bases contained in this vertex + * + * Do not modify these bytes in any way! + * + * @return a non-null pointer to the bases contained in this vertex + */ + @Ensures("result != null") + public byte[] getSequence() { + // TODO -- why is this cloning? It's likely extremely expensive + return sequence.clone(); + } + + /** + * Get a string representation of the bases in this vertex + * @return a non-null String + */ + @Ensures("result != null") + public String getSequenceString() { + return new String(sequence); + } + + /** + * Get the sequence unique to this vertex + * + * This function may not return the entire sequence stored in the vertex, as kmer graphs + * really only provide 1 base of additional sequence (the last base of the kmer). + * + * The base implementation simply returns the sequence. 
+ * + * @param source is this vertex a source vertex (i.e., no in nodes) in the graph + * @return a byte[] of the sequence added by this vertex to the overall sequence + */ + public byte[] getAdditionalSequence(final boolean source) { + return getSequence(); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 0caebebee..9d84d611f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -65,8 +65,6 @@ import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.PrintStream; import java.util.*; @@ -81,7 +79,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; - private static final byte MIN_QUALITY = (byte) 16; + public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; private static final int GRAPH_KMER_STEP = 6; // Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode @@ -91,22 +89,34 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; private final boolean debug; - private final int onlyBuildKmerGraphOfThisSite = -1; // 35; private final boolean debugGraphTransformations; private final PrintStream graphWriter; - private final List graphs = new ArrayList(); private final int minKmer; private final int maxHaplotypesToConsider; + private final byte 
minBaseQualityToUseInAssembly; + + private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; private int PRUNE_FACTOR = 2; - public DeBruijnAssembler(final boolean debug, final boolean debugGraphTransformations, final PrintStream graphWriter, final int minKmer, final int maxHaplotypesToConsider) { + protected DeBruijnAssembler() { + this(false, -1, null, 11, 1000, DEFAULT_MIN_BASE_QUALITY_TO_USE); + } + + public DeBruijnAssembler(final boolean debug, + final int debugGraphTransformations, + final PrintStream graphWriter, + final int minKmer, + final int maxHaplotypesToConsider, + final byte minBaseQualityToUseInAssembly) { super(); this.debug = debug; - this.debugGraphTransformations = debugGraphTransformations; + this.debugGraphTransformations = debugGraphTransformations > 0; + this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; this.graphWriter = graphWriter; this.minKmer = minKmer; this.maxHaplotypesToConsider = maxHaplotypesToConsider; + this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; } /** @@ -130,199 +140,73 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { this.PRUNE_FACTOR = PRUNE_FACTOR; // create the graphs - createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); + final List graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); // print the graphs if the appropriate debug option has been turned on if( graphWriter != null ) { - printGraphs(); + printGraphs(graphs); } // find the best paths in the graphs and return them as haplotypes - return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); + return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() ); } @Requires({"reads != null", "refHaplotype != null"}) - protected void createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { - 
graphs.clear(); + protected List createDeBruijnGraphs( final List reads, final Haplotype refHaplotype ) { + final List graphs = new LinkedList(); final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; - if( maxKmer < minKmer) { return; } // Reads are too small for assembly so don't try to create any assembly graphs - + if( maxKmer < minKmer) { + // Reads are too small for assembly so don't try to create any assembly graphs + return Collections.emptyList(); + } // create the graph for each possible kmer for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) { - if ( onlyBuildKmerGraphOfThisSite != -1 && kmer != onlyBuildKmerGraphOfThisSite ) + if ( debugGraphTransformations && kmer > onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) continue; if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); - DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); + DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); graph = graph.errorCorrect(); if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR); - cleanNonRefPaths(graph); - mergeNodes(graph); - if ( debugGraphTransformations ) graph.printGraph(new File("merged.dot"), PRUNE_FACTOR); - pruneGraph(graph, PRUNE_FACTOR); - if ( debugGraphTransformations ) graph.printGraph(new File("pruned.dot"), PRUNE_FACTOR); - mergeNodes(graph); - if ( debugGraphTransformations ) graph.printGraph(new File("merged2.dot"), PRUNE_FACTOR); - if( graph.getReferenceSourceVertex() != null ) { // if the graph contains 
interesting variation from the reference - sanityCheckReferenceGraph(graph, refHaplotype); - graphs.add(graph); + graph.cleanNonRefPaths(); + + final SeqGraph seqGraph = toSeqGraph(graph); + + if( seqGraph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference + sanityCheckReferenceGraph(seqGraph, refHaplotype); + graphs.add(seqGraph); + + if ( debugGraphTransformations ) // we only want to use one graph size + break; } } + } + + return graphs; } - @Requires({"graph != null"}) - protected static void mergeNodes( final DeBruijnAssemblyGraph graph ) { - boolean foundNodesToMerge = true; - while( foundNodesToMerge ) { - foundNodesToMerge = false; - - for( final DeBruijnEdge e : graph.edgeSet() ) { - final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e); - final DeBruijnVertex incomingVertex = graph.getEdgeSource(e); - if( !outgoingVertex.equals(incomingVertex) && graph.outDegreeOf(incomingVertex) == 1 && graph.inDegreeOf(outgoingVertex) == 1 && - graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) { - final Set outEdges = graph.outgoingEdgesOf(outgoingVertex); - final Set inEdges = graph.incomingEdgesOf(incomingVertex); - if( inEdges.size() == 1 && outEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - } else if( inEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } else if( outEdges.size() == 1 ) { - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } - - final DeBruijnVertex addedVertex = new DeBruijnVertex( 
ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSuffix()), outgoingVertex.kmer ); - graph.addVertex(addedVertex); - for( final DeBruijnEdge edge : outEdges ) { - graph.addEdge(addedVertex, graph.getEdgeTarget(edge), new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); - } - for( final DeBruijnEdge edge : inEdges ) { - graph.addEdge(graph.getEdgeSource(edge), addedVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); - } - - graph.removeVertex( incomingVertex ); - graph.removeVertex( outgoingVertex ); - foundNodesToMerge = true; - break; - } - } - } + private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) { + final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR); + seqGraph.pruneGraph(PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); + seqGraph.mergeNodes(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.preclean.dot"), PRUNE_FACTOR); + seqGraph.removeVerticesNotConnectedToRef(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), PRUNE_FACTOR); + seqGraph.mergeBranchingNodes(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.simplified.dot"), PRUNE_FACTOR); + seqGraph.mergeNodes(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.6.simplified.merged.dot"), PRUNE_FACTOR); + return seqGraph; } - // - // X -> ABC -> Y - // -> aBC -> Y - // - // becomes - // - // X -> A -> BCY - // -> a -> BCY - // -// @Requires({"graph != null"}) -// protected static void simplifyMergedGraph(final DeBruijnAssemblyGraph graph) { -// boolean foundNodesToMerge = true; -// while( foundNodesToMerge ) { -// foundNodesToMerge = false; -// -// for( final DeBruijnVertex v : graph.vertexSet() ) { -// if ( 
isRootOfComplexDiamond(v) ) { -// foundNodesToMerge = simplifyComplexDiamond(graph, v); -// if ( foundNodesToMerge ) -// break; -// } -// } -// } -// } -// -// private static boolean simplifyComplexDiamond(final DeBruijnAssemblyGraph graph, final DeBruijnVertex root) { -// final Set outEdges = graph.outgoingEdgesOf(root); -// final DeBruijnVertex diamondBottom = graph.getEdge(graph.getEdgeTarget(outEdges.iterator().next()); -// // all of the edges point to the same sink, so it's time to merge -// final byte[] commonSuffix = commonSuffixOfEdgeTargets(outEdges, targetSink); -// if ( commonSuffix != null ) { -// final DeBruijnVertex suffixVertex = new DeBruijnVertex(commonSuffix, graph.getKmerSize()); -// graph.addVertex(suffixVertex); -// graph.addEdge(suffixVertex, targetSink); -// -// for( final DeBruijnEdge edge : outEdges ) { -// final DeBruijnVertex target = graph.getEdgeTarget(edge); -// final DeBruijnVertex prefix = target.withoutSuffix(commonSuffix); -// graph.addEdge(prefix, suffixVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity())); -// graph.removeVertex(graph.getEdgeTarget(edge)); -// graph.removeAllEdges(root, target); -// graph.removeAllEdges(target, targetSink); -// } -// -// graph.removeAllEdges(outEdges); -// graph.removeVertex(targetSink); -// -// return true; -// } else { -// return false; -// } -// } - - protected static void cleanNonRefPaths( final DeBruijnAssemblyGraph graph ) { - if( graph.getReferenceSourceVertex() == null || graph.getReferenceSinkVertex() == null ) { - return; - } - // Remove non-ref edges connected before and after the reference path - final Set edgesToCheck = new HashSet(); - edgesToCheck.addAll(graph.incomingEdgesOf(graph.getReferenceSourceVertex())); - while( !edgesToCheck.isEmpty() ) { - final DeBruijnEdge e = edgesToCheck.iterator().next(); - if( !e.isRef() ) { - edgesToCheck.addAll( graph.incomingEdgesOf(graph.getEdgeSource(e)) ); - graph.removeEdge(e); - } - edgesToCheck.remove(e); - } - 
edgesToCheck.addAll(graph.outgoingEdgesOf(graph.getReferenceSinkVertex())); - while( !edgesToCheck.isEmpty() ) { - final DeBruijnEdge e = edgesToCheck.iterator().next(); - if( !e.isRef() ) { - edgesToCheck.addAll( graph.outgoingEdgesOf(graph.getEdgeTarget(e)) ); - graph.removeEdge(e); - } - edgesToCheck.remove(e); - } - - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new ArrayList(); - for( final DeBruijnVertex v : graph.vertexSet() ) { - if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) { - verticesToRemove.add(v); - } - } - graph.removeAllVertices(verticesToRemove); - } - - protected static void pruneGraph( final DeBruijnAssemblyGraph graph, final int pruneFactor ) { - final List edgesToRemove = new ArrayList(); - for( final DeBruijnEdge e : graph.edgeSet() ) { - if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor - edgesToRemove.add(e); - } - } - graph.removeAllEdges(edgesToRemove); - - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new ArrayList(); - for( final DeBruijnVertex v : graph.vertexSet() ) { - if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) { - verticesToRemove.add(v); - } - } - graph.removeAllVertices(verticesToRemove); - } - - protected static void sanityCheckReferenceGraph(final DeBruijnAssemblyGraph graph, final Haplotype refHaplotype) { + protected void sanityCheckReferenceGraph(final BaseGraph graph, final Haplotype refHaplotype) { if( graph.getReferenceSourceVertex() == null ) { throw new IllegalStateException("All reference graphs must have a reference source vertex."); } @@ -338,9 +222,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) - protected static DeBruijnAssemblyGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final 
Haplotype refHaplotype, final boolean DEBUG ) { + protected DeBruijnGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(KMER_LENGTH); + final DeBruijnGraph graph = new DeBruijnGraph(KMER_LENGTH); // First pull kmers from the reference haplotype and add them to the graph //logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString()); @@ -370,7 +254,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // if the qualities of all the bases in the kmers are high enough boolean badKmer = false; for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) { - if( qualities[jjj] < MIN_QUALITY ) { + if( qualities[jjj] < minBaseQualityToUseInAssembly ) { badKmer = true; break; } @@ -397,11 +281,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return graph; } - protected void printGraphs() { + protected void printGraphs(final List graphs) { final int writeFirstGraphWithSizeSmallerThan = 50; graphWriter.println("digraph assemblyGraphs {"); - for( final DeBruijnAssemblyGraph graph : graphs ) { + for( final SeqGraph graph : graphs ) { if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) { logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize()); continue; @@ -418,7 +302,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { @Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"}) @Ensures({"result.contains(refHaplotype)"}) - private List findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { + private List findBestPaths( final List graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, 
final GenomeLoc activeRegionWindow ) { // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes // TODO -- this use of an array with contains lower may be a performance problem returning in an O(N^2) algorithm @@ -440,8 +324,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - for( final DeBruijnAssemblyGraph graph : graphs ) { - for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + for( final SeqGraph graph : graphs ) { + for ( final KBestPaths.Path path : new KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { Haplotype h = new Haplotype( path.getBases() ); if( !returnHaplotypes.contains(h) ) { final Cigar cigar = path.calculateCigar(); @@ -466,6 +350,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { h.setScore(path.getScore()); returnHaplotypes.add(h); + if ( debug ) + logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize()); + // for GGA mode, add the desired allele into the haplotype if it isn't already present if( !activeAllelesToGenotype.isEmpty() ) { final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place @@ -599,7 +486,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { * @return the left-aligned cigar */ @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) - protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + protected Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { final Cigar cigarToReturn = new Cigar(); Cigar cigarToAlign = new Cigar(); for (int i = 0; i 
< cigar.numCigarElements(); i++) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java new file mode 100644 index 000000000..d9df03539 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java @@ -0,0 +1,179 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +/** + * A DeBruijn kmer graph + * + * User: rpoplin + * Date: 2/6/13 + */ +public class DeBruijnGraph extends BaseGraph { + /** + * Create an empty DeBruijnGraph with default kmer size + */ + public DeBruijnGraph() { + super(); + } + + /** + * Create an empty DeBruijnGraph with kmer size + * @param kmerSize kmer size, must be >= 1 + */ + public DeBruijnGraph(int kmerSize) { + super(kmerSize); + } + + /** + * Pull kmers out of the given long sequence and throw them on in the graph + * @param sequence byte array holding the sequence with which to build the assembly graph + * @param KMER_LENGTH the desired kmer length to use + * @param isRef if true the kmers added to the graph will have reference edges linking them + */ + public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) { + if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); } + final int kmersInSequence = sequence.length - KMER_LENGTH + 1; + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1); + } + } + + /** + * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers + * @return a freshly allocated graph + */ + protected DeBruijnGraph errorCorrect() { + final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables + + for( final BaseEdge e : edgeSet() ) { + for ( final byte[] kmer : Arrays.asList(getEdgeSource(e).getSequence(), getEdgeTarget(e).getSequence())) { + // TODO -- need a cleaner way to deal with the ref weight + 
corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity()); + } + } + corrector.computeErrorCorrectionMap(); + + final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize()); + + for( final BaseEdge e : edgeSet() ) { + final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence()); + final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence()); + if ( source != null && target != null ) { + correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); + } + } + + return correctedGraph; + } + + /** + * Add edge to assembly graph connecting the two kmers + * @param kmer1 the source kmer for the edge + * @param kmer2 the target kmer for the edge + * @param isRef true if the added edge is a reference edge + * @return will return false if trying to add a reference edge which creates a cycle in the assembly graph + */ + public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) { + if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } + if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } + if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); } + + final int numVertexBefore = vertexSet().size(); + final DeBruijnVertex v1 = new DeBruijnVertex( kmer1 ); + addVertex(v1); + final DeBruijnVertex v2 = new DeBruijnVertex( kmer2 ); + addVertex(v2); + if( isRef && vertexSet().size() == numVertexBefore ) { return false; } + + final BaseEdge targetEdge = getEdge(v1, v2); + if ( targetEdge == null ) { + addEdge(v1, v2, new BaseEdge( isRef, multiplicity )); + } else { + if( isRef ) { + targetEdge.setIsRef( true ); + } + targetEdge.setMultiplicity(targetEdge.getMultiplicity() + multiplicity); + } + return true; + } + + /** + * Convert this kmer graph to a 
simple sequence graph. + * + * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer + * graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence + * + * @return a newly allocated SequenceGraph + */ + @Ensures({"result != null"}) + protected SeqGraph convertToSequenceGraph() { + final SeqGraph seqGraph = new SeqGraph(getKmerSize()); + final Map vertexMap = new HashMap(); + + // create all of the equivalent seq graph vertices + for ( final DeBruijnVertex dv : vertexSet() ) { + final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv))); + vertexMap.put(dv, sv); + seqGraph.addVertex(sv); + } + + // walk through the nodes and connect them to their equivalent seq vertices + for( final BaseEdge e : edgeSet() ) { + final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e)); + final SeqVertex seqInV = vertexMap.get(getEdgeSource(e)); + seqGraph.addEdge(seqInV, seqOutV, e); + } + + return seqGraph; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index aa8e24576..47716b7c5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -52,59 +52,50 @@ import com.google.java.contract.Invariant; import java.util.Arrays; /** - * Created by IntelliJ IDEA. 
+ * simple node class for storing kmer sequences + * * User: ebanks * Date: Mar 23, 2011 */ -// simple node class for storing kmer sequences -@Invariant("kmer > 0") -public class DeBruijnVertex { - - protected final byte[] sequence; - public final int kmer; - - public DeBruijnVertex( final byte[] sequence, final int kmer ) { - this.sequence = sequence.clone(); - this.kmer = kmer; - } - - protected DeBruijnVertex( final String sequence, final int kmer ) { - this(sequence.getBytes(), kmer); +public class DeBruijnVertex extends BaseVertex { + public DeBruijnVertex( final byte[] sequence ) { + super(sequence); } + /** + * For testing purposes only + * @param sequence + */ protected DeBruijnVertex( final String sequence ) { - this(sequence.getBytes(), sequence.length()); + this(sequence.getBytes()); } + /** + * Get the kmer size for this DeBruijnVertex + * @return integer >= 1 + */ + @Ensures("result >= 1") public int getKmer() { - return kmer; + return sequence.length; } - @Override - public boolean equals( Object v ) { - return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence); - } - - @Override - public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect - return Arrays.hashCode(sequence); - } - - public String toString() { - return new String(sequence); - } - + /** + * Get the string representation of the suffix of this DeBruijnVertex + * @return a non-null non-empty string + */ + @Ensures({"result != null", "result.length() >= 1"}) public String getSuffixString() { return new String(getSuffix()); } @Ensures("result != null") - public byte[] getSequence() { - return sequence.clone(); + // TODO this could be replaced with byte as the suffix is guarenteed to be exactly 1 base + public byte[] getSuffix() { + return Arrays.copyOfRange( sequence, getKmer() - 1, sequence.length ); } - @Ensures("result != null") - public byte[] getSuffix() { - return 
Arrays.copyOfRange( sequence, kmer - 1, sequence.length ); + @Override + public byte[] getAdditionalSequence(boolean source) { + return source ? super.getAdditionalSequence(source) : getSuffix(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index d5f283475..7bec4bee5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -284,8 +284,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; - @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler", required = false) - protected boolean debugGraphTransformations = false; + @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) + protected int debugGraphTransformations = -1; + + @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) + protected boolean useLowQualityBasesForAssembly = false; // the UG engines private UnifiedGenotyperEngine UG_engine = null; @@ -389,7 +392,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, 
graphWriter, minKmer, maxHaplotypesToConsider ); + final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? (byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE; + assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, maxHaplotypesToConsider, minBaseQualityToUseInAssembly ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); @@ -610,7 +614,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for( final GATKSAMRecord myRead : finalizedReadList ) { final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { - GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + GATKSAMRecord clippedRead = useLowQualityBasesForAssembly ? 
postAdapterRead : ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java index e97fdb3cb..8c29cfa98 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -52,13 +52,8 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; import java.util.*; @@ -70,28 +65,27 @@ import java.util.*; */ // Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. // This is different from most graph traversals because we want to test paths from any source node to any sink node. 
-public class KBestPaths { - +public class KBestPaths { // static access only - protected KBestPaths() { } + public KBestPaths() { } + private static int MAX_PATHS_TO_HOLD = 100; protected static class MyInt { public int val = 0; } // class to keep track of paths - protected static class Path { - + protected static class Path { // the last vertex seen in the path - private final DeBruijnVertex lastVertex; + private final T lastVertex; // the list of edges comprising the path - private final List edges; + private final List edges; // the scores for the path private final int totalScore; // the graph from which this path originated - private final DeBruijnAssemblyGraph graph; + private final BaseGraph graph; // used in the bubble state machine to apply Smith-Waterman to the bubble sequence // these values were chosen via optimization against the NA12878 knowledge base @@ -101,19 +95,19 @@ public class KBestPaths { private static final double SW_GAP_EXTEND = -1.1; private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); - public Path( final DeBruijnVertex initialVertex, final DeBruijnAssemblyGraph graph ) { + public Path( final T initialVertex, final BaseGraph graph ) { lastVertex = initialVertex; - edges = new ArrayList(0); + edges = new ArrayList(0); totalScore = 0; this.graph = graph; } - public Path( final Path p, final DeBruijnEdge edge ) { + public Path( final Path p, final BaseEdge edge ) { if( !p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); } graph = p.graph; lastVertex = p.graph.getEdgeTarget(edge); - edges = new ArrayList(p.edges); + edges = new ArrayList(p.edges); edges.add(edge); totalScore = p.totalScore + edge.getMultiplicity(); } @@ -123,10 +117,10 @@ public class KBestPaths { * @param edge the given edge to test * @return true if the edge is found in this path */ - public boolean containsEdge( final DeBruijnEdge edge ) { + public boolean 
containsEdge( final BaseEdge edge ) { if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } - for( final DeBruijnEdge e : edges ) { + for( final BaseEdge e : edges ) { if( e.equals(graph, edge) ) { return true; } @@ -140,11 +134,11 @@ public class KBestPaths { * @param edge the given edge to test * @return number of times this edge appears in the path */ - public int numInPath( final DeBruijnEdge edge ) { + public int numInPath( final BaseEdge edge ) { if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } int numInPath = 0; - for( final DeBruijnEdge e : edges ) { + for( final BaseEdge e : edges ) { if( e.equals(graph, edge) ) { numInPath++; } @@ -153,22 +147,11 @@ public class KBestPaths { return numInPath; } - /** - * Does this path contain a reference edge? - * @return true if the path contains a reference edge - */ - public boolean containsRefEdge() { - for( final DeBruijnEdge e : edges ) { - if( e.isRef() ) { return true; } - } - return false; - } - - public List getEdges() { return edges; } + public List getEdges() { return edges; } public int getScore() { return totalScore; } - public DeBruijnVertex getLastVertexInPath() { return lastVertex; } + public T getLastVertexInPath() { return lastVertex; } /** * The base sequence for this path. 
Pull the full sequence for source nodes and then the suffix for all subsequent nodes @@ -179,7 +162,7 @@ public class KBestPaths { if( edges.size() == 0 ) { return graph.getAdditionalSequence(lastVertex); } byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edges.get(0))); - for( final DeBruijnEdge e : edges ) { + for( final BaseEdge e : edges ) { bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); } return bases; @@ -201,9 +184,9 @@ public class KBestPaths { } // reset the bubble state machine - final BubbleStateMachine bsm = new BubbleStateMachine(cigar); + final BubbleStateMachine bsm = new BubbleStateMachine(cigar); - for( final DeBruijnEdge e : edges ) { + for( final BaseEdge e : edges ) { if( e.equals(graph, edges.get(0)) ) { advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null ); } @@ -231,7 +214,7 @@ public class KBestPaths { * @param e the edge which generated this node in the path */ @Requires({"bsm != null", "graph != null", "node != null"}) - private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final DeBruijnVertex node, final DeBruijnEdge e ) { + private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final T node, final BaseEdge e ) { if( graph.isReferenceNode( node ) ) { if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else if( e !=null && !e.isRef() ) { @@ -283,7 +266,7 @@ public class KBestPaths { */ @Requires({"graph != null"}) @Ensures({"result != null"}) - private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex ) { + private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) { final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? 
graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null); final Cigar returnCigar = new Cigar(); @@ -328,10 +311,10 @@ public class KBestPaths { } // class to keep track of the bubble state machine - protected static class BubbleStateMachine { + protected static class BubbleStateMachine { public boolean inBubble = false; public byte[] bubbleBytes = null; - public DeBruijnVertex lastSeenReferenceNode = null; + public T lastSeenReferenceNode = null; public Cigar cigar = null; public BubbleStateMachine( final Cigar initialCigar ) { @@ -358,14 +341,14 @@ public class KBestPaths { * @return a list with at most k top-scoring paths from the graph */ @Ensures({"result != null", "result.size() <= k"}) - public static List getKBestPaths( final DeBruijnAssemblyGraph graph, final int k ) { + public List getKBestPaths( final BaseGraph graph, final int k ) { if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); } if( k > MAX_PATHS_TO_HOLD/2 ) { throw new IllegalArgumentException("Asked for more paths than internal parameters allow for."); } final ArrayList bestPaths = new ArrayList(); // run a DFS for best paths - for( final DeBruijnVertex v : graph.vertexSet() ) { + for( final T v : graph.vertexSet() ) { if( graph.inDegreeOf(v) == 0 ) { findBestPaths(new Path(v, graph), bestPaths); } @@ -376,31 +359,28 @@ public class KBestPaths { return bestPaths.subList(0, Math.min(k, bestPaths.size())); } - private static void findBestPaths( final Path path, final List bestPaths ) { + private void findBestPaths( final Path path, final List bestPaths ) { findBestPaths(path, bestPaths, new MyInt()); } - private static void findBestPaths( final Path path, final List bestPaths, final MyInt n ) { + private void findBestPaths( final Path path, final List bestPaths, final MyInt n ) { // did we hit the end of a path? 
if ( allOutgoingEdgesHaveBeenVisited(path) ) { - if( path.containsRefEdge() ) { - if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) { - // clean out some low scoring paths - Collections.sort(bestPaths, new PathComparatorTotalScore() ); - for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20 - } - bestPaths.add(path); + if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) { + // clean out some low scoring paths + Collections.sort(bestPaths, new PathComparatorTotalScore() ); + for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20 } + bestPaths.add(path); } else if( n.val > 10000) { // do nothing, just return } else { // recursively run DFS - final ArrayList edgeArrayList = new ArrayList(); + final ArrayList edgeArrayList = new ArrayList(); edgeArrayList.addAll(path.graph.outgoingEdgesOf(path.lastVertex)); - Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator()); - Collections.reverse(edgeArrayList); - for ( final DeBruijnEdge edge : edgeArrayList ) { + Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator()); + for ( final BaseEdge edge : edgeArrayList ) { // make sure the edge is not already in the path if ( path.containsEdge(edge) ) continue; @@ -416,8 +396,8 @@ public class KBestPaths { * @param path the path to test * @return true if all the outgoing edges at the end of this path have already been visited */ - private static boolean allOutgoingEdgesHaveBeenVisited( final Path path ) { - for( final DeBruijnEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) { + private boolean allOutgoingEdgesHaveBeenVisited( final Path path ) { + for( final BaseEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) { if( !path.containsEdge(edge) ) { // TODO -- investigate allowing numInPath < 2 to allow cycles return false; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java index 66ea8a078..05bd1b881 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java @@ -226,28 +226,16 @@ public class KMerErrorCorrector { @Override public String toString() { final StringBuilder b = new StringBuilder("KMerErrorCorrector{"); - for ( Map.Entry toCorrect : rawToErrorCorrectedMap.entrySet() ) { - final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue()); - if ( correcting ) - b.append(String.format("%n\t%s / %d -> %s / %d [correcting? %b]", - toCorrect.getKey(), getCounts(toCorrect.getKey()), - toCorrect.getValue(), getCounts(toCorrect.getValue()), - correcting)); + if ( rawToErrorCorrectedMap == null ) { + b.append("counting ").append(countsByKMer.size()).append(" distinct kmers"); + } else { + for ( Map.Entry toCorrect : rawToErrorCorrectedMap.entrySet() ) { + final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue()); + if ( correcting ) + b.append(String.format("%n\tCorrecting %s -> %s", toCorrect.getKey(), toCorrect.getValue())); + } } b.append("\n}"); return b.toString(); } - - /** - * Get a simple count estimate for printing for kmer - * @param kmer the kmer - * @return an integer count for kmer - */ - private int getCounts(final String kmer) { - if ( kmer == null ) return 0; - final Integer count = countsByKMer == null ? 
-1 : countsByKMer.get(kmer); - if ( count == null ) - throw new IllegalArgumentException("kmer not found in counts -- bug " + kmer); - return count; - } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java new file mode 100644 index 000000000..960f2cdd7 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java @@ -0,0 +1,280 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; + +import java.util.*; + +/** + * A graph that contains base sequence at each node + * + * @author: depristo + * @since 03/2013 + */ +public class SeqGraph extends BaseGraph { + /** + * Construct an empty SeqGraph + */ + public SeqGraph() { + super(); + } + + /** + * Construct an empty SeqGraph where we'll add nodes based on a kmer size of kmer + * + * The kmer size is purely information. It is useful when converting a Debruijn graph -> SeqGraph + * for us to track the kmer used to make the transformation. + * + * @param kmer kmer + */ + public SeqGraph(final int kmer) { + super(kmer); + } + + protected void mergeNodes() { + zipLinearChains(); + } + + protected void zipLinearChains() { + boolean foundNodesToMerge = true; + while( foundNodesToMerge ) { + foundNodesToMerge = false; + + for( final BaseEdge e : edgeSet() ) { + final SeqVertex outgoingVertex = getEdgeTarget(e); + final SeqVertex incomingVertex = getEdgeSource(e); + if( !outgoingVertex.equals(incomingVertex) + && outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1 + && isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) { + + final Set outEdges = outgoingEdgesOf(outgoingVertex); + final Set inEdges = incomingEdgesOf(incomingVertex); + if( inEdges.size() == 1 && outEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + } else if( inEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + } else if( outEdges.size() == 1 ) { + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( 
e.getMultiplicity() - 1 ) ); + } + + final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); + addVertex(addedVertex); + for( final BaseEdge edge : outEdges ) { + addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity())); + } + for( final BaseEdge edge : inEdges ) { + addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity())); + } + + removeVertex(incomingVertex); + removeVertex(outgoingVertex); + foundNodesToMerge = true; + break; + } + } + } + } + + // + // X -> ABC -> Y + // -> aBC -> Y + // + // becomes + // + // X -> A -> BCY + // -> a -> BCY + // + public void mergeBranchingNodes() { + boolean foundNodesToMerge = true; + while( foundNodesToMerge ) { + foundNodesToMerge = false; + + for( final SeqVertex v : vertexSet() ) { + foundNodesToMerge = simplifyDiamond(v); + if ( foundNodesToMerge ) + break; + } + } + } + + /** + * A simple structure that looks like: + * + * v + * / | \ \ + * m1 m2 m3 ... mn + * \ | / / + * b + * + * @param v + * @return + */ + protected boolean isRootOfDiamond(final SeqVertex v) { + final Set ve = outgoingEdgesOf(v); + if ( ve.size() <= 1 ) + return false; + + SeqVertex bottom = null; + for ( final BaseEdge e : ve ) { + final SeqVertex mi = getEdgeTarget(e); + + // all nodes must have at least 1 connection + if ( outDegreeOf(mi) < 1 ) + return false; + + // can only have 1 incoming node, the root vertex + if ( inDegreeOf(mi) != 1 ) + return false; + + for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { + if ( bottom == null ) + bottom = mt; + else if ( ! 
bottom.equals(mt) ) + return false; + } + } + + return true; + } + + private byte[] commonSuffixOfEdgeTargets(final Set middleVertices) { + final String[] kmers = new String[middleVertices.size()]; + + int i = 0; + for ( final SeqVertex v : middleVertices ) { + kmers[i++] = (StringUtils.reverse(v.getSequenceString())); + } + + final String commonPrefix = StringUtils.getCommonPrefix(kmers); + return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes(); + } + + private SeqVertex getDiamondBottom(final SeqVertex top) { + final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next(); + final SeqVertex middle = getEdgeTarget(topEdge); + final BaseEdge middleEdge = outgoingEdgesOf(middle).iterator().next(); + return getEdgeTarget(middleEdge); + } + + final Set getMiddleVertices(final SeqVertex top) { + final Set middles = new HashSet(); + for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) { + middles.add(getEdgeTarget(topToMiddle)); + } + return middles; + } + + private boolean simplifyDiamond(final SeqVertex top) { + if ( ! 
isRootOfDiamond(top) ) + return false; + + final SeqVertex diamondBottom = getDiamondBottom(top); + final Set middleVertices = getMiddleVertices(top); + + final List verticesToRemove = new LinkedList(); + final List edgesToRemove = new LinkedList(); + + // all of the edges point to the same sink, so it's time to merge + final byte[] commonSuffix = commonSuffixOfEdgeTargets(middleVertices); + if ( commonSuffix != null ) { + boolean newBottomEdgeIsRef = false; + int newBottomEdgeMultiplicity = 0; + + final SeqVertex newBottomV = new SeqVertex(commonSuffix); + addVertex(newBottomV); + + for ( final SeqVertex middle : middleVertices ) { + boolean missingNodeEdgeIsRef = false; + int missingNodeMultiplicity = 0; + final SeqVertex withoutSuffix = middle.withoutSuffix(commonSuffix); + + if ( withoutSuffix != null ) // this node is a deletion + addVertex(withoutSuffix); + + // update all edges from top -> middle to be top -> without suffix + for( final BaseEdge topToMiddleEdge : getAllEdges(top, middle) ) { + edgesToRemove.add(topToMiddleEdge); + missingNodeMultiplicity += topToMiddleEdge.getMultiplicity(); + missingNodeEdgeIsRef = missingNodeEdgeIsRef || topToMiddleEdge.isRef(); + if ( withoutSuffix != null ) // this node is a deletion + addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge.isRef(), topToMiddleEdge.getMultiplicity())); + } + + // reattached prefix to the new bottom V by updating all edges from middleV -> bottom + for ( final BaseEdge middleToBottomE : getAllEdges(middle, diamondBottom) ) { + missingNodeMultiplicity += middleToBottomE.getMultiplicity(); + missingNodeEdgeIsRef = missingNodeEdgeIsRef || middleToBottomE.isRef(); + + if ( withoutSuffix != null ) // this node is a deletion + addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE.isRef(), middleToBottomE.getMultiplicity())); + edgesToRemove.add(middleToBottomE); + + // update the info for the new bottom edge + newBottomEdgeIsRef = newBottomEdgeIsRef || middleToBottomE.isRef(); + 
newBottomEdgeMultiplicity += middleToBottomE.getMultiplicity(); + } + + if ( withoutSuffix == null ) // add an edge from top to new bottom + addEdge(top, newBottomV, new BaseEdge(missingNodeEdgeIsRef, missingNodeMultiplicity)); + + verticesToRemove.add(middle); + } + + addEdge(newBottomV, diamondBottom, new BaseEdge(newBottomEdgeIsRef, newBottomEdgeMultiplicity)); + + removeAllEdges(edgesToRemove); + removeAllVertices(verticesToRemove); + + return true; + } else { + return false; + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java new file mode 100644 index 000000000..b45ac0c34 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java @@ -0,0 +1,153 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.Utils; + +import java.util.Arrays; + +/** + * A graph vertex containing a sequence of bases and a unique ID that + * allows multiple distinct nodes in the graph to have the same sequence. + * + * This is essential when thinking about representing the actual sequence of a haplotype + * in a graph. There can be many parts of the sequence that have the same sequence, but + * are distinct elements in the graph because they have a different position in the graph. For example: + * + * A -> C -> G -> A -> T + * + * The two As are not the same, because they occur with different connections. In a kmer graph equals() + * is based on the sequence itself, as each distinct kmer can only be represented once. But the transformation + * of the kmer graph into a graph of base sequences, without their kmer prefixes, means that nodes that + * were once unique including their prefix can become equal after shedding the prefix. So we need to + * use some mechanism -- here a unique ID per node -- to separate nodes that have the same sequence + * but are distinct elements of the graph. 
+ * + * @author: depristo + * @since 03/2013 + */ +public class SeqVertex extends BaseVertex { + private static int idCounter = 0; + public final int id; + + /** + * Create a new SeqVertex with sequence and the next available id + * @param sequence our base sequence + */ + public SeqVertex(final byte[] sequence) { + super(sequence); + this.id = idCounter++; + } + + /** + * Create a new SeqVertex having bases of sequence.getBytes() + * @param sequence the string representation of our bases + */ + public SeqVertex(final String sequence) { + super(sequence); + this.id = idCounter++; + } + + /** + * Create a copy of toCopy + * @param toCopy a SeqVertex to copy into this newly allocated one + */ + public SeqVertex(final SeqVertex toCopy) { + super(toCopy.sequence); + this.id = toCopy.id; + } + + /** + * Get the unique ID for this SeqVertex + * @return a positive integer >= 0 + */ + public int getId() { + return id; + } + + @Override + public String toString() { + return "SeqVertex_id_" + id + "_seq_" + getSequenceString(); + } + + /** + * Two SeqVertex are equal only if their ids are equal + * @param o + * @return + */ + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + SeqVertex seqVertex = (SeqVertex) o; + if (id != seqVertex.id) return false; + + // note that we don't test for super equality here because the ids are unique + //if (!super.equals(o)) return false; + + return true; + } + + @Override + public int hashCode() { + return id; + } + + /** + * Return a new SeqVertex derived from this one but not including the suffix bases + * + * @param suffix the suffix bases to remove from this vertex + * @return a newly allocated SeqVertex with appropriate prefix, or null if suffix removes all bases from this node + */ + @Requires("Utils.endsWith(sequence, suffix)") + public SeqVertex withoutSuffix(final byte[] suffix) { + final int prefixSize = sequence.length - suffix.length; + 
return prefixSize > 0 ? new SeqVertex(Arrays.copyOf(sequence, prefixSize)) : null; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java new file mode 100644 index 000000000..3cc44c7de --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java @@ -0,0 +1,105 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class BaseEdgeUnitTest extends BaseTest { + @DataProvider(name = "EdgeCreationData") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + for ( final int multiplicity : Arrays.asList(1, 2, 3) ) { + for ( final boolean isRef : Arrays.asList(true, false) ) { + tests.add(new Object[]{isRef, multiplicity}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "EdgeCreationData") + public void testBasic(final boolean isRef, final int mult) { + final BaseEdge e = new BaseEdge(isRef, mult); + Assert.assertEquals(e.isRef(), isRef); + Assert.assertEquals(e.getMultiplicity(), mult); + + e.setIsRef(!isRef); + Assert.assertEquals(e.isRef(), !isRef); + + e.setMultiplicity(mult + 1); + Assert.assertEquals(e.getMultiplicity(), mult + 1); + + final BaseEdge copy = new BaseEdge(e); + Assert.assertEquals(copy.isRef(), e.isRef()); + Assert.assertEquals(copy.getMultiplicity(), e.getMultiplicity()); + } + + @Test + public void testEdgeWeightComparator() { + final BaseEdge e10 = new BaseEdge(false, 10); + final BaseEdge e5 = new BaseEdge(true, 5); + final BaseEdge e2 = new BaseEdge(false, 2); + final BaseEdge e1 = new BaseEdge(false, 1); + + final List edges = new ArrayList(Arrays.asList(e1, e2, e5, e10)); + Collections.sort(edges, new BaseEdge.EdgeWeightComparator()); + Assert.assertEquals(edges.get(0), e10); + Assert.assertEquals(edges.get(1), e5); + Assert.assertEquals(edges.get(2), e2); + Assert.assertEquals(edges.get(3), e1); + } +} diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java new file mode 100644 index 000000000..463e861b1 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java @@ -0,0 +1,192 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; +import java.util.Arrays; + +import java.io.File; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +/** + * Created with IntelliJ IDEA. 
+ * User: depristo + * Date: 3/15/13 + * Time: 3:36 PM + * To change this template use File | Settings | File Templates. + */ +public class BaseGraphUnitTest extends BaseTest { + SeqGraph graph; + SeqVertex v1, v2, v3, v4, v5; + + @BeforeMethod + public void setUp() throws Exception { + graph = new SeqGraph(); + + v1 = new SeqVertex("A"); + v2 = new SeqVertex("C"); + v3 = new SeqVertex("C"); + v4 = new SeqVertex("C"); + v5 = new SeqVertex("C"); + + graph.addVertices(v1, v2, v3, v4, v5); + graph.addEdge(v1, v2); + graph.addEdge(v2, v4); + graph.addEdge(v3, v2); + graph.addEdge(v2, v3); + graph.addEdge(v4, v5); + } + + @Test + public void testIncomingAndOutgoingVertices() throws Exception { + assertVertexSetEquals(graph.outgoingVerticesOf(v1), v2); + assertVertexSetEquals(graph.incomingVerticesOf(v1)); + + assertVertexSetEquals(graph.outgoingVerticesOf(v2), v3, v4); + assertVertexSetEquals(graph.incomingVerticesOf(v2), v1, v3); + + assertVertexSetEquals(graph.outgoingVerticesOf(v3), v2); + assertVertexSetEquals(graph.incomingVerticesOf(v3), v2); + + assertVertexSetEquals(graph.outgoingVerticesOf(v4), v5); + assertVertexSetEquals(graph.incomingVerticesOf(v4), v2); + + assertVertexSetEquals(graph.outgoingVerticesOf(v5)); + assertVertexSetEquals(graph.incomingVerticesOf(v5), v4); + } + + @Test + public void testPrintEmptyGraph() throws Exception { + final File tmp = File.createTempFile("tmp", "dot"); + tmp.deleteOnExit(); + new SeqGraph().printGraph(tmp, 10); + new DeBruijnGraph().printGraph(tmp, 10); + } + + @Test + public void testComplexGraph() throws Exception { + final File tmp = File.createTempFile("tmp", "dot"); + tmp.deleteOnExit(); + graph.printGraph(tmp, 10); + } + + private void assertVertexSetEquals(final Set actual, final SeqVertex ... expected) { + final Set expectedSet = expected == null ? 
Collections.emptySet() : new HashSet(Arrays.asList(expected)); + Assert.assertEquals(actual, expectedSet); + } + + @Test(enabled = true) + public void testPruneGraph() { + DeBruijnGraph graph = new DeBruijnGraph(); + DeBruijnGraph expectedGraph = new DeBruijnGraph(); + + DeBruijnVertex v = new DeBruijnVertex("ATGG"); + DeBruijnVertex v2 = new DeBruijnVertex("ATGGA"); + DeBruijnVertex v3 = new DeBruijnVertex("ATGGT"); + DeBruijnVertex v4 = new DeBruijnVertex("ATGGG"); + DeBruijnVertex v5 = new DeBruijnVertex("ATGGC"); + DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC"); + + graph.addVertex(v); + graph.addVertex(v2); + graph.addVertex(v3); + graph.addVertex(v4); + graph.addVertex(v5); + graph.addVertex(v6); + graph.addEdge(v, v2, new BaseEdge(false, 1)); + graph.addEdge(v2, v3, new BaseEdge(false, 3)); + graph.addEdge(v3, v4, new BaseEdge(false, 5)); + graph.addEdge(v4, v5, new BaseEdge(false, 3)); + graph.addEdge(v5, v6, new BaseEdge(false, 2)); + + expectedGraph.addVertex(v2); + expectedGraph.addVertex(v3); + expectedGraph.addVertex(v4); + expectedGraph.addVertex(v5); + expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3)); + expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5)); + expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3)); + + graph.pruneGraph(2); + + Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph)); + + graph = new DeBruijnGraph(); + expectedGraph = new DeBruijnGraph(); + + graph.addVertex(v); + graph.addVertex(v2); + graph.addVertex(v3); + graph.addVertex(v4); + graph.addVertex(v5); + graph.addVertex(v6); + graph.addEdge(v, v2, new BaseEdge(true, 1)); + graph.addEdge(v2, v3, new BaseEdge(false, 3)); + graph.addEdge(v3, v4, new BaseEdge(false, 5)); + graph.addEdge(v4, v5, new BaseEdge(false, 3)); + + expectedGraph.addVertex(v); + expectedGraph.addVertex(v2); + expectedGraph.addVertex(v3); + expectedGraph.addVertex(v4); + expectedGraph.addVertex(v5); + expectedGraph.addEdge(v, v2, new BaseEdge(true, 1)); + expectedGraph.addEdge(v2, v3, 
new BaseEdge(false, 3)); + expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5)); + expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3)); + + graph.pruneGraph(2); + + Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph)); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java new file mode 100644 index 000000000..cd27c7183 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java @@ -0,0 +1,91 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class BaseVertexUnitTest extends BaseTest { + @Test + public void testBasic() { + final byte[] bases = "ACT".getBytes(); + final BaseVertex v = new BaseVertex(bases); + Assert.assertEquals(v.getSequence(), bases); + Assert.assertEquals(v.getAdditionalSequence(false), bases); + Assert.assertEquals(v.getAdditionalSequence(true), bases); + Assert.assertEquals(v.getSequenceString(), new String(bases)); + Assert.assertEquals(v.toString(), v.getSequenceString()); + Assert.assertEquals(v.length(), bases.length); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testCreationNull() { + new BaseVertex((byte[])null); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testCreationEmptySeq() { + new BaseVertex(new byte[0]); + } + + @Test + public void testEqualsAndHashCode() { + final BaseVertex v1 = new BaseVertex("ACT".getBytes()); + final BaseVertex v1_eq = new BaseVertex("ACT".getBytes()); + final BaseVertex v2 = new BaseVertex("ACG".getBytes()); + + Assert.assertEquals(v1, v1); + Assert.assertEquals(v1.hashCode(), v1.hashCode()); + Assert.assertEquals(v1, v1_eq); + Assert.assertEquals(v1.hashCode(), v1_eq.hashCode()); + Assert.assertFalse(v1.equals(v2)); + Assert.assertFalse(v2.equals(v1)); + Assert.assertFalse(v2.hashCode() == v1.hashCode()); + Assert.assertFalse(v2.equals(v1)); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index 2096b487e..fa581f7fd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -69,211 +69,12 @@ import java.util.*; public class DeBruijnAssemblerUnitTest extends BaseTest { private final static boolean DEBUG = true; - - private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { - public byte[] sequence; - public int KMER_LENGTH; - - public MergeNodesWithNoVariationTestProvider(String seq, int kmer) { - super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. kmer = %d, seq = %s", kmer, seq)); - sequence = seq.getBytes(); - KMER_LENGTH = kmer; - } - - public DeBruijnAssemblyGraph expectedGraph() { - DeBruijnVertex v = new DeBruijnVertex(sequence, KMER_LENGTH); - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - graph.addVertex(v); - return graph; - } - - public DeBruijnAssemblyGraph calcGraph() { - - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - final int kmersInSequence = sequence.length - KMER_LENGTH + 1; - for (int i = 0; i < kmersInSequence - 1; i++) { - // get the kmers - final byte[] kmer1 = new byte[KMER_LENGTH]; - System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH); - final byte[] kmer2 = new byte[KMER_LENGTH]; - System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH); - - graph.addKmersToGraph(kmer1, kmer2, false, 1); - } - DeBruijnAssembler.mergeNodes(graph); - return graph; - } - } - - @DataProvider(name = "MergeNodesWithNoVariationTestProvider") - public Object[][] makeMergeNodesWithNoVariationTests() { - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3); - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4); - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5); - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6); - new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7); - new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6); - new 
MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66); - new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76); - - return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class); - } - - @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = !DEBUG) - public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) { - logger.warn(String.format("Test: %s", cfg.toString())); - Assert.assertTrue(graphEquals(cfg.calcGraph(), cfg.expectedGraph())); - } - -// @DataProvider(name = "SimpleMergeOperationsData") -// public Object[][] makeSimpleMergeOperationsData() { -// List tests = new ArrayList(); -// -// { -// DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); -// DeBruijnVertex v1 = new DeBruijnVertex("AT"); -// DeBruijnVertex v2 = new DeBruijnVertex("TC"); -// DeBruijnVertex v3 = new DeBruijnVertex("CT"); -// DeBruijnVertex v4 = new DeBruijnVertex("TG"); -// DeBruijnVertex v5 = new DeBruijnVertex("AG"); -// DeBruijnVertex v6 = new DeBruijnVertex("GG"); -// DeBruijnVertex v7 = new 
DeBruijnVertex("GA"); -// DeBruijnVertex v8 = new DeBruijnVertex("AA"); -// -// graph.addVertices(v1, v2, v3, v4, v5, v6, v7, v8); -// graph.addEdge(v1, v2, new DeBruijnEdge(false, 2)); -// graph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); -// graph.addEdge(v2, v4, new DeBruijnEdge(false, 5)); -// graph.addEdge(v3, v5, new DeBruijnEdge(false, 3)); -// graph.addEdge(v4, v6, new DeBruijnEdge(false, 3)); -// graph.addEdge(v5, v7, new DeBruijnEdge(false, 2)); -// graph.addEdge(v6, v7, new DeBruijnEdge(false, 6)); -// graph.addEdge(v7, v8, new DeBruijnEdge(false, 2)); -// -// graph.printGraph(new File("unittest.dot"), 1); -// -// DeBruijnAssemblyGraph expected = new DeBruijnAssemblyGraph(); -// DeBruijnVertex e1 = new DeBruijnVertex("ATC"); -// DeBruijnVertex e2 = new DeBruijnVertex("T"); -// DeBruijnVertex e3 = new DeBruijnVertex("G"); -// DeBruijnVertex e4 = new DeBruijnVertex("GAA"); -// -// expected.addVertices(e1,e2,e3,e4); -// expected.addEdge(e1, e2, new DeBruijnEdge(false, 3)); -// expected.addEdge(e1, e3, new DeBruijnEdge(false, 5)); -// expected.addEdge(e2, e4, new DeBruijnEdge(false, 2)); -// expected.addEdge(e3, e4, new DeBruijnEdge(false, 6)); -// -// expected.printGraph(new File("expected.dot"), 1); -// -// tests.add(new Object[]{graph.clone(), expected}); -// } -// -// return tests.toArray(new Object[][]{}); -// } -// -// @Test(dataProvider = "SimpleMergeOperationsData", enabled = true) -// public void testSimpleMergeOperations(final DeBruijnAssemblyGraph unmergedGraph, final DeBruijnAssemblyGraph expectedGraph) throws Exception { -// final DeBruijnAssemblyGraph mergedGraph = (DeBruijnAssemblyGraph)unmergedGraph.clone(); -// DeBruijnAssembler.mergeNodes(mergedGraph); -// mergedGraph.printGraph(new File("merged.dot"), 1); -// DeBruijnAssembler.simplifyMergedGraph(mergedGraph); -// mergedGraph.printGraph(new File("reduced.dot"), 1); -// Assert.assertTrue(graphEquals(mergedGraph, expectedGraph)); -// } - - @Test(enabled = !DEBUG) - public void 
testPruneGraph() { - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - DeBruijnAssemblyGraph expectedGraph = new DeBruijnAssemblyGraph(); - - DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 1); - DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 1); - DeBruijnVertex v3 = new DeBruijnVertex("ATGGT".getBytes(), 1); - DeBruijnVertex v4 = new DeBruijnVertex("ATGGG".getBytes(), 1); - DeBruijnVertex v5 = new DeBruijnVertex("ATGGC".getBytes(), 1); - DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC".getBytes(), 1); - - graph.addVertex(v); - graph.addVertex(v2); - graph.addVertex(v3); - graph.addVertex(v4); - graph.addVertex(v5); - graph.addVertex(v6); - graph.addEdge(v, v2, new DeBruijnEdge(false, 1)); - graph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); - graph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); - graph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - graph.addEdge(v5, v6, new DeBruijnEdge(false, 2)); - - expectedGraph.addVertex(v2); - expectedGraph.addVertex(v3); - expectedGraph.addVertex(v4); - expectedGraph.addVertex(v5); - expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); - expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); - expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - - DeBruijnAssembler.pruneGraph(graph, 2); - - Assert.assertTrue(graphEquals(graph, expectedGraph)); - - graph = new DeBruijnAssemblyGraph(); - expectedGraph = new DeBruijnAssemblyGraph(); - - graph.addVertex(v); - graph.addVertex(v2); - graph.addVertex(v3); - graph.addVertex(v4); - graph.addVertex(v5); - graph.addVertex(v6); - graph.addEdge(v, v2, new DeBruijnEdge(true, 1)); - graph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); - graph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); - graph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - - expectedGraph.addVertex(v); - expectedGraph.addVertex(v2); - expectedGraph.addVertex(v3); - expectedGraph.addVertex(v4); - expectedGraph.addVertex(v5); - expectedGraph.addEdge(v, v2, new 
DeBruijnEdge(true, 1)); - expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3)); - expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5)); - expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3)); - - DeBruijnAssembler.pruneGraph(graph, 2); - - Assert.assertTrue(graphEquals(graph, expectedGraph)); - } - - private boolean graphEquals(DeBruijnAssemblyGraph g1, DeBruijnAssemblyGraph g2) { - if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) { - return false; - } - for( DeBruijnEdge e1 : g1.edgeSet() ) { - boolean found = false; - for( DeBruijnEdge e2 : g2.edgeSet() ) { - if( e1.equals(g1, e2, g2) ) { found = true; break; } - } - if( !found ) { return false; } - } - for( DeBruijnEdge e2 : g2.edgeSet() ) { - boolean found = false; - for( DeBruijnEdge e1 : g1.edgeSet() ) { - if( e2.equals(g2, e1, g1) ) { found = true; break; } - } - if( !found ) { return false; } - } - return true; - } - @Test(enabled = !DEBUG) public void testReferenceCycleGraph() { String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC"; String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC"; - final DeBruijnAssemblyGraph g1 = DeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false); - final DeBruijnAssemblyGraph g2 = DeBruijnAssembler.createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false); + final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false); + final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false); Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation."); Assert.assertTrue(g2 != null, "Reference non-cycle 
graph should not return null during creation."); @@ -313,7 +114,7 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString; String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString; - Cigar calculatedCigar = DeBruijnAssembler.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); + Cigar calculatedCigar = new DeBruijnAssembler().leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java index 5a1497236..2b87cf61d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java @@ -75,7 +75,7 @@ public class DeBruijnAssemblyGraphUnitTest { } public byte[] calculatedReferenceBytes() { - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); + DeBruijnGraph graph = new DeBruijnGraph(); graph.addSequenceToGraph(refSequence, KMER_LENGTH, true); if( altSequence.length > 0 ) { graph.addSequenceToGraph(altSequence, KMER_LENGTH, false); diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java new file mode 100644 index 000000000..2db35e173 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java @@ -0,0 +1,69 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.annotations.Test; +import org.testng.Assert; + +public class DeBruijnVertexUnitTest extends BaseTest { + @Test + public void testBasic() { + final byte[] bases = "ACT".getBytes(); + final DeBruijnVertex v = new DeBruijnVertex(bases); + Assert.assertEquals(v.getSequence(), bases); + Assert.assertEquals(v.getSequenceString(), new String(bases)); + Assert.assertEquals(v.length(), bases.length); + Assert.assertEquals(v.getSuffix().length, 1); + Assert.assertEquals(v.getSuffix()[0], (byte)'T'); + Assert.assertEquals(v.getSuffixString(), "T"); + + Assert.assertEquals(v.getAdditionalSequence(true), bases); + Assert.assertEquals(v.getAdditionalSequence(false).length, 1); + Assert.assertEquals(v.getAdditionalSequence(false)[0], (byte)'T'); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java index 53400b790..10863cef9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java @@ -49,14 +49,13 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.jgrapht.graph.DefaultDirectedGraph; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -79,58 +78,105 @@ public class KBestPathsUnitTest { return 
tests.toArray(new Object[][]{}); } - @Test(dataProvider = "BasicBubbleDataProvider") + @Test(dataProvider = "BasicBubbleDataProvider", enabled = true) public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { // Construct the assembly graph - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - final int KMER_LENGTH = 3; + SeqGraph graph = new SeqGraph(3); final String preRef = "ATGG"; - final String postRef = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "GGGGC"; + final String postRef = "GGGGC"; - DeBruijnVertex v = new DeBruijnVertex(preRef.getBytes(), KMER_LENGTH); - DeBruijnVertex v2Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'A', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH); - DeBruijnVertex v2Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'A', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH); - DeBruijnVertex v3 = new DeBruijnVertex(postRef.getBytes(), KMER_LENGTH); + SeqVertex v = new SeqVertex(preRef); + SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); + SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T"); + SeqVertex v3 = new SeqVertex(postRef); graph.addVertex(v); graph.addVertex(v2Ref); graph.addVertex(v2Alt); graph.addVertex(v3); - graph.addEdge(v, v2Ref, new DeBruijnEdge(true, 10)); - graph.addEdge(v2Ref, v3, new DeBruijnEdge(true, 10)); - graph.addEdge(v, v2Alt, new DeBruijnEdge(false, 5)); - graph.addEdge(v2Alt, v3, new DeBruijnEdge(false, 5)); + graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); + graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); + graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); + graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); + + graph.printGraph(new File("test.dot"), 10); // Construct the test path - KBestPaths.Path path = new KBestPaths.Path(v, graph); - path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); + 
KBestPaths.Path path = new KBestPaths.Path(v, graph); + path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); + path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); // Construct the actual cigar string implied by the test path Cigar expectedCigar = new Cigar(); expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); if( refBubbleLength > altBubbleLength ) { expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); - expectedCigar.add(new CigarElement(altBubbleLength,CigarOperator.M)); + expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); } else if ( refBubbleLength < altBubbleLength ) { - expectedCigar.add(new CigarElement(refBubbleLength,CigarOperator.M)); + expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); } else { expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); } - expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); } + // TODO -- test block substitution because it doesn't look like it's correct now +// @Test(dataProvider = "BasicBubbleDataProvider") +// public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { +// // Construct the assembly graph +// final int KMER_LENGTH = 3; +// SeqGraph graph = new SeqGraph(KMER_LENGTH); +// final String preRef = "ATGG"; +// final String postRef = "GGGGC"; +// +// SeqVertex v = new SeqVertex(preRef); +// SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); +// SeqVertex v2Alt = new SeqVertex(Utils.dupString('T', altBubbleLength)); +// SeqVertex v3 = new SeqVertex(postRef); +// +// graph.addVertex(v); +// 
graph.addVertex(v2Ref); +// graph.addVertex(v2Alt); +// graph.addVertex(v3); +// graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); +// graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); +// graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); +// graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); +// +// graph.printGraph(new File("test.dot"), 10); +// +// // Construct the test path +// KBestPaths.Path path = new KBestPaths.Path(v, graph); +// path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); +// path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); +// +// // Construct the actual cigar string implied by the test path +// Cigar expectedCigar = new Cigar(); +// expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); +// if( refBubbleLength > altBubbleLength ) { +// expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); +// expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); +// } else if ( refBubbleLength < altBubbleLength ) { +// expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); +// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); +// } else { +// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); +// } +// expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); +// +// Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); +// } @DataProvider(name = "TripleBubbleDataProvider") public Object[][] makeTripleBubbleDataProvider() { List tests = new ArrayList(); for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { - for ( final boolean offRefBeginning : Arrays.asList(false) ) { - for ( final boolean offRefEnding : Arrays.asList(true, false) ) { + for ( final boolean offRefEnding : Arrays.asList(true, false) ) { + for ( final 
boolean offRefBeginning : Arrays.asList(false) ) { tests.add(new Object[]{refBubbleLength, altBubbleLength, offRefBeginning, offRefEnding}); } } @@ -139,30 +185,29 @@ public class KBestPathsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "TripleBubbleDataProvider") + @Test(dataProvider = "TripleBubbleDataProvider", enabled = true) public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) { // Construct the assembly graph - DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph(); - final int KMER_LENGTH = 3; + SeqGraph graph = new SeqGraph(); final String preAltOption = "ATCGATCGATCGATCGATCG"; final String postAltOption = "CCCC"; final String preRef = "ATGG"; - final String postRef = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "GGCCG"; - final String midRef1 = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "TTCCT"; - final String midRef2 = new String(Utils.dupBytes((byte) 'A', KMER_LENGTH-1)) + "CCCAAAAAAAAAAAA"; + final String postRef = "GGCCG"; + final String midRef1 = "TTCCT"; + final String midRef2 = "CCCAAAAAAAAAAAA"; - DeBruijnVertex preV = new DeBruijnVertex(preAltOption.getBytes(), KMER_LENGTH); - DeBruijnVertex v = new DeBruijnVertex(preRef.getBytes(), KMER_LENGTH); - DeBruijnVertex v2Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'A', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH); - DeBruijnVertex v2Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'A', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH); - DeBruijnVertex v4Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'C', refBubbleLength+KMER_LENGTH-1), KMER_LENGTH); - DeBruijnVertex v4Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'C', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH); - DeBruijnVertex v6Ref = new DeBruijnVertex(Utils.dupBytes((byte) 'G', 
refBubbleLength+KMER_LENGTH-1), KMER_LENGTH); - DeBruijnVertex v6Alt = new DeBruijnVertex(ArrayUtils.addAll(Utils.dupBytes((byte) 'G', altBubbleLength + KMER_LENGTH - 1 - 1), Utils.dupBytes((byte) 'T',1)), KMER_LENGTH); - DeBruijnVertex v3 = new DeBruijnVertex(midRef1.getBytes(), KMER_LENGTH); - DeBruijnVertex v5 = new DeBruijnVertex(midRef2.getBytes(), KMER_LENGTH); - DeBruijnVertex v7 = new DeBruijnVertex(postRef.getBytes(), KMER_LENGTH); - DeBruijnVertex postV = new DeBruijnVertex(postAltOption.getBytes(), KMER_LENGTH); + SeqVertex preV = new SeqVertex(preAltOption); + SeqVertex v = new SeqVertex(preRef); + SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); + SeqVertex v2Alt = new SeqVertex(Utils.dupString('A', altBubbleLength-1) + "T"); + SeqVertex v4Ref = new SeqVertex(Utils.dupString('C', refBubbleLength)); + SeqVertex v4Alt = new SeqVertex(Utils.dupString('C', altBubbleLength-1) + "T"); + SeqVertex v6Ref = new SeqVertex(Utils.dupString('G', refBubbleLength)); + SeqVertex v6Alt = new SeqVertex(Utils.dupString('G', altBubbleLength-1) + "T"); + SeqVertex v3 = new SeqVertex(midRef1); + SeqVertex v5 = new SeqVertex(midRef2); + SeqVertex v7 = new SeqVertex(postRef); + SeqVertex postV = new SeqVertex(postAltOption); graph.addVertex(preV); graph.addVertex(v); @@ -176,34 +221,36 @@ public class KBestPathsUnitTest { graph.addVertex(v6Alt); graph.addVertex(v7); graph.addVertex(postV); - graph.addEdge(preV, v, new DeBruijnEdge(false, 1)); - graph.addEdge(v, v2Ref, new DeBruijnEdge(true, 10)); - graph.addEdge(v2Ref, v3, new DeBruijnEdge(true, 10)); - graph.addEdge(v, v2Alt, new DeBruijnEdge(false, 5)); - graph.addEdge(v2Alt, v3, new DeBruijnEdge(false, 5)); - graph.addEdge(v3, v4Ref, new DeBruijnEdge(true, 10)); - graph.addEdge(v4Ref, v5, new DeBruijnEdge(true, 10)); - graph.addEdge(v3, v4Alt, new DeBruijnEdge(false, 5)); - graph.addEdge(v4Alt, v5, new DeBruijnEdge(false, 5)); - graph.addEdge(v5, v6Ref, new DeBruijnEdge(true, 11)); - 
graph.addEdge(v6Ref, v7, new DeBruijnEdge(true, 11)); - graph.addEdge(v5, v6Alt, new DeBruijnEdge(false, 55)); - graph.addEdge(v6Alt, v7, new DeBruijnEdge(false, 55)); - graph.addEdge(v7, postV, new DeBruijnEdge(false, 1)); + graph.addEdge(preV, v, new BaseEdge(false, 1)); + graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); + graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); + graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); + graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); + graph.addEdge(v3, v4Ref, new BaseEdge(true, 10)); + graph.addEdge(v4Ref, v5, new BaseEdge(true, 10)); + graph.addEdge(v3, v4Alt, new BaseEdge(false, 5)); + graph.addEdge(v4Alt, v5, new BaseEdge(false, 5)); + graph.addEdge(v5, v6Ref, new BaseEdge(true, 11)); + graph.addEdge(v6Ref, v7, new BaseEdge(true, 11)); + graph.addEdge(v5, v6Alt, new BaseEdge(false, 55)); + graph.addEdge(v6Alt, v7, new BaseEdge(false, 55)); + graph.addEdge(v7, postV, new BaseEdge(false, 1)); + + graph.printGraph(new File("test.debruijn.dot"), 10); // Construct the test path - KBestPaths.Path path = new KBestPaths.Path( (offRefBeginning ? preV : v), graph); + KBestPaths.Path path = new KBestPaths.Path( (offRefBeginning ? 
preV : v), graph); if( offRefBeginning ) { - path = new KBestPaths.Path(path, graph.getEdge(preV, v)); + path = new KBestPaths.Path(path, graph.getEdge(preV, v)); } - path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); - path = new KBestPaths.Path(path, graph.getEdge(v3, v4Ref)); - path = new KBestPaths.Path(path, graph.getEdge(v4Ref, v5)); - path = new KBestPaths.Path(path, graph.getEdge(v5, v6Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v6Alt, v7)); + path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); + path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); + path = new KBestPaths.Path(path, graph.getEdge(v3, v4Ref)); + path = new KBestPaths.Path(path, graph.getEdge(v4Ref, v5)); + path = new KBestPaths.Path(path, graph.getEdge(v5, v6Alt)); + path = new KBestPaths.Path(path, graph.getEdge(v6Alt, v7)); if( offRefEnding ) { - path = new KBestPaths.Path(path, graph.getEdge(v7,postV)); + path = new KBestPaths.Path(path, graph.getEdge(v7,postV)); } // Construct the actual cigar string implied by the test path @@ -211,7 +258,7 @@ public class KBestPathsUnitTest { if( offRefBeginning ) { expectedCigar.add(new CigarElement(preAltOption.length(), CigarOperator.I)); } - expectedCigar.add(new CigarElement(preRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); // first bubble if( refBubbleLength > altBubbleLength ) { expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); @@ -222,10 +269,10 @@ public class KBestPathsUnitTest { } else { expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); } - expectedCigar.add(new CigarElement(midRef1.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(midRef1.length(), CigarOperator.M)); // second bubble is ref path expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); - 
expectedCigar.add(new CigarElement(midRef2.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(midRef2.length(), CigarOperator.M)); // third bubble if( refBubbleLength > altBubbleLength ) { expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); @@ -236,9 +283,9 @@ public class KBestPathsUnitTest { } else { expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); } - expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); + expectedCigar.add(new CigarElement(postRef.length(), CigarOperator.M)); if( offRefEnding ) { - expectedCigar.add(new CigarElement(postAltOption.length() - (KMER_LENGTH - 1), CigarOperator.I)); + expectedCigar.add(new CigarElement(postAltOption.length(), CigarOperator.I)); } Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java index f88d7ee7f..a4edfcacc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java @@ -55,6 +55,8 @@ public class KMerErrorCorrectorUnitTest extends BaseTest { public void testMyData() { final KMerErrorCorrector corrector = new KMerErrorCorrector(3, 1, 2, 2); + Assert.assertNotNull(corrector.toString()); + corrector.addKmers( "ATG", "ATG", "ATG", "ATG", "ACC", "ACC", "ACC", @@ -66,13 +68,20 @@ public class KMerErrorCorrectorUnitTest extends BaseTest { "NNC" // => ACC [because of min count won't go to NNA] ); - Assert.assertEquals(corrector.getErrorCorrectedKmer("ATG"), "ATG"); - 
Assert.assertEquals(corrector.getErrorCorrectedKmer("ACC"), "ACC"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("AAA"), "AAA"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("CTG"), "ATG"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("NNA"), "AAA"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("CCC"), "ACC"); - Assert.assertEquals(corrector.getErrorCorrectedKmer("NNN"), null); - Assert.assertEquals(corrector.getErrorCorrectedKmer("NNC"), "ACC"); + testCorrection(corrector, "ATG", "ATG"); + testCorrection(corrector, "ACC", "ACC"); + testCorrection(corrector, "AAA", "AAA"); + testCorrection(corrector, "CTG", "ATG"); + testCorrection(corrector, "NNA", "AAA"); + testCorrection(corrector, "CCC", "ACC"); + testCorrection(corrector, "NNN", null); + testCorrection(corrector, "NNC", "ACC"); + + Assert.assertNotNull(corrector.toString()); + } + + private void testCorrection(final KMerErrorCorrector corrector, final String in, final String out) { + Assert.assertEquals(corrector.getErrorCorrectedKmer(in), out); + Assert.assertEquals(corrector.getErrorCorrectedKmer(in.getBytes()), out == null ? null : out.getBytes()); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java new file mode 100644 index 000000000..b5089e878 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java @@ -0,0 +1,106 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class SeqGraphUnitTest extends BaseTest { + private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { + public byte[] sequence; + public int KMER_LENGTH; + + public MergeNodesWithNoVariationTestProvider(String seq, int kmer) { + super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. 
kmer = %d, seq = %s", kmer, seq)); + sequence = seq.getBytes(); + KMER_LENGTH = kmer; + } + + public SeqGraph calcGraph() { + final DeBruijnGraph deBruijnGraph = new DeBruijnGraph(); + final int kmersInSequence = sequence.length - KMER_LENGTH + 1; + for (int i = 0; i < kmersInSequence - 1; i++) { + // get the kmers + final byte[] kmer1 = new byte[KMER_LENGTH]; + System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH); + final byte[] kmer2 = new byte[KMER_LENGTH]; + System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH); + + deBruijnGraph.addKmersToGraph(kmer1, kmer2, false, 1); + } + final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); + seqGraph.mergeNodes(); + return seqGraph; + } + } + + @DataProvider(name = "MergeNodesWithNoVariationTestProvider") + public Object[][] makeMergeNodesWithNoVariationTests() { + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3); + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4); + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5); + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6); + new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7); + new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6); + new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66); + new 
MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76); + + return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class); + } + + @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = true) + public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) { + logger.warn(String.format("Test: %s", cfg.toString())); + + final SeqGraph actual = cfg.calcGraph(); + Assert.assertEquals(actual.vertexSet().size(), 1); + final SeqVertex actualV = actual.vertexSet().iterator().next(); + Assert.assertEquals(actualV.getSequence(), cfg.sequence); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java new file mode 100644 index 000000000..ca38351cc --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java @@ -0,0 +1,109 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class SeqVertexUnitTest extends BaseTest { + @Test + public void testBasic() { + final byte[] bases = "ACT".getBytes(); + final SeqVertex v1 = new SeqVertex(bases); + final SeqVertex v2 = new SeqVertex(bases); + Assert.assertTrue(v1.getId() >= 0); + Assert.assertTrue(v2.getId() >= 0); + Assert.assertTrue(v2.getId() > v1.getId()); + } + + @Test + public void testEqualsAndHashCode() { + final byte[] bases = "ACT".getBytes(); + final SeqVertex v1 = new SeqVertex(bases); + final SeqVertex v1_neq = new SeqVertex(bases); + final SeqVertex v1_eq = new SeqVertex(v1); + + Assert.assertEquals(v1, v1); + Assert.assertEquals(v1.hashCode(), v1.hashCode()); + Assert.assertEquals(v1, v1_eq); + Assert.assertEquals(v1.hashCode(), v1_eq.hashCode()); + Assert.assertFalse(v1.equals(v1_neq)); + Assert.assertFalse(v1_neq.equals(v1)); + Assert.assertFalse(v1_neq.hashCode() == v1.hashCode()); + } + + @DataProvider(name = "WithoutSuffixData") + public Object[][] makeWithoutSuffixData() { + List 
tests = new ArrayList(); + + final String bases = "ACGTACGTACGT"; + final int l = bases.length(); + for ( int suffixLength = 0; suffixLength <= l; suffixLength++ ) { + final int suffixStart = l - suffixLength; + final String prefix = suffixLength == l ? null : bases.substring(0, suffixStart); + final String suffix = suffixStart == l ? "" : bases.substring(suffixStart, l); + tests.add(new Object[]{bases, suffix, prefix}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "WithoutSuffixData") + public void testWithoutSuffix(final String bases, final String suffix, final String expected) { + final SeqVertex basesSV = new SeqVertex(bases); + if ( expected == null ) + Assert.assertNull(basesSV.withoutSuffix(suffix.getBytes()), "Failed for bases " + bases + " with suffix " + suffix + " != " + expected); + else + Assert.assertEquals(basesSV.withoutSuffix(suffix.getBytes()).getSequenceString(), expected, "Failed for bases " + bases + " with suffix " + suffix + " != " + expected); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index ff64133a7..e50025ea1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -795,4 +795,17 @@ public class Utils { while (md5String.length() < 32) md5String = "0" + md5String; // pad to length 32 return md5String; } + + /** + * Does big end with the exact sequence of bytes in suffix? 
+ * + * @param big a non-null byte[] to test if it a prefix + suffix + * @param suffix a non-null byte[] to test if it's a suffix of big + * @return true if big is proper byte[] composed of some prefix + suffix + */ + public static boolean endsWith(final byte[] big, final byte[] suffix) { + if ( big == null ) throw new IllegalArgumentException("big cannot be null"); + if ( suffix == null ) throw new IllegalArgumentException("suffix cannot be null"); + return new String(big).endsWith(new String(suffix)); + } } From 1fa5050faf232bd9cff8edc5f521ec5cbd66ec22 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 19 Mar 2013 14:33:09 -0400 Subject: [PATCH 089/226] Cleanup, unit test, and optimize KBestPaths and Path -- Split Path from inner class of KBestPaths -- Use google MinMaxPriorityQueue to track best k paths, a more efficient implementation -- Path now properly typed throughout the code -- Path maintains a on-demand hashset of BaseEdges so that path.containsEdge is fast --- .../haplotypecaller/DeBruijnAssembler.java | 2 +- .../walkers/haplotypecaller/KBestPaths.java | 337 ++------------- .../gatk/walkers/haplotypecaller/Path.java | 394 ++++++++++++++++++ .../haplotypecaller/KBestPathsUnitTest.java | 176 +++++--- 4 files changed, 549 insertions(+), 360 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 9d84d611f..688d5336e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -325,7 +325,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } for( final SeqGraph graph : graphs ) { - for ( final KBestPaths.Path path : new 
KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + for ( final Path path : new KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { Haplotype h = new Haplotype( path.getBases() ); if( !returnHaplotypes.contains(h) ) { final Cigar cigar = path.calculateCigar(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java index 8c29cfa98..0724729a8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -46,293 +46,44 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +import com.google.common.collect.MinMaxPriorityQueue; import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.io.Serializable; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; /** - * Created by IntelliJ IDEA. - * User: ebanks, rpoplin + * Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. + * This is different from most graph traversals because we want to test paths from any source node to any sink node. + * + * User: ebanks, rpoplin, mdepristo * Date: Mar 23, 2011 */ -// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. -// This is different from most graph traversals because we want to test paths from any source node to any sink node. 
public class KBestPaths { - // static access only public KBestPaths() { } - private static int MAX_PATHS_TO_HOLD = 100; - protected static class MyInt { public int val = 0; } - // class to keep track of paths - protected static class Path { - // the last vertex seen in the path - private final T lastVertex; - - // the list of edges comprising the path - private final List edges; - - // the scores for the path - private final int totalScore; - - // the graph from which this path originated - private final BaseGraph graph; - - // used in the bubble state machine to apply Smith-Waterman to the bubble sequence - // these values were chosen via optimization against the NA12878 knowledge base - private static final double SW_MATCH = 20.0; - private static final double SW_MISMATCH = -15.0; - private static final double SW_GAP = -26.0; - private static final double SW_GAP_EXTEND = -1.1; - private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); - - public Path( final T initialVertex, final BaseGraph graph ) { - lastVertex = initialVertex; - edges = new ArrayList(0); - totalScore = 0; - this.graph = graph; - } - - public Path( final Path p, final BaseEdge edge ) { - if( !p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); } - - graph = p.graph; - lastVertex = p.graph.getEdgeTarget(edge); - edges = new ArrayList(p.edges); - edges.add(edge); - totalScore = p.totalScore + edge.getMultiplicity(); - } - - /** - * Does this path contain the given edge - * @param edge the given edge to test - * @return true if the edge is found in this path - */ - public boolean containsEdge( final BaseEdge edge ) { - if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } - - for( final BaseEdge e : edges ) { - if( e.equals(graph, edge) ) { - return true; - } - } - - return false; - } - - /** - * Calculate the number of times this edge appears in the path - * 
@param edge the given edge to test - * @return number of times this edge appears in the path - */ - public int numInPath( final BaseEdge edge ) { - if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } - - int numInPath = 0; - for( final BaseEdge e : edges ) { - if( e.equals(graph, edge) ) { - numInPath++; - } - } - - return numInPath; - } - - public List getEdges() { return edges; } - - public int getScore() { return totalScore; } - - public T getLastVertexInPath() { return lastVertex; } - - /** - * The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes - * @return non-null sequence of bases corresponding to this path - */ - @Ensures({"result != null"}) - public byte[] getBases() { - if( edges.size() == 0 ) { return graph.getAdditionalSequence(lastVertex); } - - byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edges.get(0))); - for( final BaseEdge e : edges ) { - bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); - } - return bases; - } - - /** - * Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble - * @return non-null Cigar string with reference length equal to the refHaplotype's reference length - */ - @Ensures("result != null") - public Cigar calculateCigar() { - - final Cigar cigar = new Cigar(); - // special case for paths that start on reference but not at the reference source node - if( edges.get(0).isRef() && !graph.isRefSource(edges.get(0)) ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edges.get(0))).getCigarElements() ) { - cigar.add(ce); - } - } - - // reset the bubble state machine - final BubbleStateMachine bsm = new BubbleStateMachine(cigar); - - for( final BaseEdge e : edges ) { - if( e.equals(graph, edges.get(0)) ) { - advanceBubbleStateMachine( bsm, 
graph.getEdgeSource(e), null ); - } - advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e ); - } - - // special case for paths that don't end on reference - if( bsm.inBubble ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) { - bsm.cigar.add(ce); - } - } else if( edges.get(edges.size()-1).isRef() && !graph.isRefSink(edges.get(edges.size()-1)) ) { // special case for paths that end of the reference but haven't completed the entire reference circuit - for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edges.get(edges.size()-1)), null).getCigarElements() ) { - bsm.cigar.add(ce); - } - } - - return AlignmentUtils.consolidateCigar(bsm.cigar); - } - - /** - * Advance the bubble state machine by incorporating the next node in the path. - * @param bsm the current bubble state machine - * @param node the node to be incorporated - * @param e the edge which generated this node in the path - */ - @Requires({"bsm != null", "graph != null", "node != null"}) - private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final T node, final BaseEdge e ) { - if( graph.isReferenceNode( node ) ) { - if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else - if( e !=null && !e.isRef() ) { - if( graph.referencePathExists( graph.getEdgeSource(e), node) ) { - for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) { - bsm.cigar.add(ce); - } - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); - } else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) ); - } else { - bsm.inBubble = true; - bsm.bubbleBytes = null; - bsm.lastSeenReferenceNode = 
graph.getEdgeSource(e); - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } - } else { - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); - } - } else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } else { // close the bubble and use a local SW to determine the Cigar string - for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) { - bsm.cigar.add(ce); - } - bsm.inBubble = false; - bsm.bubbleBytes = null; - bsm.lastSeenReferenceNode = null; - bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); - } - } else { // non-ref vertex - if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } else { // open up a bubble - bsm.inBubble = true; - bsm.bubbleBytes = null; - bsm.lastSeenReferenceNode = (e != null ? 
graph.getEdgeSource(e) : null ); - bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); - } - } - } - - /** - * Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble - * @param bubbleBytes the bytes that comprise the alternate allele path in this bubble - * @param fromVertex the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex) - * @param toVertex the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex) - * @return the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble - */ - @Requires({"graph != null"}) - @Ensures({"result != null"}) - private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) { - final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? 
graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null); - - final Cigar returnCigar = new Cigar(); - - // add padding to anchor ref/alt bases in the SW matrix - byte[] padding = STARTING_SW_ANCHOR_BYTES; - boolean goodAlignment = false; - SWPairwiseAlignment swConsensus = null; - while( !goodAlignment && padding.length < 1000 ) { - padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time - final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding ); - final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding ); - swConsensus = new SWPairwiseAlignment( reference, alternate, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); - if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) { - goodAlignment = true; - } - } - if( !goodAlignment ) { - returnCigar.add(new CigarElement(1, CigarOperator.N)); - return returnCigar; - } - - final Cigar swCigar = swConsensus.getCigar(); - if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference - returnCigar.add(new CigarElement(1, CigarOperator.N)); - } else { - for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { - // now we need to remove the padding from the cigar string - int length = swCigar.getCigarElement(iii).getLength(); - if( iii == 0 ) { length -= padding.length; } - if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; } - if( length > 0 ) { - returnCigar.add(new CigarElement(length, swCigar.getCigarElement(iii).getOperator())); - } - } - if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) { - throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? 
"-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar()); - } - } - - return returnCigar; - } - - // class to keep track of the bubble state machine - protected static class BubbleStateMachine { - public boolean inBubble = false; - public byte[] bubbleBytes = null; - public T lastSeenReferenceNode = null; - public Cigar cigar = null; - - public BubbleStateMachine( final Cigar initialCigar ) { - inBubble = false; - bubbleBytes = null; - lastSeenReferenceNode = null; - cigar = initialCigar; - } - } - } - + /** + * Compare paths such that paths with greater weight are earlier in a list + */ protected static class PathComparatorTotalScore implements Comparator, Serializable { @Override public int compare(final Path path1, final Path path2) { - return path1.totalScore - path2.totalScore; + return path2.getScore() - path1.getScore(); } } + /** + * @see #getKBestPaths(BaseGraph, int) retriving the first 1000 paths + */ + public List> getKBestPaths( final BaseGraph graph ) { + return getKBestPaths(graph, 1000); + } + /** * Traverse the graph and pull out the best k paths. * Paths are scored via their comparator function. 
The default being PathComparatorTotalScore() @@ -341,51 +92,41 @@ public class KBestPaths { * @return a list with at most k top-scoring paths from the graph */ @Ensures({"result != null", "result.size() <= k"}) - public List getKBestPaths( final BaseGraph graph, final int k ) { + public List> getKBestPaths( final BaseGraph graph, final int k ) { if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); } - if( k > MAX_PATHS_TO_HOLD/2 ) { throw new IllegalArgumentException("Asked for more paths than internal parameters allow for."); } - final ArrayList bestPaths = new ArrayList(); - + // a min max queue that will collect the best k paths + final MinMaxPriorityQueue> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create(); + // run a DFS for best paths - for( final T v : graph.vertexSet() ) { - if( graph.inDegreeOf(v) == 0 ) { - findBestPaths(new Path(v, graph), bestPaths); + for ( final T v : graph.vertexSet() ) { + if ( graph.inDegreeOf(v) == 0 ) { + findBestPaths(new Path(v, graph), bestPaths, new MyInt()); } } - Collections.sort(bestPaths, new PathComparatorTotalScore() ); - Collections.reverse(bestPaths); - return bestPaths.subList(0, Math.min(k, bestPaths.size())); + // the MinMaxPriorityQueue iterator returns items in an arbitrary order, so we need to sort the final result + final List> toReturn = new ArrayList>(bestPaths); + Collections.sort(toReturn, new PathComparatorTotalScore()); + return toReturn; } - private void findBestPaths( final Path path, final List bestPaths ) { - findBestPaths(path, bestPaths, new MyInt()); - } - - private void findBestPaths( final Path path, final List bestPaths, final MyInt n ) { - + private void findBestPaths( final Path path, final MinMaxPriorityQueue> bestPaths, final MyInt n ) { // did we hit the end of a path? 
if ( allOutgoingEdgesHaveBeenVisited(path) ) { - if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) { - // clean out some low scoring paths - Collections.sort(bestPaths, new PathComparatorTotalScore() ); - for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20 - } bestPaths.add(path); - } else if( n.val > 10000) { - // do nothing, just return + } else if( n.val > 10000 ) { + // do nothing, just return, as we've done too much work already } else { // recursively run DFS - final ArrayList edgeArrayList = new ArrayList(); - edgeArrayList.addAll(path.graph.outgoingEdgesOf(path.lastVertex)); + final ArrayList edgeArrayList = new ArrayList(path.getOutgoingEdgesOfLastVertex()); Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator()); for ( final BaseEdge edge : edgeArrayList ) { // make sure the edge is not already in the path if ( path.containsEdge(edge) ) continue; - final Path newPath = new Path(path, edge); + final Path newPath = new Path(path, edge); n.val++; findBestPaths(newPath, bestPaths, n); } @@ -393,11 +134,15 @@ public class KBestPaths { } /** + * Have all of the outgoing edges of the final vertex been visited? + * + * I.e., are all outgoing vertices of the current path in the list of edges of the graph? 
+ * * @param path the path to test * @return true if all the outgoing edges at the end of this path have already been visited */ private boolean allOutgoingEdgesHaveBeenVisited( final Path path ) { - for( final BaseEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) { + for( final BaseEdge edge : path.getOutgoingEdgesOfLastVertex() ) { if( !path.containsEdge(edge) ) { // TODO -- investigate allowing numInPath < 2 to allow cycles return false; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java new file mode 100644 index 000000000..895cffcca --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java @@ -0,0 +1,394 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; + +import java.util.*; + +/** + * A path thought a BaseGraph + * + * class to keep track of paths + * + * User: depristo + * Date: 3/19/13 + * Time: 2:34 PM + * + */ +class Path { + // the last vertex seen in the path + private final T lastVertex; + + // the list of edges comprising the path + private Set edgesAsSet = null; + private final LinkedList edgesInOrder; + + // the scores for the path + private final int totalScore; + + // the graph from which this path originated + private final BaseGraph graph; + + // used in the bubble state machine to apply Smith-Waterman to the bubble sequence + // these values were chosen via optimization against the NA12878 knowledge base + private static final double SW_MATCH = 20.0; + private static final double SW_MISMATCH = -15.0; + private static final double SW_GAP = -26.0; + private static final double SW_GAP_EXTEND = -1.1; + private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); + + /** + * Create a new Path containing no edges and starting at initialVertex + * @param initialVertex the starting vertex of the path + * @param graph the graph this path with follow through + */ + public Path(final T initialVertex, final BaseGraph graph) { + if ( initialVertex == null ) throw new IllegalArgumentException("initialVertex cannot be null"); + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( ! 
graph.containsVertex(initialVertex) ) throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph); + + lastVertex = initialVertex; + edgesInOrder = new LinkedList(); + totalScore = 0; + this.graph = graph; + } + + /** + * Create a new Path extending p with edge + * + * @param p the path to extend + * @param edge the edge to extend path by + */ + public Path(final Path p, final BaseEdge edge) { + if ( p == null ) throw new IllegalArgumentException("Path cannot be null"); + if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null"); + if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't"); + if ( ! p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); } + + graph = p.graph; + lastVertex = p.graph.getEdgeTarget(edge); + edgesInOrder = new LinkedList(p.getEdges()); + edgesInOrder.add(edge); + totalScore = p.totalScore + edge.getMultiplicity(); + } + + /** + * Get the collection of edges leaving the last vertex of this path + * @return a non-null collection + */ + public Collection getOutgoingEdgesOfLastVertex() { + return getGraph().outgoingEdgesOf(getLastVertex()); + } + + /** + * Does this path contain the given edge + * @param edge the given edge to test + * @return true if the edge is found in this path + */ + public boolean containsEdge( final BaseEdge edge ) { + if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); } + if ( edgesInOrder.isEmpty() ) return false; + + // initialize contains cache if necessary + if ( edgesAsSet == null ) edgesAsSet = new HashSet(edgesInOrder); + return edgesAsSet.contains(edge); + } + + /** + * Check that two paths have the same edges and total score + * @param path the other path we might be the same as + * @return true if this and path are the same + */ + protected boolean pathsAreTheSame(Path 
path) { + return totalScore == path.totalScore && edgesInOrder.equals(path.edgesInOrder); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder("Path{score=" + totalScore + ", path="); + boolean first = true; + for ( final T v : getVertices() ) { + if ( first ) { + b.append(" -> "); + first = false; + } + b.append(v.getSequenceString()); + } + return b.toString(); + } + + /** + * Get the graph of this path + * @return a non-null graph + */ + @Ensures("result != null") + public BaseGraph getGraph() { + return graph; + } + + /** + * Get the edges of this path in order + * @return a non-null list of edges + */ + @Ensures("result != null") + public List getEdges() { return edgesInOrder; } + + /** + * Get the list of vertices in this path in order defined by the edges of the path + * @return a non-null, non-empty list of vertices + */ + @Ensures({"result != null", "!result.isEmpty()"}) + public List getVertices() { + if ( getEdges().isEmpty() ) + return Collections.singletonList(lastVertex); + else { + final LinkedList vertices = new LinkedList(); + boolean first = true; + for ( final BaseEdge e : getEdges() ) { + if ( first ) { + vertices.add(graph.getEdgeSource(e)); + first = false; + } + vertices.add(graph.getEdgeTarget(e)); + } + return vertices; + } + } + + /** + * Get the total score of this path (bigger is better) + * @return a positive integer + */ + @Ensures("result >= 0") + public int getScore() { return totalScore; } + + /** + * Get the final vertex of the path + * @return a non-null vertex + */ + @Ensures("result != null") + public T getLastVertex() { return lastVertex; } + + /** + * The base sequence for this path. 
Pull the full sequence for source nodes and then the suffix for all subsequent nodes + * @return non-null sequence of bases corresponding to this path + */ + @Ensures({"result != null"}) + public byte[] getBases() { + if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); } + + byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.getFirst())); + for( final BaseEdge e : edgesInOrder ) { + bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); + } + return bases; + } + + /** + * Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble + * @return non-null Cigar string with reference length equal to the refHaplotype's reference length + */ + @Ensures("result != null") + public Cigar calculateCigar() { + final Cigar cigar = new Cigar(); + // special case for paths that start on reference but not at the reference source node + if( edgesInOrder.getFirst().isRef() && !graph.isRefSource(edgesInOrder.getFirst()) ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edgesInOrder.getFirst())).getCigarElements() ) { + cigar.add(ce); + } + } + + // reset the bubble state machine + final BubbleStateMachine bsm = new BubbleStateMachine(cigar); + + for( final BaseEdge e : getEdges() ) { + if( e.equals(graph, edgesInOrder.getFirst()) ) { + advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null ); + } + advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e ); + } + + // special case for paths that don't end on reference + if( bsm.inBubble ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) { + bsm.cigar.add(ce); + } + } else if( edgesInOrder.getLast().isRef() && !graph.isRefSink(edgesInOrder.getLast()) ) { // special case for paths that end of the reference but haven't completed the 
entire reference circuit + for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edgesInOrder.getLast()), null).getCigarElements() ) { + bsm.cigar.add(ce); + } + } + + return AlignmentUtils.consolidateCigar(bsm.cigar); + } + + /** + * Advance the bubble state machine by incorporating the next node in the path. + * @param bsm the current bubble state machine + * @param node the node to be incorporated + * @param e the edge which generated this node in the path + */ + @Requires({"bsm != null", "graph != null", "node != null"}) + private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final T node, final BaseEdge e ) { + if( graph.isReferenceNode( node ) ) { + if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else + if( e !=null && !e.isRef() ) { + if( graph.referencePathExists( graph.getEdgeSource(e), node) ) { + for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) { + bsm.cigar.add(ce); + } + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); + } else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) ); + } else { + bsm.inBubble = true; + bsm.bubbleBytes = null; + bsm.lastSeenReferenceNode = graph.getEdgeSource(e); + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); + } + } else { + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); + } + } else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); + } else { 
// close the bubble and use a local SW to determine the Cigar string + for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) { + bsm.cigar.add(ce); + } + bsm.inBubble = false; + bsm.bubbleBytes = null; + bsm.lastSeenReferenceNode = null; + bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) ); + } + } else { // non-ref vertex + if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); + } else { // open up a bubble + bsm.inBubble = true; + bsm.bubbleBytes = null; + bsm.lastSeenReferenceNode = (e != null ? graph.getEdgeSource(e) : null ); + bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) ); + } + } + } + + /** + * Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble + * @param bubbleBytes the bytes that comprise the alternate allele path in this bubble + * @param fromVertex the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex) + * @param toVertex the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex) + * @return the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble + */ + @Requires({"graph != null"}) + @Ensures({"result != null"}) + private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) { + final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? 
graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null); + + final Cigar returnCigar = new Cigar(); + + // add padding to anchor ref/alt bases in the SW matrix + byte[] padding = STARTING_SW_ANCHOR_BYTES; + boolean goodAlignment = false; + SWPairwiseAlignment swConsensus = null; + while( !goodAlignment && padding.length < 1000 ) { + padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time + final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding ); + final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding ); + swConsensus = new SWPairwiseAlignment( reference, alternate, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) { + goodAlignment = true; + } + } + if( !goodAlignment ) { + returnCigar.add(new CigarElement(1, CigarOperator.N)); + return returnCigar; + } + + final Cigar swCigar = swConsensus.getCigar(); + if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference + returnCigar.add(new CigarElement(1, CigarOperator.N)); + } else { + for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { + // now we need to remove the padding from the cigar string + int length = swCigar.getCigarElement(iii).getLength(); + if( iii == 0 ) { length -= padding.length; } + if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; } + if( length > 0 ) { + returnCigar.add(new CigarElement(length, swCigar.getCigarElement(iii).getOperator())); + } + } + if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) { + throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? 
"-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar()); + } + } + + return returnCigar; + } + + // class to keep track of the bubble state machine + private static class BubbleStateMachine { + public boolean inBubble = false; + public byte[] bubbleBytes = null; + public T lastSeenReferenceNode = null; + public Cigar cigar = null; + + public BubbleStateMachine( final Cigar initialCigar ) { + inBubble = false; + bubbleBytes = null; + lastSeenReferenceNode = null; + cigar = initialCigar; + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java index 10863cef9..34b4ba912 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java @@ -55,9 +55,9 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; import java.util.ArrayList; import java.util.Arrays; +import java.util.LinkedList; import java.util.List; /** @@ -67,6 +67,72 @@ import java.util.List; */ public class KBestPathsUnitTest { + @DataProvider(name = "BasicPathFindingData") + public Object[][] makeBasicPathFindingData() { + List tests = new ArrayList(); +// for ( final int nStartNodes : Arrays.asList(1) ) { +// for ( final int nBranchesPerBubble : Arrays.asList(2) ) { +// for ( final int nEndNodes : Arrays.asList(1) ) { +// for ( final boolean addCycle : Arrays.asList(true) ) { + for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) { + for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) { + for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) { + for ( final boolean addCycle : Arrays.asList(true, false) ) { + tests.add(new Object[]{nStartNodes, nBranchesPerBubble, 
nEndNodes, addCycle}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private static int weight = 1; + final List createVertices(final SeqGraph graph, final int n, final SeqVertex source, final SeqVertex target) { + final List seqs = Arrays.asList("A", "C", "G", "T"); + final List vertices = new LinkedList(); + for ( int i = 0; i < n; i++ ) { + final SeqVertex v = new SeqVertex(seqs.get(i)); + graph.addVertex(v); + vertices.add(v); + if ( source != null ) graph.addEdge(source, v, new BaseEdge(false, weight++)); + if ( target != null ) graph.addEdge(v, target, new BaseEdge(false, weight++)); + } + return vertices; + } + + @Test(dataProvider = "BasicPathFindingData", enabled = true) + public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes, final boolean addCycle) { + SeqGraph graph = new SeqGraph(); + + final SeqVertex middleTop = new SeqVertex("GTAC"); + final SeqVertex middleBottom = new SeqVertex("ACTG"); + graph.addVertices(middleTop, middleBottom); + final List starts = createVertices(graph, nStartNodes, null, middleTop); + final List bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom); + final List ends = createVertices(graph, nEndNodes, middleBottom, null); + + if ( addCycle ) graph.addEdge(middleBottom, middleBottom); + + // enumerate all possible paths + final List> paths = new KBestPaths().getKBestPaths(graph); + + final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * (addCycle ? 2 : 1) * nEndNodes; + Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths"); + + int lastScore = Integer.MAX_VALUE; + for ( final Path path : paths ) { + Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order. 
Path " + path + " has score above previous " + lastScore); + lastScore = path.getScore(); + } + + // get the best path, and make sure it's the same as our optimal path overall + final Path best = paths.get(0); + final List> justOne = new KBestPaths().getKBestPaths(graph, 1); + Assert.assertEquals(justOne.size(), 1); + Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); + } + @DataProvider(name = "BasicBubbleDataProvider") public Object[][] makeBasicBubbleDataProvider() { List tests = new ArrayList(); @@ -99,12 +165,10 @@ public class KBestPathsUnitTest { graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); - graph.printGraph(new File("test.dot"), 10); - // Construct the test path - KBestPaths.Path path = new KBestPaths.Path(v, graph); - path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); + Path path = new Path(v, graph); + path = new Path(path, graph.getEdge(v, v2Alt)); + path = new Path(path, graph.getEdge(v2Alt, v3)); // Construct the actual cigar string implied by the test path Cigar expectedCigar = new Cigar(); @@ -123,52 +187,40 @@ public class KBestPathsUnitTest { Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); } - // TODO -- test block substitution because it doesn't look like it's correct now -// @Test(dataProvider = "BasicBubbleDataProvider") -// public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { -// // Construct the assembly graph -// final int KMER_LENGTH = 3; -// SeqGraph graph = new SeqGraph(KMER_LENGTH); -// final String preRef = "ATGG"; -// final String postRef = "GGGGC"; -// -// SeqVertex v = new SeqVertex(preRef); -// SeqVertex v2Ref = new SeqVertex(Utils.dupString('A', refBubbleLength)); -// SeqVertex 
v2Alt = new SeqVertex(Utils.dupString('T', altBubbleLength)); -// SeqVertex v3 = new SeqVertex(postRef); -// -// graph.addVertex(v); -// graph.addVertex(v2Ref); -// graph.addVertex(v2Alt); -// graph.addVertex(v3); -// graph.addEdge(v, v2Ref, new BaseEdge(true, 10)); -// graph.addEdge(v2Ref, v3, new BaseEdge(true, 10)); -// graph.addEdge(v, v2Alt, new BaseEdge(false, 5)); -// graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); -// -// graph.printGraph(new File("test.dot"), 10); -// -// // Construct the test path -// KBestPaths.Path path = new KBestPaths.Path(v, graph); -// path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); -// path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); -// -// // Construct the actual cigar string implied by the test path -// Cigar expectedCigar = new Cigar(); -// expectedCigar.add(new CigarElement(preRef.length(), CigarOperator.M)); -// if( refBubbleLength > altBubbleLength ) { -// expectedCigar.add(new CigarElement(altBubbleLength, CigarOperator.M)); -// expectedCigar.add(new CigarElement(refBubbleLength - altBubbleLength, CigarOperator.D)); -// } else if ( refBubbleLength < altBubbleLength ) { -// expectedCigar.add(new CigarElement(altBubbleLength - refBubbleLength,CigarOperator.I)); -// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); -// } else { -// expectedCigar.add(new CigarElement(refBubbleLength, CigarOperator.M)); -// } -// expectedCigar.add(new CigarElement(postRef.length() - (KMER_LENGTH - 1), CigarOperator.M)); -// -// Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); -// } + @DataProvider(name = "GetBasesData") + public Object[][] makeGetBasesData() { + List tests = new ArrayList(); + + final List frags = Arrays.asList("ACT", "GAC", "CAT"); + + for ( int n = 1; n <= frags.size(); n++ ) { + for ( final List comb : Utils.makePermutations(frags, n, false) ) { + tests.add(new Object[]{comb}); + } + } + 
return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "GetBasesData", enabled = true) + public void testGetBases(final List frags) { + // Construct the assembly graph + SeqGraph graph = new SeqGraph(3); + + SeqVertex prev = null; + for ( int i = 0; i < frags.size(); i++ ) { + SeqVertex v = new SeqVertex(frags.get(i)); + graph.addVertex(v); + if ( prev != null ) + graph.addEdge(prev, v); + prev = v; + } + + // enumerate all possible paths + final List> paths = new KBestPaths().getKBestPaths(graph); + Assert.assertEquals(paths.size(), 1); + final Path path = paths.get(0); + Assert.assertEquals(new String(path.getBases()), Utils.join("", frags), "Path doesn't have the expected sequence"); + } @DataProvider(name = "TripleBubbleDataProvider") public Object[][] makeTripleBubbleDataProvider() { @@ -236,21 +288,19 @@ public class KBestPathsUnitTest { graph.addEdge(v6Alt, v7, new BaseEdge(false, 55)); graph.addEdge(v7, postV, new BaseEdge(false, 1)); - graph.printGraph(new File("test.debruijn.dot"), 10); - // Construct the test path - KBestPaths.Path path = new KBestPaths.Path( (offRefBeginning ? preV : v), graph); + Path path = new Path( (offRefBeginning ? 
preV : v), graph); if( offRefBeginning ) { - path = new KBestPaths.Path(path, graph.getEdge(preV, v)); + path = new Path(path, graph.getEdge(preV, v)); } - path = new KBestPaths.Path(path, graph.getEdge(v, v2Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v2Alt, v3)); - path = new KBestPaths.Path(path, graph.getEdge(v3, v4Ref)); - path = new KBestPaths.Path(path, graph.getEdge(v4Ref, v5)); - path = new KBestPaths.Path(path, graph.getEdge(v5, v6Alt)); - path = new KBestPaths.Path(path, graph.getEdge(v6Alt, v7)); + path = new Path(path, graph.getEdge(v, v2Alt)); + path = new Path(path, graph.getEdge(v2Alt, v3)); + path = new Path(path, graph.getEdge(v3, v4Ref)); + path = new Path(path, graph.getEdge(v4Ref, v5)); + path = new Path(path, graph.getEdge(v5, v6Alt)); + path = new Path(path, graph.getEdge(v6Alt, v7)); if( offRefEnding ) { - path = new KBestPaths.Path(path, graph.getEdge(v7,postV)); + path = new Path(path, graph.getEdge(v7,postV)); } // Construct the actual cigar string implied by the test path From 2e36f15861fc16ae095faf517af16e01f6db7450 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 19 Mar 2013 16:22:06 -0400 Subject: [PATCH 090/226] Update md5s to reflect new downsampling and assembly algorithm output -- Only minor differences, with improvement in allele discovery where the sites differ. 
The test of an insertion at the start of the MT no longer calls a 1 bp indel at position 0 in the genome --- ...plexAndSymbolicVariantsIntegrationTest.java | 8 ++++---- .../HaplotypeCallerIntegrationTest.java | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 72e06ddc6..fd16ed856 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "b83b53741edb07218045d6f25f20a18b"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "2b9355ab532314bce157c918c7606409"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -75,7 +75,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa // TODO -- need a better symbolic allele test @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "298c1af47a515ea7c8c1ea704d7755ce"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "8225fb59b9fcbe767a473c9eb8b21537"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -87,12 +87,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 
20:119673-119823 -L 20:121408-121538", - "fd3412030628fccf77effdb1ec03dce7"); + "f2add041ba1692db576ae9763a14b8a6"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "633e8930a263e34def5e097889dd9805"); + "383320e81a1a3bee880fcc6cd0564452"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index fb267297f..c93e54f87 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "694d6ea7f0f305854d4108379d68de75"); + HCTest(CEUTRIO_BAM, "", "75dbef605b28f02616b13bb5d8bf2fbd"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "995501d8af646af3b6eaa4109e2fb4a0"); + HCTest(NA12878_BAM, "", "fa8705a5d3ada66470019fa7ddcb9b2c"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "627124af27dc4556d83df1a04e4b9f97"); + "9f9062a6eb93f984658492400102b0c7"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -96,12 +96,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - 
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "205fc8647b908c0dab7b5c6d6b78c0c2"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "3a38f6fade253577d205a00db3e67828"); } @Test public void testHaplotypeCallerInsertionOnEdgeOfContig() { - HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "e6f7bbab7cf96cbb25837b7a94bf0f82"); + HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ccd30e226f097a40cdeebaa035a290a7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("1e7b1bda6be5d3835ae318f2977cfbdd")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("f1250a8ecd404443dcca20741a74ec4f")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6d63f558259883262ea84f339acb767")); executeTest("HCTestStructuralIndels: ", spec); } @@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam 
-o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("fd1b51b17f8f9c88abdf66a9372bce5a")); + Arrays.asList("5280f1a50ca27d8e435da0bd5b26ae93")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -148,7 +148,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("d3eb900eecdafafda3170f67adff42ae")); + Arrays.asList("addceb63f5bfa9f11e15335d5bf641e9")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } From 5226b24a119a99c8139996fec65a64ae711ad234 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 19 Mar 2013 18:09:23 -0400 Subject: [PATCH 091/226] HaplotypeCaller instructure cleanup and unit testing -- UnitTest for isRootOfDiamond along with key bugfix detected while testing -- Fix up the equals methods in BaseEdge. Now called hasSameSourceAndTarget and seqEquals. A much more meaningful naming -- Generalize graphEquals to use seqEquals, so it works equally well with Debruijn and SeqGraphs -- Add BaseVertex method called seqEquals that returns true if two BaseVertex objects have the same sequence -- Reorganize SeqGraph mergeNodes into a single master function that does zipping, branch merging, and zipping again, rather than having this done in the DeBruijnAssembler itself -- Massive expansion of the SeqGraph unit tests. We now really test out the zipping and branch merging code. -- Near final cleanup of the current codebase -- DeBruijnVertex cleanup and optimizations. Since kmer graphs don't allow sequences longer than the kmer size, the suffix is always a byte, not a byte[]. 
Optimize the code to make use of this constraint --- .../walkers/haplotypecaller/BaseEdge.java | 15 +- .../walkers/haplotypecaller/BaseGraph.java | 51 ++++- .../walkers/haplotypecaller/BaseVertex.java | 10 + .../haplotypecaller/DeBruijnAssembler.java | 11 +- .../haplotypecaller/DeBruijnVertex.java | 44 +++- .../gatk/walkers/haplotypecaller/Path.java | 2 +- .../walkers/haplotypecaller/SeqGraph.java | 175 ++++++++++----- .../DeBruijnVertexUnitTest.java | 3 +- .../haplotypecaller/SeqGraphUnitTest.java | 210 +++++++++++++++++- 9 files changed, 435 insertions(+), 86 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java index 053f0e1a1..7b5fd2bbd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java @@ -116,14 +116,21 @@ public class BaseEdge { this.isRef = isRef; } - // For use when comparing edges pulled from the same graph - public boolean equals( final BaseGraph graph, final BaseEdge edge ) { + /** + * Does this and edge have the same source and target vertices in graph? + * + * @param graph the graph containing both this and edge + * @param edge our comparator edge + * @param + * @return true if we have the same source and target vertices + */ + public boolean hasSameSourceAndTarget(final BaseGraph graph, final BaseEdge edge) { return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge))); } // For use when comparing edges across graphs! 
- public boolean equals( final BaseGraph graph, final BaseEdge edge, final BaseGraph graph2 ) { - return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); + public boolean seqEquals( final BaseGraph graph, final BaseEdge edge, final BaseGraph graph2 ) { + return (graph.getEdgeSource(this).seqEquals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).seqEquals(graph2.getEdgeTarget(edge))); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index 6aa687312..ec5c99bb1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -310,6 +310,19 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph the type of the nodes in those graphs + * @return true if g1 and g2 are equals + */ public static boolean graphEquals(final BaseGraph g1, BaseGraph g2) { - if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) { + final Set vertices1 = g1.vertexSet(); + final Set vertices2 = g2.vertexSet(); + final Set edges1 = g1.edgeSet(); + final Set edges2 = g2.edgeSet(); + + if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() ) return false; + + for ( final T v1 : vertices1 ) { + boolean found = false; + for ( final T v2 : vertices2 ) + found = found || v1.getSequenceString().equals(v2.getSequenceString()); + if ( ! 
found ) return false; } - for( BaseEdge e1 : g1.edgeSet() ) { + + for( final BaseEdge e1 : g1.edgeSet() ) { boolean found = false; for( BaseEdge e2 : g2.edgeSet() ) { - if( e1.equals(g1, e2, g2) ) { found = true; break; } + if( e1.seqEquals(g1, e2, g2) ) { found = true; break; } } if( !found ) { return false; } } - for( BaseEdge e2 : g2.edgeSet() ) { + for( final BaseEdge e2 : g2.edgeSet() ) { boolean found = false; for( BaseEdge e1 : g1.edgeSet() ) { - if( e2.equals(g2, e1, g1) ) { found = true; break; } + if( e2.seqEquals(g2, e1, g1) ) { found = true; break; } } if( !found ) { return false; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java index fad7a51d1..b6d278105 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java @@ -99,6 +99,16 @@ public class BaseVertex { return true; } + /** + * Are b and this equal according to their base sequences? 
+ * + * @param b the vertex to compare ourselves to + * @return true if b and this have the same sequence, regardless of other attributes that might differentiate them + */ + public boolean seqEquals(final BaseVertex b) { + return Arrays.equals(this.getSequence(), b.getSequence()); + } + @Override public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect return Arrays.hashCode(sequence); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 688d5336e..6d295ff97 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -194,15 +194,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR); seqGraph.pruneGraph(PRUNE_FACTOR); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); - seqGraph.mergeNodes(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.preclean.dot"), PRUNE_FACTOR); seqGraph.removeVerticesNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), PRUNE_FACTOR); - seqGraph.mergeBranchingNodes(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.simplified.dot"), PRUNE_FACTOR); - seqGraph.mergeNodes(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.6.simplified.merged.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new 
File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); + seqGraph.simplifyGraph(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR); return seqGraph; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index 47716b7c5..0a2c26ca4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -47,17 +47,20 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; - -import java.util.Arrays; /** * simple node class for storing kmer sequences * - * User: ebanks + * User: ebanks, mdepristo * Date: Mar 23, 2011 */ public class DeBruijnVertex extends BaseVertex { + private final static byte[][] sufficesAsByteArray = new byte[256][]; + static { + for ( int i = 0; i < sufficesAsByteArray.length; i++ ) + sufficesAsByteArray[i] = new byte[]{(byte)(i & 0xFF)}; + } + public DeBruijnVertex( final byte[] sequence ) { super(sequence); } @@ -85,17 +88,38 @@ public class DeBruijnVertex extends BaseVertex { */ @Ensures({"result != null", "result.length() >= 1"}) public String getSuffixString() { - return new String(getSuffix()); + return new String(getSuffixAsArray()); } - @Ensures("result != null") - // TODO this could be replaced with byte as the suffix is guarenteed to be exactly 1 base - public byte[] getSuffix() { - return Arrays.copyOfRange( sequence, getKmer() - 1, sequence.length ); + /** + * Get the suffix byte of this DeBruijnVertex + * + * The suffix byte is simply the last byte of the kmer sequence, so if this is holding sequence ACT + * getSuffix would return T + * + * @return a byte + */ + public byte getSuffix() { + return 
sequence[getKmer() - 1]; } + /** + * Optimized version that returns a byte[] for the single byte suffix of this graph without allocating memory. + * + * Should not be modified + * + * @return a byte[] that contains 1 byte == getSuffix() + */ + @Ensures({"result != null", "result.length == 1", "result[0] == getSuffix()"}) + private byte[] getSuffixAsArray() { + return sufficesAsByteArray[getSuffix()]; + } + + /** + * {@inheritDoc} + */ @Override public byte[] getAdditionalSequence(boolean source) { - return source ? super.getAdditionalSequence(source) : getSuffix(); + return source ? super.getAdditionalSequence(source) : getSuffixAsArray(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java index 895cffcca..7546155a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java @@ -254,7 +254,7 @@ class Path { final BubbleStateMachine bsm = new BubbleStateMachine(cigar); for( final BaseEdge e : getEdges() ) { - if( e.equals(graph, edgesInOrder.getFirst()) ) { + if ( e.hasSameSourceAndTarget(graph, edgesInOrder.getFirst()) ) { advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null ); } advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java index 960f2cdd7..f67815b92 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; 
import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; @@ -77,67 +79,83 @@ public class SeqGraph extends BaseGraph { super(kmer); } - protected void mergeNodes() { + /** + * Simplify this graph, merging vertices together and restructuring the graph in an + * effort to minimize the number of overall vertices in the graph without changing + * in any way the sequences implied by a complex enumeration of all paths through the graph. + */ + public void simplifyGraph() { + zipLinearChains(); + mergeBranchingNodes(); zipLinearChains(); } + /** + * Zip up all of the simple linear chains present in this graph. + */ protected void zipLinearChains() { - boolean foundNodesToMerge = true; - while( foundNodesToMerge ) { - foundNodesToMerge = false; - - for( final BaseEdge e : edgeSet() ) { - final SeqVertex outgoingVertex = getEdgeTarget(e); - final SeqVertex incomingVertex = getEdgeSource(e); - if( !outgoingVertex.equals(incomingVertex) - && outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1 - && isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) { - - final Set outEdges = outgoingEdgesOf(outgoingVertex); - final Set inEdges = incomingEdgesOf(incomingVertex); - if( inEdges.size() == 1 && outEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - } else if( inEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } else if( outEdges.size() == 1 ) { - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); - } - - final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); - 
addVertex(addedVertex); - for( final BaseEdge edge : outEdges ) { - addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity())); - } - for( final BaseEdge edge : inEdges ) { - addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity())); - } - - removeVertex(incomingVertex); - removeVertex(outgoingVertex); - foundNodesToMerge = true; - break; - } - } + while( zipOneLinearChain() ) { + // just keep going until zipOneLinearChain says its done } } - // - // X -> ABC -> Y - // -> aBC -> Y - // - // becomes - // - // X -> A -> BCY - // -> a -> BCY - // - public void mergeBranchingNodes() { + /** + * Merge together two vertices in the graph v1 -> v2 into a single vertex v' containing v1 + v2 sequence + * + * Only works on vertices where v1's only outgoing edge is to v2 and v2's only incoming edge is from v1. + * + * If such a pair of vertices is found, they are merged and the graph is update. Otherwise nothing is changed. + * + * @return true if any such pair of vertices could be found, false otherwise + */ + protected boolean zipOneLinearChain() { + for( final BaseEdge e : edgeSet() ) { + final SeqVertex outgoingVertex = getEdgeTarget(e); + final SeqVertex incomingVertex = getEdgeSource(e); + if( !outgoingVertex.equals(incomingVertex) + && outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1 + && isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) { + + final Set outEdges = outgoingEdgesOf(outgoingVertex); + final Set inEdges = incomingEdgesOf(incomingVertex); + if( inEdges.size() == 1 && outEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + } else if( inEdges.size() == 1 ) { + inEdges.iterator().next().setMultiplicity( 
inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + } else if( outEdges.size() == 1 ) { + outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + } + + final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); + addVertex(addedVertex); + for( final BaseEdge edge : outEdges ) { + addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity())); + } + for( final BaseEdge edge : inEdges ) { + addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity())); + } + + removeVertex(incomingVertex); + removeVertex(outgoingVertex); + return true; + } + } + + return false; + } + + /** + * Perform as many branch simplifications and merging operations as possible on this graph, + * modifying it in place. + */ + private void mergeBranchingNodes() { boolean foundNodesToMerge = true; while( foundNodesToMerge ) { foundNodesToMerge = false; for( final SeqVertex v : vertexSet() ) { - foundNodesToMerge = simplifyDiamond(v); + foundNodesToMerge = simplifyDiamondIfPossible(v); if ( foundNodesToMerge ) break; } @@ -153,8 +171,11 @@ public class SeqGraph extends BaseGraph { * \ | / / * b * - * @param v - * @return + * Only returns true if all outgoing edges of v go to vertices that all only connect to + * a single bottom node, and that all middle nodes have only the single edge + * + * @param v the vertex to test if its the top of a diamond pattern + * @return true if v is the root of a diamond */ protected boolean isRootOfDiamond(final SeqVertex v) { final Set ve = outgoingEdgesOf(v); @@ -173,6 +194,7 @@ public class SeqGraph extends BaseGraph { if ( inDegreeOf(mi) != 1 ) return false; + // make sure that all outgoing vertices of mi go only to the bottom node for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { if ( bottom == null ) bottom = mt; @@ -181,9 +203,24 @@ 
public class SeqGraph extends BaseGraph { } } + // bottom has some connections coming in from other nodes, don't allow + if ( inDegreeOf(bottom) != ve.size() ) + return false; + return true; } + /** + * Return the longest suffix of bases shared among all provided vertices + * + * For example, if the vertices have sequences AC, CC, and ATC, this would return + * a single C. However, for ACC and TCC this would return CC. And for AC and TG this + * would return null; + * + * @param middleVertices a non-empty set of vertices + * @return + */ + @Requires("!middleVertices.isEmpty()") private byte[] commonSuffixOfEdgeTargets(final Set middleVertices) { final String[] kmers = new String[middleVertices.size()]; @@ -196,6 +233,14 @@ public class SeqGraph extends BaseGraph { return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes(); } + /** + * Get the node that is the bottom of a diamond configuration in the graph starting at top + * + * @param top + * @return + */ + @Requires("top != null") + @Ensures({"result != null"}) private SeqVertex getDiamondBottom(final SeqVertex top) { final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next(); final SeqVertex middle = getEdgeTarget(topEdge); @@ -203,6 +248,13 @@ public class SeqGraph extends BaseGraph { return getEdgeTarget(middleEdge); } + /** + * Get the set of vertices that are in the middle of a diamond starting at top + * @param top + * @return + */ + @Requires("top != null") + @Ensures({"result != null", "!result.isEmpty()"}) final Set getMiddleVertices(final SeqVertex top) { final Set middles = new HashSet(); for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) { @@ -211,7 +263,26 @@ public class SeqGraph extends BaseGraph { return middles; } - private boolean simplifyDiamond(final SeqVertex top) { + /** + * Simply a diamond configuration in the current graph starting at top, if possible + * + * If top is actually the top of a diamond that can be simplified (i.e., doesn't have any + * 
random edges or other structure that would cause problems with the transformation), then this code + * performs the following transformation on this graph (modifying it): + * + * A -> M1 -> B, A -> M2 -> B, A -> Mn -> B + * + * becomes + * + * A -> M1' -> B', A -> M2' -> B', A -> Mn' -> B' + * + * where B' is composed of the longest common suffix of all Mi nodes + B, and Mi' are each + * middle vertex without their shared suffix. + * + * @param top a proposed vertex in this graph that might start a diamond (but doesn't have to) + * @return true top actually starts a diamond and it could be simplified + */ + private boolean simplifyDiamondIfPossible(final SeqVertex top) { if ( ! isRootOfDiamond(top) ) return false; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java index 2db35e173..dfbe50668 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java @@ -58,8 +58,7 @@ public class DeBruijnVertexUnitTest extends BaseTest { Assert.assertEquals(v.getSequence(), bases); Assert.assertEquals(v.getSequenceString(), new String(bases)); Assert.assertEquals(v.length(), bases.length); - Assert.assertEquals(v.getSuffix().length, 1); - Assert.assertEquals(v.getSuffix()[0], (byte)'T'); + Assert.assertEquals(v.getSuffix(), (byte)'T'); Assert.assertEquals(v.getSuffixString(), "T"); Assert.assertEquals(v.getAdditionalSequence(true), bases); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java index b5089e878..c63996d66 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java @@ -51,6 +51,10 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + public class SeqGraphUnitTest extends BaseTest { private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { public byte[] sequence; @@ -75,7 +79,7 @@ public class SeqGraphUnitTest extends BaseTest { deBruijnGraph.addKmersToGraph(kmer1, kmer2, false, 1); } final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); - seqGraph.mergeNodes(); + seqGraph.simplifyGraph(); return seqGraph; } } @@ -103,4 +107,208 @@ public class SeqGraphUnitTest extends BaseTest { final SeqVertex actualV = actual.vertexSet().iterator().next(); Assert.assertEquals(actualV.getSequence(), cfg.sequence); } + + @DataProvider(name = "IsDiamondData") + public Object[][] makeIsDiamondData() throws Exception { + List tests = new ArrayList(); + + SeqGraph graph; + SeqVertex pre1, pre2, top, middle1, middle2, middle3, bottom, tail1, tail2; + + graph = new SeqGraph(); + + pre1 = new SeqVertex("ACT"); + pre2 = new SeqVertex("AGT"); + top = new SeqVertex("A"); + middle1 = new SeqVertex("CT"); + middle2 = new SeqVertex("CG"); + middle3 = new SeqVertex("CA"); + bottom = new SeqVertex("AA"); + tail1 = new SeqVertex("GC"); + tail2 = new SeqVertex("GC"); + + graph.addVertices(pre1, pre2, top, middle1, middle2, middle3, bottom, tail1, tail2); + graph.addEdges(pre1, top, middle1, bottom, tail1); + graph.addEdges(pre2, top, middle2, bottom, tail1); + graph.addEdges(top, middle3, bottom); + graph.addEdges(bottom, tail2); + + for ( final SeqVertex no : Arrays.asList(pre1, pre2, middle1, middle2, middle3, bottom, tail1, tail2)) { + tests.add(new Object[]{graph, no, false}); + } + 
tests.add(new Object[]{graph, top, true}); + + final SeqGraph danglingMiddleGraph = (SeqGraph)graph.clone(); + final SeqVertex danglingMiddle = new SeqVertex("A"); + danglingMiddleGraph.addVertex(danglingMiddle); + danglingMiddleGraph.addEdge(top, danglingMiddle); + tests.add(new Object[]{danglingMiddleGraph, top, false}); + + final SeqGraph strangerToBottom = (SeqGraph)graph.clone(); + final SeqVertex notAttachedToTop = new SeqVertex("A"); + strangerToBottom.addVertex(notAttachedToTop); + strangerToBottom.addEdge(notAttachedToTop, bottom); + tests.add(new Object[]{strangerToBottom, top, false}); + + final SeqGraph strangerToMiddle = (SeqGraph)graph.clone(); + final SeqVertex attachedToMiddle = new SeqVertex("A"); + strangerToMiddle.addVertex(attachedToMiddle); + strangerToMiddle.addEdge(attachedToMiddle, middle1); + tests.add(new Object[]{strangerToMiddle, top, false}); + + // middle1 has outgoing edge to non-bottom + final SeqGraph middleExtraOut = (SeqGraph)graph.clone(); + final SeqVertex fromMiddle = new SeqVertex("A"); + middleExtraOut.addVertex(fromMiddle); + middleExtraOut.addEdge(middle1, fromMiddle); + tests.add(new Object[]{middleExtraOut, top, false}); + + // top connects to bottom directly as well + { + final SeqGraph topConnectsToBottomToo = new SeqGraph(); + final SeqVertex top2 = new SeqVertex("A"); + final SeqVertex middle4 = new SeqVertex("C"); + final SeqVertex bottom2 = new SeqVertex("G"); + topConnectsToBottomToo.addVertices(top2, middle4, bottom2); + topConnectsToBottomToo.addEdges(top2, middle4, bottom2); + topConnectsToBottomToo.addEdges(top2, bottom2); + tests.add(new Object[]{topConnectsToBottomToo, top2, false}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "IsDiamondData", enabled = true) + public void testIsDiamond(final SeqGraph graph, final SeqVertex v, final boolean isRootOfDiamond) { + Assert.assertEquals(graph.isRootOfDiamond(v), isRootOfDiamond); + } + + @DataProvider(name = "MergingData") + public 
Object[][] makeMergingData() throws Exception { + List tests = new ArrayList(); + + final SeqGraph graph = new SeqGraph(); + + SeqVertex pre1 = new SeqVertex("ACT"); + SeqVertex pre2 = new SeqVertex("AGT"); + SeqVertex top = new SeqVertex("A"); + SeqVertex middle1 = new SeqVertex("GC"); + SeqVertex middle2 = new SeqVertex("TC"); + SeqVertex middle3 = new SeqVertex("AC"); + SeqVertex middle4 = new SeqVertex("GCAC"); + SeqVertex bottom = new SeqVertex("AA"); + SeqVertex tail1 = new SeqVertex("GC"); + SeqVertex tail2 = new SeqVertex("GC"); + + // just a single vertex + graph.addVertices(pre1); + tests.add(new Object[]{graph.clone(), graph.clone()}); + + // pre1 -> top = pre1 + top + { + graph.addVertices(top); + graph.addEdges(pre1, top); + final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); + final SeqGraph expected = new SeqGraph(); + expected.addVertex(pre1_top); + tests.add(new Object[]{graph.clone(), expected.clone()}); + } + + // pre1 -> top -> middle1 = pre1 + top + middle1 + { + graph.addVertices(middle1); + graph.addEdges(top, middle1); + final SeqGraph expected = new SeqGraph(); + final SeqVertex pre1_top_middle1 = new SeqVertex(pre1.getSequenceString() + top.getSequenceString() + middle1.getSequenceString()); + expected.addVertex(pre1_top_middle1); + tests.add(new Object[]{graph.clone(), expected}); + } + + // pre1 -> top -> middle1 & top -> middle2 = pre1 + top -> middle1 & -> middle2 + { + graph.addVertices(middle2); + graph.addEdges(top, middle2); + final SeqGraph expected = new SeqGraph(); + final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); + expected.addVertices(pre1_top, middle1, middle2); + expected.addEdges(pre1_top, middle1); + expected.addEdges(pre1_top, middle2); + tests.add(new Object[]{graph.clone(), expected}); + } + + // An actual diamond event to merge! 
+ { + graph.addVertices(bottom); + graph.addEdges(middle1, bottom); + graph.addEdges(middle2, bottom); + final SeqGraph expected = new SeqGraph(); + final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); + final SeqVertex newMiddle1 = new SeqVertex("G"); + final SeqVertex newMiddle2 = new SeqVertex("T"); + final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString()); + expected.addVertices(pre1_top, newMiddle1, newMiddle2, newBottom); + expected.addEdges(pre1_top, newMiddle1, newBottom); + expected.addEdges(pre1_top, newMiddle2, newBottom); + tests.add(new Object[]{graph.clone(), expected.clone()}); + + graph.addVertices(middle3); + graph.addEdges(top, middle3, bottom); + final SeqVertex newMiddle3 = new SeqVertex("A"); + expected.addVertices(newMiddle3); + expected.addEdges(pre1_top, newMiddle3, newBottom); + tests.add(new Object[]{graph.clone(), expected.clone()}); + + graph.addVertices(middle4); + graph.addEdges(top, middle4, bottom); + final SeqVertex newMiddle4 = new SeqVertex("GCA"); + expected.addVertices(newMiddle4); + expected.addEdges(pre1_top, newMiddle4, newBottom); + tests.add(new Object[]{graph.clone(), expected.clone()}); + } + + { + final SeqGraph all = new SeqGraph(); + all.addVertices(pre1, pre2, top, middle1, middle2, bottom, tail1, tail2); + all.addEdges(pre1, top, middle1, bottom, tail1); + all.addEdges(pre2, top, middle2, bottom, tail2); + + final SeqGraph expected = new SeqGraph(); + final SeqVertex newMiddle1 = new SeqVertex("G"); + final SeqVertex newMiddle2 = new SeqVertex("T"); + final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString()); + expected.addVertices(pre1, pre2, top, newMiddle1, newMiddle2, newBottom, tail1, tail2); + expected.addEdges(pre1, top, newMiddle1, newBottom, tail1); + expected.addEdges(pre2, top, newMiddle2, newBottom, tail2); + tests.add(new Object[]{all.clone(), expected.clone()}); + } + + // test the case where we delete a middle node away 
because the common sequence is all of its sequence + { + final SeqGraph graph2 = new SeqGraph(); + final SeqVertex mytop = new SeqVertex("A"); + final SeqVertex mid1 = new SeqVertex("AC"); + final SeqVertex mid2 = new SeqVertex("C"); + final SeqVertex bot = new SeqVertex("G"); + graph2.addVertices(mytop, mid1, mid2, bot); + graph2.addEdges(mytop, mid1, bot); + graph2.addEdges(mytop, mid2, bot); + + final SeqGraph expected = new SeqGraph(); + final SeqVertex newMid1 = new SeqVertex("A"); + final SeqVertex newBottom = new SeqVertex("CG"); + expected.addVertices(mytop, newMid1, newBottom); + expected.addEdges(mytop, newMid1, newBottom); + expected.addEdges(mytop, newBottom); + tests.add(new Object[]{graph2, expected}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MergingData", enabled = true) + public void testMerging(final SeqGraph graph, final SeqGraph expected) { + final SeqGraph merged = (SeqGraph)graph.clone(); + merged.simplifyGraph(); + Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); + } } From d3b756bdc737ef880fe1416989803e71716ccdea Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 08:39:01 -0400 Subject: [PATCH 092/226] BaseVertex optimization: don't clone byte[] unnecessarily -- Don't clone sequence upon construction or in getSequence(), as these are frequently called, memory allocating routines and cloning will be prohibitively expensive --- .../walkers/haplotypecaller/BaseVertex.java | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java index b6d278105..93bd4f5c5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java @@ -61,14 +61,16 @@ public 
class BaseVertex { /** * Create a new sequence vertex with sequence + * + * This code doesn't copy sequence for efficiency reasons, so sequence should absolutely not be modified + * in any way after passing this sequence to the BaseVertex + * * @param sequence a non-null, non-empty sequence of bases contained in this vertex */ public BaseVertex(final byte[] sequence) { if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); if ( sequence.length == 0 ) throw new IllegalArgumentException("Sequence cannot be empty"); - - // TODO -- should we really be cloning here? - this.sequence = sequence.clone(); + this.sequence = sequence; } /** @@ -81,7 +83,7 @@ public class BaseVertex { /** * For testing purposes only -- low performance - * @param sequence + * @param sequence the sequence as a string */ protected BaseVertex(final String sequence) { this(sequence.getBytes()); @@ -109,8 +111,13 @@ public class BaseVertex { return Arrays.equals(this.getSequence(), b.getSequence()); } + /** + * necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect + * @return + */ @Override - public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect + public int hashCode() { + // TODO -- optimization, could compute upfront once and cached in debruijn graph return Arrays.hashCode(sequence); } @@ -128,8 +135,7 @@ public class BaseVertex { */ @Ensures("result != null") public byte[] getSequence() { - // TODO -- why is this cloning? It's likely extremely expensive - return sequence.clone(); + return sequence; } /** From 3a8f001c276808dbb78e199202e7174d66e5e6c6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 14:26:37 -0400 Subject: [PATCH 093/226] Misc. fixes upon pull request review -- DeBruijnAssemblerUnitTest and AlignmentUtilsUnitTest were both in DEBUG = true mode (bad!) 
-- Remove the maxHaplotypesToConsider feature of HC as it's not useful --- .../haplotypecaller/DeBruijnAssembler.java | 34 ++++--------------- .../haplotypecaller/HaplotypeCaller.java | 6 +--- .../DeBruijnAssemblerUnitTest.java | 3 +- .../utils/sam/AlignmentUtilsUnitTest.java | 2 +- 4 files changed, 10 insertions(+), 35 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 6d295ff97..f3db422e7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -92,7 +92,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private final boolean debugGraphTransformations; private final PrintStream graphWriter; private final int minKmer; - private final int maxHaplotypesToConsider; private final byte minBaseQualityToUseInAssembly; private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; @@ -100,14 +99,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private int PRUNE_FACTOR = 2; protected DeBruijnAssembler() { - this(false, -1, null, 11, 1000, DEFAULT_MIN_BASE_QUALITY_TO_USE); + this(false, -1, null, 11, DEFAULT_MIN_BASE_QUALITY_TO_USE); } public DeBruijnAssembler(final boolean debug, final int debugGraphTransformations, final PrintStream graphWriter, final int minKmer, - final int maxHaplotypesToConsider, final byte minBaseQualityToUseInAssembly) { super(); this.debug = debug; @@ -115,7 +113,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; this.graphWriter = graphWriter; this.minKmer = minKmer; - this.maxHaplotypesToConsider = maxHaplotypesToConsider; this.minBaseQualityToUseInAssembly = 
minBaseQualityToUseInAssembly; } @@ -371,39 +368,22 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - final List finalHaplotypes = selectHighestScoringHaplotypes(returnHaplotypes); - if ( finalHaplotypes.size() < returnHaplotypes.size() ) - logger.info("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); + if ( returnHaplotypes.size() < returnHaplotypes.size() ) + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); if( debug ) { - if( finalHaplotypes.size() > 1 ) { - System.out.println("Found " + finalHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); + if( returnHaplotypes.size() > 1 ) { + System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); } else { System.out.println("Found only the reference haplotype in the assembly graph."); } - for( final Haplotype h : finalHaplotypes ) { + for( final Haplotype h : returnHaplotypes ) { System.out.println( h.toString() ); System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); } } - return finalHaplotypes; - } - - /** - * Select the best scoring haplotypes among all present, returning no more than maxHaplotypesToConsider - * - * @param haplotypes a list of haplotypes to consider - * @return a sublist of the best haplotypes, with size() <= maxHaplotypesToConsider - */ - private List selectHighestScoringHaplotypes(final List haplotypes) { - if ( haplotypes.size() <= maxHaplotypesToConsider ) - return haplotypes; - else { - final List sorted = new ArrayList(haplotypes); - Collections.sort(sorted, new 
Haplotype.ScoreComparator()); - return sorted.subList(0, maxHaplotypesToConsider); - } + return returnHaplotypes; } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 7bec4bee5..31751d8f0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -206,10 +206,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) protected int minKmer = 11; - @Advanced - @Argument(fullName="maxHaplotypesToConsider", shortName="maxHaplotypesToConsider", doc="Maximum number of haplotypes to consider in the likelihood calculation. Setting this number too high can have dramatic performance implications", required = false) - protected int maxHaplotypesToConsider = 100000; - /** * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the @@ -393,7 +389,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? 
(byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE; - assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, maxHaplotypesToConsider, minBaseQualityToUseInAssembly ); + assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, minBaseQualityToUseInAssembly ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index fa581f7fd..663d619a8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -61,13 +61,12 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; public class DeBruijnAssemblerUnitTest extends BaseTest { - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; @Test(enabled = !DEBUG) public void testReferenceCycleGraph() { diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index 660dadc00..125450257 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -37,7 +37,7 @@ import org.testng.annotations.Test; import 
java.util.*; public class AlignmentUtilsUnitTest { - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; private SAMFileHeader header; /** Basic aligned and mapped read. */ From 6d7d21ca47a35b9925db55401464a3e7d86d9418 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 15:57:10 -0400 Subject: [PATCH 094/226] Bugfix for incorrect branch diamond merging algorithm -- Previous version was just incorrectly accumulating information about nodes that were completely eliminated by the common suffix, so we were dropping some reference connections between vertices. Fixed. In the process simplified the entire algorithm and codebase -- Resolves https://jira.broadinstitute.org/browse/GSA-884 --- .../walkers/haplotypecaller/BaseEdge.java | 14 +++++ .../walkers/haplotypecaller/BaseGraph.java | 19 +++++- .../walkers/haplotypecaller/SeqGraph.java | 60 ++++++++----------- .../haplotypecaller/SeqGraphUnitTest.java | 30 ++++++++++ 4 files changed, 85 insertions(+), 38 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java index 7b5fd2bbd..d49b63672 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java @@ -143,4 +143,18 @@ public class BaseEdge { return edge2.multiplicity - edge1.multiplicity; } } + + /** + * Add edge to this edge, updating isRef and multiplicity as appropriate + * + * isRef is simply the or of this and edge + * multiplicity is the sum + * + * @param edge the edge to add + */ + public void add(final BaseEdge edge) { + if ( edge == null ) throw new IllegalArgumentException("edge cannot be null"); + this.multiplicity += edge.getMultiplicity(); + this.isRef = this.isRef || edge.isRef(); + } } diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index ec5c99bb1..c77ec4222 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -47,11 +47,11 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; -import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; @@ -64,7 +64,7 @@ import java.util.*; * User: rpoplin * Date: 2/6/13 */ - +@Invariant("!this.isAllowingMultipleEdges()") public class BaseGraph extends DefaultDirectedGraph { protected final static Logger logger = Logger.getLogger(BaseGraph.class); private final int kmerSize; @@ -513,4 +513,19 @@ public class BaseGraph extends DefaultDirectedGraph edges = getAllEdges(source, target); + return edges.isEmpty() ? 
null : edges.iterator().next(); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java index f67815b92..b855390c6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java @@ -51,6 +51,7 @@ import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; +import java.io.File; import java.util.*; /** @@ -149,7 +150,7 @@ public class SeqGraph extends BaseGraph { * Perform as many branch simplifications and merging operations as possible on this graph, * modifying it in place. */ - private void mergeBranchingNodes() { + protected void mergeBranchingNodes() { boolean foundNodesToMerge = true; while( foundNodesToMerge ) { foundNodesToMerge = false; @@ -288,61 +289,48 @@ public class SeqGraph extends BaseGraph { final SeqVertex diamondBottom = getDiamondBottom(top); final Set middleVertices = getMiddleVertices(top); - final List verticesToRemove = new LinkedList(); final List edgesToRemove = new LinkedList(); // all of the edges point to the same sink, so it's time to merge final byte[] commonSuffix = commonSuffixOfEdgeTargets(middleVertices); if ( commonSuffix != null ) { - boolean newBottomEdgeIsRef = false; - int newBottomEdgeMultiplicity = 0; - + final BaseEdge botToNewBottom = new BaseEdge(false, 0); + final BaseEdge elimMiddleNodeEdge = new BaseEdge(false, 0); final SeqVertex newBottomV = new SeqVertex(commonSuffix); addVertex(newBottomV); for ( final SeqVertex middle : middleVertices ) { - boolean missingNodeEdgeIsRef = false; - int missingNodeMultiplicity = 0; final SeqVertex withoutSuffix = middle.withoutSuffix(commonSuffix); + final BaseEdge topToMiddleEdge = getEdge(top, middle); + final BaseEdge middleToBottomE = 
getEdge(middle, diamondBottom); - if ( withoutSuffix != null ) // this node is a deletion + // clip out the two edges, since we'll be replacing them later + edgesToRemove.add(topToMiddleEdge); + edgesToRemove.add(middleToBottomE); + + if ( withoutSuffix != null ) { // this node is a deletion addVertex(withoutSuffix); - - // update all edges from top -> middle to be top -> without suffix - for( final BaseEdge topToMiddleEdge : getAllEdges(top, middle) ) { - edgesToRemove.add(topToMiddleEdge); - missingNodeMultiplicity += topToMiddleEdge.getMultiplicity(); - missingNodeEdgeIsRef = missingNodeEdgeIsRef || topToMiddleEdge.isRef(); - if ( withoutSuffix != null ) // this node is a deletion - addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge.isRef(), topToMiddleEdge.getMultiplicity())); + // update edge from top -> middle to be top -> without suffix + addEdge(top, withoutSuffix, new BaseEdge(topToMiddleEdge)); + addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE)); + } else { + // this middle node is == the common suffix, wo we're removing the edge + elimMiddleNodeEdge.add(topToMiddleEdge); } - - // reattached prefix to the new bottom V by updating all edges from middleV -> bottom - for ( final BaseEdge middleToBottomE : getAllEdges(middle, diamondBottom) ) { - missingNodeMultiplicity += middleToBottomE.getMultiplicity(); - missingNodeEdgeIsRef = missingNodeEdgeIsRef || middleToBottomE.isRef(); - - if ( withoutSuffix != null ) // this node is a deletion - addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE.isRef(), middleToBottomE.getMultiplicity())); - edgesToRemove.add(middleToBottomE); - - // update the info for the new bottom edge - newBottomEdgeIsRef = newBottomEdgeIsRef || middleToBottomE.isRef(); - newBottomEdgeMultiplicity += middleToBottomE.getMultiplicity(); - } - - if ( withoutSuffix == null ) // add an edge from top to new bottom - addEdge(top, newBottomV, new BaseEdge(missingNodeEdgeIsRef, missingNodeMultiplicity)); - + // 
include the ref and multi of mid -> bot in our edge from new bot -> bot + botToNewBottom.add(middleToBottomE); verticesToRemove.add(middle); } - addEdge(newBottomV, diamondBottom, new BaseEdge(newBottomEdgeIsRef, newBottomEdgeMultiplicity)); + // add an edge from top to new bottom, because some middle nodes were removed + if ( elimMiddleNodeEdge.getMultiplicity() > 0 ) + addEdge(top, newBottomV, elimMiddleNodeEdge); + + addEdge(newBottomV, diamondBottom, botToNewBottom); removeAllEdges(edgesToRemove); removeAllVertices(verticesToRemove); - return true; } else { return false; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java index c63996d66..83a4f4c50 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java @@ -51,6 +51,7 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -311,4 +312,33 @@ public class SeqGraphUnitTest extends BaseTest { merged.simplifyGraph(); Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); } + + // A -> ACT -> C [non-ref] + // A -> ACT -> C [non-ref] + // A -> ACT -> C [ref] + // + // Should become A -> ACT -> C [ref and non-ref edges] + // + @Test + public void testBubbleSameBasesWithRef() { + final SeqGraph graph = new SeqGraph(); + final SeqVertex top = new SeqVertex("A"); + final SeqVertex mid1 = new SeqVertex("ACT"); + final SeqVertex mid2 = new SeqVertex("ACT"); + final SeqVertex bot = new SeqVertex("C"); + graph.addVertices(top, mid1, mid2, bot); + graph.addEdges(top, mid2, bot); + graph.addEdge(top, mid1, new BaseEdge(true, 1)); + graph.addEdge(mid1, bot, new 
BaseEdge(true, 1)); + + final SeqGraph expected = new SeqGraph(); + expected.addVertices(top, mid1, bot); + expected.addEdge(top, mid1, new BaseEdge(true, 2)); + expected.addEdge(mid1, bot, new BaseEdge(true, 2)); + + final SeqGraph actual = ((SeqGraph)graph.clone()); + actual.mergeBranchingNodes(); + + Assert.assertTrue(BaseGraph.graphEquals(actual, expected)); + } } From d94b3f85bcd56583bc25aa94d9ece3916df39908 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 20:56:58 -0400 Subject: [PATCH 095/226] Increase NUM_BEST_PATHS_PER_KMER_GRAPH in DeBruijnAssembler to 25 -- The value of 11 was too small to properly return a real low-frequency variant in the 1000G AFR integration test. --- .../gatk/walkers/haplotypecaller/DeBruijnAssembler.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index f3db422e7..7cf4cc8d3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -78,7 +78,11 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class); private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers - private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11; + + // TODO -- this number is very low, and limits our ability to explore low-frequnecy variants. 
It should + // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where + // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases + private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25; public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; private static final int GRAPH_KMER_STEP = 6; From aa7f172b18ff5ad8e5e881dce451c30ad362d61a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 20 Mar 2013 22:40:10 -0400 Subject: [PATCH 096/226] Cap the computational cost of the kmer based error correction in the DeBruijnGraph -- Simply don't do more than MAX_CORRECTION_OPS_TO_ALLOW = 5000 * 1000 operations to correct a graph. If the number of ops would exceed this threshold, the original graph is used. -- Overall the algorithm is just extremely computationally expensive, and actually doesn't implement the correct correction. So we live with these limitations while we continue to explore better algorithms -- Updating MD5s to reflect changes in assembly algorithms --- .../haplotypecaller/DeBruijnGraph.java | 25 ++-- .../haplotypecaller/KMerErrorCorrector.java | 135 ++++++++++++++---- ...lexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 2 +- 4 files changed, 127 insertions(+), 37 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java index d9df03539..0e20c311b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java @@ -90,7 +90,8 @@ public class DeBruijnGraph extends BaseGraph { /** * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers - * @return a freshly allocated 
graph + * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason + * we cannot actually do the error correction */ protected DeBruijnGraph errorCorrect() { final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables @@ -101,19 +102,23 @@ public class DeBruijnGraph extends BaseGraph { corrector.addKmer(kmer, e.isRef() ? 1000 : e.getMultiplicity()); } } - corrector.computeErrorCorrectionMap(); - final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize()); + if ( corrector.computeErrorCorrectionMap() ) { + final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize()); - for( final BaseEdge e : edgeSet() ) { - final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence()); - final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence()); - if ( source != null && target != null ) { - correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); + for( final BaseEdge e : edgeSet() ) { + final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence()); + final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence()); + if ( source != null && target != null ) { + correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); + } } - } - return correctedGraph; + return correctedGraph; + } else { + // the error correction wasn't possible, simply return this graph + return this; + } } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java index 05bd1b881..b051e5411 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +import org.apache.log4j.Logger; + import java.util.*; /** @@ -69,15 +71,54 @@ import java.util.*; * TODO -- be added to hashmaps (more specifically, those don't implement .equals). A more efficient * TODO -- version would use the byte[] directly * + * TODO -- this is just not the right way to implement error correction in the graph. Basically, the + * right way to think about this is error correcting reads: + * + * * + * ACTGAT + * ACT + * CTG + * TGA + * GAT + * + * Now suppose the G is an error. What you are doing is asking for each 3mer in the read whether it's high quality + * or not. Suppose the answer is + * + * * + * ACTGAT + * ACT -- yes + * CTG -- no [CTG is unusual] + * TGA -- no [TGA is unusual] + * GAT -- yes [maybe GAT is just common, even through its an error] + * + * As we do this process it's clear how we can figure out which positions in the read likely harbor errors, and + * then go search around those bases in the read in an attempt to fix the read. We don't have to compute for + * every bad kmer it's best match, as that's just not the problem we are thinking looking to solve. We are actually + * looking for a change to a read such that all spanning kmers are well-supported. This class is being disabled + * until we figure implement this change. + * + * * User: depristo * Date: 3/8/13 * Time: 1:16 PM */ public class KMerErrorCorrector { + private final static Logger logger = Logger.getLogger(KMerErrorCorrector.class); + + /** + * The maximum number of bad kmer -> good kmer correction operations we'll consider doing before + * aborting for efficiency reasons. Basically, the current algorithm sucks, and is O(n^2), and + * so we cannot simply error correct 10K bad kmers against a db of 100K kmers if we ever want + * to finish running in a reasonable amount of time. 
This isn't worth fixing because fundamentally + * the entire error correction algorithm is just not right (i.e., it's correct but not ideal conceptually + * so we'll just fix the conceptual problem than the performance issue). + */ + private final static int MAX_CORRECTION_OPS_TO_ALLOW = 5000 * 1000; + /** * A map of for each kmer to its num occurrences in addKmers */ - Map countsByKMer = new HashMap(); + Map countsByKMer = new HashMap(); /** * A map from raw kmer -> error corrected kmer @@ -154,35 +195,45 @@ public class KMerErrorCorrector { * Indicate that no more kmers will be added to the kmer error corrector, so that the * error correction data structure should be computed from the added kmers. Enabled calls * to getErrorCorrectedKmer, and disable calls to addKmer. + * + * @return true if the error correction map could actually be computed, false if for any reason + * (efficiency, memory, we're out to lunch) a correction map couldn't be created. */ - public void computeErrorCorrectionMap() { + public boolean computeErrorCorrectionMap() { if ( countsByKMer == null ) throw new IllegalStateException("computeErrorCorrectionMap can only be called once"); - final LinkedList needsCorrection = new LinkedList(); - final LinkedList goodKmers = new LinkedList(); + final LinkedList needsCorrection = new LinkedList(); + final List goodKmers = new ArrayList(countsByKMer.size()); - rawToErrorCorrectedMap = new HashMap(); - for ( Map.Entry kmerCounts: countsByKMer.entrySet() ) { - if ( kmerCounts.getValue() <= maxCountToCorrect ) - needsCorrection.add(kmerCounts.getKey()); + rawToErrorCorrectedMap = new HashMap(countsByKMer.size()); + for ( final CountedKmer countedKmer: countsByKMer.values() ) { + if ( countedKmer.count <= maxCountToCorrect ) + needsCorrection.add(countedKmer); else { // todo -- optimization could make not in map mean == - rawToErrorCorrectedMap.put(kmerCounts.getKey(), kmerCounts.getKey()); + rawToErrorCorrectedMap.put(countedKmer.kmer, countedKmer.kmer); 
// only allow corrections to kmers with at least this count - if ( kmerCounts.getValue() >= minCountOfKmerToBeCorrection ) - goodKmers.add(kmerCounts.getKey()); + if ( countedKmer.count >= minCountOfKmerToBeCorrection ) + goodKmers.add(countedKmer); } } - for ( final String toCorrect : needsCorrection ) { - final String corrected = findClosestKMer(toCorrect, goodKmers); - rawToErrorCorrectedMap.put(toCorrect, corrected); - } - // cleanup memory -- we don't need the counts for each kmer any longer countsByKMer = null; + + if ( goodKmers.size() * needsCorrection.size() > MAX_CORRECTION_OPS_TO_ALLOW ) + return false; + else { + Collections.sort(goodKmers); + for ( final CountedKmer toCorrect : needsCorrection ) { + final String corrected = findClosestKMer(toCorrect, goodKmers); + rawToErrorCorrectedMap.put(toCorrect.kmer, corrected); + } + + return true; + } } protected void addKmer(final String rawKmer, final int kmerCount) { @@ -190,30 +241,42 @@ public class KMerErrorCorrector { if ( kmerCount < 0 ) throw new IllegalArgumentException("bad kmerCount " + kmerCount); if ( countsByKMer == null ) throw new IllegalStateException("Cannot add kmers to an already finalized error corrector"); - final Integer countFromMap = countsByKMer.get(rawKmer); - final int count = countFromMap == null ? 
0 : countFromMap; - countsByKMer.put(rawKmer, count + kmerCount); + CountedKmer countFromMap = countsByKMer.get(rawKmer); + if ( countFromMap == null ) { + countFromMap = new CountedKmer(rawKmer); + countsByKMer.put(rawKmer, countFromMap); + } + countFromMap.count += kmerCount; } - protected String findClosestKMer(final String kmer, final Collection goodKmers) { + protected String findClosestKMer(final CountedKmer kmer, final Collection goodKmers) { String bestMatch = null; int minMismatches = Integer.MAX_VALUE; - for ( final String goodKmer : goodKmers ) { - final int mismatches = countMismatches(kmer, goodKmer); + for ( final CountedKmer goodKmer : goodKmers ) { + final int mismatches = countMismatches(kmer.kmer, goodKmer.kmer, minMismatches); if ( mismatches < minMismatches ) { minMismatches = mismatches; - bestMatch = goodKmer; + bestMatch = goodKmer.kmer; } + + // if we find an edit-distance 1 result, abort early, as we know there can be no edit distance 0 results + if ( mismatches == 1 ) + break; } return minMismatches > maxMismatchesToCorrect ? null : bestMatch; } - protected int countMismatches(final String one, final String two) { + protected int countMismatches(final String one, final String two, final int currentBest) { int mismatches = 0; - for ( int i = 0; i < one.length(); i++ ) + for ( int i = 0; i < one.length(); i++ ) { mismatches += one.charAt(i) == two.charAt(i) ? 
0 : 1; + if ( mismatches > currentBest ) + break; + if ( mismatches > maxMismatchesToCorrect ) + return Integer.MAX_VALUE; + } return mismatches; } @@ -238,4 +301,26 @@ public class KMerErrorCorrector { b.append("\n}"); return b.toString(); } + + private static class CountedKmer implements Comparable { + final String kmer; + int count; + + private CountedKmer(String kmer) { + this.kmer = kmer; + } + + @Override + public String toString() { + return "CountedKmer{" + + "kmer='" + kmer + '\'' + + ", count=" + count + + '}'; + } + + @Override + public int compareTo(CountedKmer o) { + return o.count - count; + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index fd16ed856..12dc71799 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "2b9355ab532314bce157c918c7606409"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "91f4880910e436bf5aca0abbebd58948"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index c93e54f87..5ee0a6b81 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "9f9062a6eb93f984658492400102b0c7"); + "d41a886f69a67e01af2ba1a6b4a681d9"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { From b9c331c2fa3299244f87694bf1afd94b16a868f6 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 21 Mar 2013 10:59:34 -0400 Subject: [PATCH 097/226] Bug fix in HC gga mode. -- Don't try to test alleles which haven't had haplotypes assigned to them --- .../sting/gatk/walkers/haplotypecaller/GenotypingEngine.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 400de6485..cc9d94b1b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -631,6 +631,10 @@ public class GenotypingEngine { if( eventToTest.getKey().equals(new Event(null)) ) continue; + // only try to disambiguate for alleles that have had haplotypes previously assigned above + if( eventToTest.getValue().isEmpty() ) + continue; + final Haplotype artificialHaplotype = eventToTest.getValue().get(0); if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) { matchingEvent = eventToTest.getKey(); From 
7ae15dadbe21fdd8364da2c33d7e850e91a413c8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 21 Mar 2013 11:27:00 -0400 Subject: [PATCH 098/226] HC now by default only uses reads with MAPQ >= 20 for assembly and calling -- Previously we tried to include lots of these low mapping quality reads in the assembly and calling, but we effectively were just filtering them out anyway while generating an enormous amount of computational expense to handle them, as well as much larger memory requirements. The new version simply uses a read filter to remove them upfront. This causes no major problems -- at least, none that don't have other underlying causes -- compared to 10-11mb of the KB -- Update MD5s to reflect changes due to no longer including mmq < 20 by default --- .../haplotypecaller/HaplotypeCaller.java | 4 +- ...lexAndSymbolicVariantsIntegrationTest.java | 6 +-- .../HaplotypeCallerIntegrationTest.java | 12 ++--- .../HCMappingQualityFilter.java | 44 +++++++++++++++++++ 4 files changed, 55 insertions(+), 11 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 31751d8f0..81ff3dfbd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -56,7 +56,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.filters.*; import 
org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -134,9 +134,9 @@ import java.util.*; @PartitionBy(PartitionType.LOCUS) @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) @ActiveRegionTraversalParameters(extension=85, maxRegion=300) +@ReadFilters({HCMappingQualityFilter.class}) @Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { - /** * A raw, unfiltered, highly sensitive callset in VCF format. */ diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 12dc71799..830152903 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "91f4880910e436bf5aca0abbebd58948"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6dd29d6fec056419ab0fa03a7d43d85e"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -87,12 +87,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "f2add041ba1692db576ae9763a14b8a6"); + 
"84616464aed68f4d9bc9e08472eff9c0"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "383320e81a1a3bee880fcc6cd0564452"); + "e2d1023b846bfac31b4f7a3a4b90d931"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 5ee0a6b81..1b98b2239 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "75dbef605b28f02616b13bb5d8bf2fbd"); + HCTest(CEUTRIO_BAM, "", "9859b136d05085b5ec0833035289106a"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "fa8705a5d3ada66470019fa7ddcb9b2c"); + HCTest(NA12878_BAM, "", "27f660bf1c9a6ed7167d77022d401b73"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "d41a886f69a67e01af2ba1a6b4a681d9"); + "e25fc2196401a16347e0c730dbcbe828"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", 
"3a38f6fade253577d205a00db3e67828"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "325d7d73e0bd86b6cb146b249eda959a"); } @Test @@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("1e7b1bda6be5d3835ae318f2977cfbdd")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6d63f558259883262ea84f339acb767")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec97a0a65890169358842e765ff8dd15")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java new file mode 100644 index 000000000..3892ffe27 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * 
copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.filters.ReadFilter; + +/** + * Filter out reads with low mapping qualities. 
+ * + * @author mdepristo + */ +public class HCMappingQualityFilter extends ReadFilter { + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for analysis with the HaplotypeCaller", required = false) + public int MIN_MAPPING_QUALTY_SCORE = 20; + + public boolean filterOut(SAMRecord rec) { + return (rec.getMappingQuality() < MIN_MAPPING_QUALTY_SCORE); + } +} \ No newline at end of file From eb33da6820e71f62219f111ef0c831f8ef43b6ca Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 21 Mar 2013 15:29:27 -0400 Subject: [PATCH 099/226] Added support to reduce reads to Callable Loci -- added calls to representativeCount() of the pileup instead of using ++ -- renamed CallableLoci integration test -- added integration test for reduce read support on callable loci --- .../sting/gatk/walkers/coverage/CallableLoci.java | 8 ++++---- ...nTest.java => CallableLociIntegrationTest.java} | 14 ++++++++++++-- 2 files changed, 16 insertions(+), 6 deletions(-) rename public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/{CallableLociWalkerIntegrationTest.java => CallableLociIntegrationTest.java} (77%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java index a2efa626c..6af6723f2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java @@ -40,7 +40,6 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.BaseUtils; import java.io.File; import java.io.FileNotFoundException; @@ -314,13 +313,14 @@ public class 
CallableLoci extends LocusWalker= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) { - QCDepth++; + QCDepth += depth; } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java similarity index 77% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java index 859f6c4c7..6472a10bb 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java @@ -30,8 +30,9 @@ import org.testng.annotations.Test; import java.util.Arrays; -public class CallableLociWalkerIntegrationTest extends WalkerTest { - final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; +public class CallableLociIntegrationTest extends WalkerTest { + final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; + final static String reduceReadArgs = "-R " + b37KGReference + " -T CallableLoci -I " + " private/testdata/NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s"; final static String SUMMARY_MD5 = "ffdbd9cdcb4169ebed5ae4bec797260f"; @@ -66,4 +67,13 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { Arrays.asList("46a53379aaaf9803276a0a34b234f6ab", "da431d393f7c2b2b3e27556b86c1dbc7")); executeTest("formatBed lots of arguments", spec); } + + @Test(enabled=true) + public void testWithReducedRead() { + String gatk_args = reduceReadArgs + " -L 20:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 
--minBaseQuality 10 --minMappingQuality 20 -summary %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, + Arrays.asList("684069ffe94a1175051066ed53f0fd9d", "ebc310cf734d98e26d2d83e16b1144d1")); + executeTest("CallableLoci with ReducedRead", spec); + } + } From 965043472a2ea1b8c0a63f9441e83fe36960a451 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Mar 2013 08:30:49 -0400 Subject: [PATCH 100/226] Vastly more powerful, cleaner graph simplification approach -- Generalizes previous node merging and splitting approaches. Can split common prefixes and suffices among nodes, build a subgraph representing this new structure, and incorporate it into the original graph. Introduces the concept of edges with 0 multiplicity (for purely structural reasons) as well as vertices with no sequence (again, for structural reasons). Fully UnitTested. These new algorithms can now really simplify diamond configurations as well as ones sources and sinks that arrive / depart linearly at a common single root node. -- This new suite of algorithms is fully integrated into the HC, replacing previous approaches -- SeqGraph transformations are applied iteratively (zipping, splitting, merging) until no operations can be performed on the graph. This further simplifies the graphs, as splitting nodes may enable other merging / zip operations to go. 
--- .../walkers/haplotypecaller/BaseEdge.java | 32 +- .../walkers/haplotypecaller/BaseGraph.java | 84 +++- .../walkers/haplotypecaller/BaseVertex.java | 13 +- .../haplotypecaller/DeBruijnAssembler.java | 10 +- .../gatk/walkers/haplotypecaller/Path.java | 20 + .../walkers/haplotypecaller/SeqGraph.java | 365 +++++++++--------- .../walkers/haplotypecaller/SeqVertex.java | 15 + .../SharedVertexSequenceSplitter.java | 341 ++++++++++++++++ .../haplotypecaller/BaseVertexUnitTest.java | 5 +- .../haplotypecaller/SeqGraphUnitTest.java | 34 +- .../SharedVertexSequenceSplitterUnitTest.java | 253 ++++++++++++ 11 files changed, 966 insertions(+), 206 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java index d49b63672..07a6629d7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import java.io.Serializable; +import java.util.Collection; import java.util.Comparator; /** @@ -151,10 +152,39 @@ public class BaseEdge { * multiplicity is the sum * * @param edge the edge to add + * @return this */ - public void add(final BaseEdge edge) { + public BaseEdge add(final BaseEdge edge) { if ( edge == null ) throw new IllegalArgumentException("edge cannot be null"); this.multiplicity += edge.getMultiplicity(); this.isRef = this.isRef || edge.isRef(); + return this; + } + + /** + * Create a new BaseEdge with multiplicity and isRef that's an or of all edges + * + * @param 
edges a collection of edges to or their isRef values + * @param multiplicity our desired multiplicity + * @return a newly allocated BaseEdge + */ + public static BaseEdge orRef(final Collection edges, final int multiplicity) { + for ( final BaseEdge e : edges ) + if ( e.isRef() ) + return new BaseEdge(true, multiplicity); + return new BaseEdge(false, multiplicity); + } + + /** + * Return a new edge that the max of this and edge + * + * isRef is simply the or of this and edge + * multiplicity is the max + * + * @param edge the edge to max + */ + public BaseEdge max(final BaseEdge edge) { + if ( edge == null ) throw new IllegalArgumentException("edge cannot be null"); + return new BaseEdge(isRef() || edge.isRef(), Math.max(getMultiplicity(), edge.getMultiplicity())); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index c77ec4222..c3f371ec7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -48,10 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; @@ -219,6 +221,15 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph vertices) { + for ( final T v : vertices ) + addVertex(v); + } + /** * Convenience function to add multiple edges to the graph * @param start the first vertex to connect @@ -426,6 +446,23 @@ public class BaseGraph extends 
DefaultDirectedGraph verticesToRemove = new HashSet(vertexSet()); + final DepthFirstIterator dfi = new DepthFirstIterator(this, getReferenceSourceVertex()); + while ( dfi.hasNext() ) { + final T accessibleFromRefSource = dfi.next(); + // we also want to prune all sinks that aren't the reference sink + if ( ! isNonRefSink(accessibleFromRefSource) ) + verticesToRemove.remove(accessibleFromRefSource); + } + + removeAllVertices(verticesToRemove); + } + protected void pruneGraph( final int pruneFactor ) { final List edgesToRemove = new ArrayList(); for( final BaseEdge e : edgeSet() ) { @@ -525,7 +562,52 @@ public class BaseGraph extends DefaultDirectedGraph edges = getAllEdges(source, target); + return getSingletonEdge(getAllEdges(source, target)); + } + + /** + * Get the incoming edge of v. Requires that there be only one such edge or throws an error + * @param v our vertex + * @return the single incoming edge to v, or null if none exists + */ + public BaseEdge incomingEdgeOf(final T v) { + return getSingletonEdge(incomingEdgesOf(v)); + } + + /** + * Get the outgoing edge of v. Requires that there be only one such edge or throws an error + * @param v our vertex + * @return the single outgoing edge from v, or null if none exists + */ + public BaseEdge outgoingEdgeOf(final T v) { + return getSingletonEdge(outgoingEdgesOf(v)); + } + + /** + * Helper function that gets the a single edge from edges, null if edges is empty, or + * throws an error is edges has more than 1 element + * @param edges a set of edges + * @return a edge + */ + @Requires("edges != null") + private BaseEdge getSingletonEdge(final Collection edges) { + if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges); return edges.isEmpty() ? 
null : edges.iterator().next(); } + + /** + * Add edge between source -> target if none exists, or add e to an already existing one if present + * + * @param source source vertex + * @param target vertex + * @param e edge to add + */ + public void addOrUpdateEdge(final T source, final T target, final BaseEdge e) { + final BaseEdge prev = getEdge(source, target); + if ( prev != null ) { + prev.add(e); + } else { + addEdge(source, target, e); + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java index 93bd4f5c5..a6436f0a3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertex.java @@ -69,10 +69,21 @@ public class BaseVertex { */ public BaseVertex(final byte[] sequence) { if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); - if ( sequence.length == 0 ) throw new IllegalArgumentException("Sequence cannot be empty"); this.sequence = sequence; } + /** + * Does this vertex have an empty sequence? + * + * That is, is it a dummy node that's only present for structural reasons but doesn't actually + * contribute to the sequence of the graph? 
+ * + * @return true if sequence is empty, false otherwise + */ + public boolean isEmpty() { + return length() == 0; + } + /** * Get the length of this sequence * @return a positive integer >= 1 diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 7cf4cc8d3..6aec9c7a5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -173,7 +173,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); graph = graph.errorCorrect(); if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR); - graph.cleanNonRefPaths(); final SeqGraph seqGraph = toSeqGraph(graph); @@ -199,6 +198,14 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); seqGraph.simplifyGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR); + + // if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef + // might blow up because there's no reference source node + if ( seqGraph.vertexSet().size() == 1 ) + return seqGraph; + seqGraph.removePathsNotConnectedToRef(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), PRUNE_FACTOR); + return seqGraph; } @@ -274,6 +281,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } + graph.cleanNonRefPaths(); return graph; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java index 7546155a6..4adfe6612 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java @@ -391,4 +391,24 @@ class Path { cigar = initialCigar; } } + + /** + * Tests that this and other have the same score and vertices in the same order with the same seq + * @param other the other path to consider. Cannot be null + * @return true if this and path are equal, false otherwise + */ + public boolean equalScoreAndSequence(final Path other) { + if ( other == null ) throw new IllegalArgumentException("other cannot be null"); + + if ( getScore() != other.getScore() ) + return false; + final List mine = getVertices(); + final List yours = other.getVertices(); + if ( mine.size() == yours.size() ) { // hehehe + for ( int i = 0; i < mine.size(); i++ ) + if ( ! mine.get(i).seqEquals(yours.get(i)) ) + return false; + } + return true; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java index b855390c6..da24a06a4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java @@ -46,13 +46,9 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; -import org.apache.commons.lang.StringUtils; -import java.io.File; -import java.util.*; +import java.util.Set; /** * A graph that contains base sequence at each node @@ -61,6 +57,8 @@ import java.util.*; * @since 03/2013 */ public class SeqGraph extends BaseGraph { + private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5; + /** * 
Construct an empty SeqGraph */ @@ -86,18 +84,38 @@ public class SeqGraph extends BaseGraph { * in any way the sequences implied by a complex enumeration of all paths through the graph. */ public void simplifyGraph() { + simplifyGraph(Integer.MAX_VALUE); + } + + protected void simplifyGraph(final int maxCycles) { + boolean didSomeWork; + int i = 0; + + // start off with one round of zipping of chains for performance reasons zipLinearChains(); - mergeBranchingNodes(); - zipLinearChains(); + do { + //logger.info("simplifyGraph iteration " + i); + // iterate until we haven't don't anything useful + didSomeWork = false; + //printGraph(new File("simplifyGraph." + i + ".dot"), 0); + didSomeWork |= new MergeDiamonds().transformUntilComplete(); + didSomeWork |= new MergeTails().transformUntilComplete(); + didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); + didSomeWork |= zipLinearChains(); + i++; + } while (didSomeWork && i < maxCycles); } /** * Zip up all of the simple linear chains present in this graph. */ - protected void zipLinearChains() { + protected boolean zipLinearChains() { + boolean foundOne = false; while( zipOneLinearChain() ) { // just keep going until zipOneLinearChain says its done + foundOne = true; } + return foundOne; } /** @@ -147,193 +165,168 @@ public class SeqGraph extends BaseGraph { } /** - * Perform as many branch simplifications and merging operations as possible on this graph, - * modifying it in place. + * Base class for transformation operations that need to iterate over proposed vertices, where + * each proposed vertex is a seed vertex for a potential transformation. + * + * transformUntilComplete will iteratively apply the tryToTransform function on each vertex in the graph + * until no vertex can be found that can be transformed. + * + * Note that in order to eventually terminate tryToTransform must transform the graph such that eventually + * no vertices are candidates for further transformations. 
*/ - protected void mergeBranchingNodes() { - boolean foundNodesToMerge = true; - while( foundNodesToMerge ) { - foundNodesToMerge = false; + private abstract class VertexBasedTransformer { + /** + * For testing purposes we sometimes want to test that can be transformed capabilities are working + * without actually modifying the graph */ + private boolean dontModifyGraphEvenIfPossible = false; - for( final SeqVertex v : vertexSet() ) { - foundNodesToMerge = simplifyDiamondIfPossible(v); - if ( foundNodesToMerge ) - break; - } - } - } + public boolean dontModifyGraphEvenIfPossible() { return dontModifyGraphEvenIfPossible; } + public void setDontModifyGraphEvenIfPossible() { this.dontModifyGraphEvenIfPossible = true; } - /** - * A simple structure that looks like: - * - * v - * / | \ \ - * m1 m2 m3 ... mn - * \ | / / - * b - * - * Only returns true if all outgoing edges of v go to vertices that all only connect to - * a single bottom node, and that all middle nodes have only the single edge - * - * @param v the vertex to test if its the top of a diamond pattern - * @return true if v is the root of a diamond - */ - protected boolean isRootOfDiamond(final SeqVertex v) { - final Set ve = outgoingEdgesOf(v); - if ( ve.size() <= 1 ) - return false; + /** + * Merge until the graph has no vertices that are candidates for merging + */ + public boolean transformUntilComplete() { + boolean didAtLeastOneTranform = false; + boolean foundNodesToMerge = true; + while( foundNodesToMerge ) { + foundNodesToMerge = false; - SeqVertex bottom = null; - for ( final BaseEdge e : ve ) { - final SeqVertex mi = getEdgeTarget(e); - - // all nodes must have at least 1 connection - if ( outDegreeOf(mi) < 1 ) - return false; - - // can only have 1 incoming node, the root vertex - if ( inDegreeOf(mi) != 1 ) - return false; - - // make sure that all outgoing vertices of mi go only to the bottom node - for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { - if ( bottom == null ) - bottom = mt; - 
else if ( ! bottom.equals(mt) ) - return false; - } - } - - // bottom has some connections coming in from other nodes, don't allow - if ( inDegreeOf(bottom) != ve.size() ) - return false; - - return true; - } - - /** - * Return the longest suffix of bases shared among all provided vertices - * - * For example, if the vertices have sequences AC, CC, and ATC, this would return - * a single C. However, for ACC and TCC this would return CC. And for AC and TG this - * would return null; - * - * @param middleVertices a non-empty set of vertices - * @return - */ - @Requires("!middleVertices.isEmpty()") - private byte[] commonSuffixOfEdgeTargets(final Set middleVertices) { - final String[] kmers = new String[middleVertices.size()]; - - int i = 0; - for ( final SeqVertex v : middleVertices ) { - kmers[i++] = (StringUtils.reverse(v.getSequenceString())); - } - - final String commonPrefix = StringUtils.getCommonPrefix(kmers); - return commonPrefix.equals("") ? null : StringUtils.reverse(commonPrefix).getBytes(); - } - - /** - * Get the node that is the bottom of a diamond configuration in the graph starting at top - * - * @param top - * @return - */ - @Requires("top != null") - @Ensures({"result != null"}) - private SeqVertex getDiamondBottom(final SeqVertex top) { - final BaseEdge topEdge = outgoingEdgesOf(top).iterator().next(); - final SeqVertex middle = getEdgeTarget(topEdge); - final BaseEdge middleEdge = outgoingEdgesOf(middle).iterator().next(); - return getEdgeTarget(middleEdge); - } - - /** - * Get the set of vertices that are in the middle of a diamond starting at top - * @param top - * @return - */ - @Requires("top != null") - @Ensures({"result != null", "!result.isEmpty()"}) - final Set getMiddleVertices(final SeqVertex top) { - final Set middles = new HashSet(); - for ( final BaseEdge topToMiddle : outgoingEdgesOf(top) ) { - middles.add(getEdgeTarget(topToMiddle)); - } - return middles; - } - - /** - * Simply a diamond configuration in the current graph starting 
at top, if possible - * - * If top is actually the top of a diamond that can be simplified (i.e., doesn't have any - * random edges or other structure that would cause problems with the transformation), then this code - * performs the following transformation on this graph (modifying it): - * - * A -> M1 -> B, A -> M2 -> B, A -> Mn -> B - * - * becomes - * - * A -> M1' -> B', A -> M2' -> B', A -> Mn' -> B' - * - * where B' is composed of the longest common suffix of all Mi nodes + B, and Mi' are each - * middle vertex without their shared suffix. - * - * @param top a proposed vertex in this graph that might start a diamond (but doesn't have to) - * @return true top actually starts a diamond and it could be simplified - */ - private boolean simplifyDiamondIfPossible(final SeqVertex top) { - if ( ! isRootOfDiamond(top) ) - return false; - - final SeqVertex diamondBottom = getDiamondBottom(top); - final Set middleVertices = getMiddleVertices(top); - final List verticesToRemove = new LinkedList(); - final List edgesToRemove = new LinkedList(); - - // all of the edges point to the same sink, so it's time to merge - final byte[] commonSuffix = commonSuffixOfEdgeTargets(middleVertices); - if ( commonSuffix != null ) { - final BaseEdge botToNewBottom = new BaseEdge(false, 0); - final BaseEdge elimMiddleNodeEdge = new BaseEdge(false, 0); - final SeqVertex newBottomV = new SeqVertex(commonSuffix); - addVertex(newBottomV); - - for ( final SeqVertex middle : middleVertices ) { - final SeqVertex withoutSuffix = middle.withoutSuffix(commonSuffix); - final BaseEdge topToMiddleEdge = getEdge(top, middle); - final BaseEdge middleToBottomE = getEdge(middle, diamondBottom); - - // clip out the two edges, since we'll be replacing them later - edgesToRemove.add(topToMiddleEdge); - edgesToRemove.add(middleToBottomE); - - if ( withoutSuffix != null ) { // this node is a deletion - addVertex(withoutSuffix); - // update edge from top -> middle to be top -> without suffix - addEdge(top, 
withoutSuffix, new BaseEdge(topToMiddleEdge)); - addEdge(withoutSuffix, newBottomV, new BaseEdge(middleToBottomE)); - } else { - // this middle node is == the common suffix, wo we're removing the edge - elimMiddleNodeEdge.add(topToMiddleEdge); + for( final SeqVertex v : vertexSet() ) { + foundNodesToMerge = tryToTransform(v); + if ( foundNodesToMerge ) { + didAtLeastOneTranform = true; + break; + } } - // include the ref and multi of mid -> bot in our edge from new bot -> bot - botToNewBottom.add(middleToBottomE); - verticesToRemove.add(middle); } - // add an edge from top to new bottom, because some middle nodes were removed - if ( elimMiddleNodeEdge.getMultiplicity() > 0 ) - addEdge(top, newBottomV, elimMiddleNodeEdge); + return didAtLeastOneTranform; + } - addEdge(newBottomV, diamondBottom, botToNewBottom); + /** + * Merge, if possible, seeded on the vertex v + * @param v the proposed seed vertex to merge + * @return true if some useful merging happened, false otherwise + */ + abstract boolean tryToTransform(final SeqVertex v); + } - removeAllEdges(edgesToRemove); - removeAllVertices(verticesToRemove); - return true; - } else { - return false; + /** + * Merge diamond configurations: + * + * Performance the transformation: + * + * { A -> x + S_i + y -> Z } + * + * goes to: + * + * { A -> x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. 
+ */ + protected class MergeDiamonds extends VertexBasedTransformer { + @Override + protected boolean tryToTransform(final SeqVertex top) { + final Set middles = outgoingVerticesOf(top); + if ( middles.size() <= 1 ) + // we can only merge if there's at least two middle nodes + return false; + + SeqVertex bottom = null; + for ( final SeqVertex mi : middles ) { + // all nodes must have at least 1 connection + if ( outDegreeOf(mi) < 1 ) + return false; + + // can only have 1 incoming node, the root vertex + if ( inDegreeOf(mi) != 1 ) + return false; + + // make sure that all outgoing vertices of mi go only to the bottom node + for ( final SeqVertex mt : outgoingVerticesOf(mi) ) { + if ( bottom == null ) + bottom = mt; + else if ( ! bottom.equals(mt) ) + return false; + } + } + + // bottom has some connections coming in from other nodes, don't allow + if ( inDegreeOf(bottom) != middles.size() ) + return false; + + if ( dontModifyGraphEvenIfPossible() ) return true; + + // actually do the merging, returning true if at least 1 base was successfully split + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, middles); + return splitter.splitAndUpdate(top, bottom, 1); + } + } + + /** + * Merge tail configurations: + * + * Performs the transformation: + * + * { A -> x + S_i + y } + * + * goes to: + * + * { A -> x -> S_i -> y } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no bottom node is required + */ + protected class MergeTails extends VertexBasedTransformer { + @Override + protected boolean tryToTransform(final SeqVertex top) { + final Set tails = outgoingVerticesOf(top); + if ( tails.size() <= 1 ) + return false; + + for ( final SeqVertex t : tails ) + if ( ! 
isSink(t) || inDegreeOf(t) > 1 ) + return false; + + if ( dontModifyGraphEvenIfPossible() ) return true; + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, tails); + return splitter.splitAndUpdate(top, null, MIN_SUFFIX_TO_MERGE_TAILS); + } + } + + /** + * Merge headless configurations: + * + * Performs the transformation: + * + * { x + S_i + y -> Z } + * + * goes to: + * + * { x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no top node is required + */ + protected class MergeHeadlessIncomingSources extends VertexBasedTransformer { + @Override + boolean tryToTransform(final SeqVertex bottom) { + final Set incoming = incomingVerticesOf(bottom); + if ( incoming.size() <= 1 ) + return false; + + for ( final SeqVertex inc : incoming ) + if ( ! isSource(inc) || outDegreeOf(inc) > 1 ) + return false; + + if ( dontModifyGraphEvenIfPossible() ) return true; + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming); + return splitter.splitAndUpdate(null, bottom, 1); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java index b45ac0c34..523312dcf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java @@ -150,4 +150,19 @@ public class SeqVertex extends BaseVertex { final int prefixSize = sequence.length - suffix.length; return prefixSize > 0 ? 
new SeqVertex(Arrays.copyOf(sequence, prefixSize)) : null; } + + /** + * Return a new SeqVertex derived from this one but not including prefix or suffix bases + * + * @param prefix the previx bases to remove + * @param suffix the suffix bases to remove from this vertex + * @return a newly allocated SeqVertex + */ + @Requires("Utils.endsWith(sequence, suffix)") + public SeqVertex withoutPrefixAndSuffix(final byte[] prefix, final byte[] suffix) { + final int start = prefix.length; + final int length = sequence.length - suffix.length - prefix.length; + final int stop = start + length; + return length > 0 ? new SeqVertex(Arrays.copyOfRange(sequence, start, stop)) : null; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java new file mode 100644 index 000000000..e0501da52 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java @@ -0,0 +1,341 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.*; + +/** + * Split a collection of middle nodes in a graph into their shared prefix and suffix values + * + * This code performs the following transformation. Suppose I have a set of vertices V, such + * that each vertex is composed of sequence such that + * + * Vi = prefix + seq_i + suffix + * + * where prefix and suffix are shared sequences across all vertices V + * + * This algorithm creates a new SeqGraph with the following configuration + * + * prefix -> has outgoing edges to all seq_i + * suffix -> has incoming edges for all seq_i + * + * There are a few special cases that must be handled. First, Vi could be simply + * == to the prefix or the suffix. These generate direct connections between + * the prefix and suffix nodes, and they are handled internally by the algorithm. + * + * Note that for convenience, we will always create newTop and newBottom nodes, but + * these may be empty node (i.e., they contain no sequence). That allows them to be + * trivially merged, if desired, when the graph is incorporated into an overall + * graph. + * + * The product of this operation is a SeqGraph that contains the split. 
There's a + * function to merge reconnect this graph into the graph that contains the middle nodes + * + * The process guarentees a few things about the output: + * + * -- Preserves the paths and weights among all vertices + * + * It produces a graph that has some unusual properties + * + * -- May add nodes with no sequence (isEmpty() == true) to preserve connectivity among the graph + * -- May introduce edges with no multiplicity to preserve paths through the graph + * + * The overall workflow of using this class is simple: + * + * find vertices V in graph that you want to split out + * s = new SharedVertexSequenceSplitter(graph, V) + * s.updateGraph(graph) + * + * to update the graph with the modifications created by this splitter + * + * User: depristo + * Date: 3/22/13 + * Time: 8:31 AM + */ +public class SharedVertexSequenceSplitter { + final private SeqGraph outer; + final protected SeqVertex prefixV, suffixV; + final protected Collection toSplits; + + // updated in split routine + protected SeqGraph splitGraph = null; + protected Collection newMiddles = null; + protected List edgesToRemove = null; + + /** + * Create a new graph that contains the vertices in toSplitsArg with their shared suffix and prefix + * sequences extracted out. + * + * @param graph the graph containing the vertices in toSplitsArg + * @param toSplitsArg a collection of vertices to split. Must be contained within graph, and have only connections + * from a single shared top and/or bottom node + */ + public SharedVertexSequenceSplitter(final SeqGraph graph, final Collection toSplitsArg) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( toSplitsArg == null ) throw new IllegalArgumentException("toSplitsArg cannot be null"); + if ( toSplitsArg.size() < 2 ) throw new IllegalArgumentException("Can only split at least 2 vertices but only got " + toSplitsArg); + if ( ! 
graph.vertexSet().containsAll(toSplitsArg) ) throw new IllegalArgumentException("graph doesn't contain all of the vertices to split"); + + this.outer = graph; + this.toSplits = toSplitsArg; + + // all of the edges point to the same sink, so it's time to merge + final Pair prefixAndSuffix = commonPrefixAndSuffixOfVertices(toSplits); + prefixV = prefixAndSuffix.getFirst(); + suffixV = prefixAndSuffix.getSecond(); + } + + /** + * Simple single-function interface to split and then update a graph + * + * @see #updateGraph(SeqVertex, SeqVertex) for a full description of top and bottom + * + * @param top the top vertex, may be null + * @param bottom the bottom vertex, may be null + * @param minCommonSequence the minimum prefix or suffix size necessary among the vertices to split up + * before we'll go ahead and actually do the splitting. Allows one to determine + * whether there's actually any useful splitting to do, as well as protect + * yourself against spurious splitting of nodes based on trivial amounts of overall + * @return true if some useful splitting was done, false otherwise + */ + public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom, final int minCommonSequence) { + if ( prefixV.length() < minCommonSequence && suffixV.length() < minCommonSequence ) + return false; + split(); + updateGraph(top, bottom); + return true; + } + + /** + * Actually do the splitting up of the vertices + * + * Must be called before calling updateGraph + */ + public void split() { + splitGraph = new SeqGraph(); + newMiddles = new LinkedList(); + edgesToRemove = new LinkedList(); + + splitGraph.addVertices(prefixV, suffixV); + + for ( final SeqVertex mid : toSplits ) { + final BaseEdge toMid = processEdgeToRemove(mid, outer.incomingEdgeOf(mid)); + final BaseEdge fromMid = processEdgeToRemove(mid, outer.outgoingEdgeOf(mid)); + + final SeqVertex remaining = mid.withoutPrefixAndSuffix(prefixV.getSequence(), suffixV.getSequence()); + if ( remaining != null ) { + // 
there's some sequence prefix + seq + suffix, so add the node and make edges + splitGraph.addVertex(remaining); + newMiddles.add(remaining); + // update edge from top -> middle to be top -> without suffix + splitGraph.addEdge(prefixV, remaining, toMid); + splitGraph.addEdge(remaining, suffixV, fromMid); + } else { + // prefix + suffix completely explain this node + splitGraph.addOrUpdateEdge(prefixV, suffixV, new BaseEdge(toMid).add(fromMid)); + } + } + } + + /** + * Update graph outer, replacing the previous middle vertices that were split out with the new + * graph structure of the split, linking this subgraph into the graph at top and bot (the + * vertex connecting the middle nodes and the vertex outgoing of all middle node) + * + * @param top an optional top node that must have outgoing edges to all split vertices. If null, this subgraph + * will be added without any incoming edges + * @param bot an optional bottom node that must have incoming edges to all split vertices. If null, this subgraph + * will be added without any outgoing edges to the rest of the graph + */ + public void updateGraph(final SeqVertex top, final SeqVertex bot) { + if ( ! outer.vertexSet().containsAll(toSplits) ) throw new IllegalArgumentException("graph doesn't contain all of the original vertices to split"); + if ( top == null && bot == null ) throw new IllegalArgumentException("Cannot update graph without at least one top or bot vertex, but both were null"); + if ( top != null && ! outer.containsVertex(top) ) throw new IllegalArgumentException("top " + top + " not found in graph " + outer); + if ( bot != null && ! 
outer.containsVertex(bot) ) throw new IllegalArgumentException("bot " + bot + " not found in graph " + outer); + if ( splitGraph == null ) throw new IllegalStateException("Cannot call updateGraph until split() has been called"); + + outer.removeAllVertices(toSplits); + outer.removeAllEdges(edgesToRemove); + + outer.addVertices(newMiddles); + + final boolean hasPrefixSuffixEdge = splitGraph.getEdge(prefixV, suffixV) != null; + final boolean hasOnlyPrefixSuffixEdges = hasPrefixSuffixEdge && splitGraph.outDegreeOf(prefixV) == 1; + final boolean needPrefixNode = ! prefixV.isEmpty() || (top == null && ! hasOnlyPrefixSuffixEdges); + final boolean needSuffixNode = ! suffixV.isEmpty() || (bot == null && ! hasOnlyPrefixSuffixEdges); + + // if prefix / suffix are needed, keep them + final SeqVertex topForConnect = needPrefixNode ? prefixV : top; + final SeqVertex botForConnect = needSuffixNode ? suffixV : bot; + + if ( needPrefixNode ) { + outer.addVertex(prefixV); + if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0)); + } + + if ( needSuffixNode ) { + outer.addVertex(suffixV); + if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0)); + } + + if ( topForConnect != null ) { + for ( final BaseEdge e : splitGraph.outgoingEdgesOf(prefixV) ) { + final SeqVertex target = splitGraph.getEdgeTarget(e); + + if ( target == suffixV ) { // going straight from prefix -> suffix + if ( botForConnect != null ) + outer.addEdge(topForConnect, botForConnect, e); + } else { + outer.addEdge(topForConnect, target, e); + } + } + } + + if ( botForConnect != null ) { + for ( final BaseEdge e : splitGraph.incomingEdgesOf(suffixV) ) { + outer.addEdge(splitGraph.getEdgeSource(e), botForConnect, e); + } + } + } + + /** + * Return the longest suffix of bases shared among all provided vertices + * + * For example, if the vertices have sequences AC, CC, and ATC, this would return + * a single C. 
However, for ACC and TCC this would return CC. And for AC and TG this + * would return null; + * + * @param middleVertices a non-empty set of vertices + * @return + */ + @Requires("!middleVertices.isEmpty()") + protected static Pair commonPrefixAndSuffixOfVertices(final Collection middleVertices) { + final List kmers = new ArrayList(middleVertices.size()); + + int min = Integer.MAX_VALUE; + for ( final SeqVertex v : middleVertices ) { + kmers.add(v.getSequence()); + min = Math.min(min, v.getSequence().length); + } + + final int prefixLen = compPrefixLen(kmers, min); + final int suffixLen = compSuffixLen(kmers, min - prefixLen); + + final byte[] kmer = kmers.get(0); + final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen); + final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length); + return new Pair(new SeqVertex(prefix), new SeqVertex(suffix)); + } + + /** + * Compute the maximum shared prefix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the start of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compPrefixLen(final List listOfBytes, final int minLength) { + for ( int i = 0; i < minLength; i++ ) { + final byte b = listOfBytes.get(0)[i]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[i] ) + return i; + } + } + + return minLength; + } + + /** + * Compute the maximum shared suffix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. 
length among all byte[] in listOfBytes + * @return the number of shared bytes common at the end of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compSuffixLen(final List listOfBytes, final int minLength) { + for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) { + final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] ) + return suffixLen; + } + } + return minLength; + } + + /** + * Helper function that returns an edge that we should use for splitting + * + * If e is null, creates a new 0 multiplicity edge, set to ref is any edges to V are ref + * If e is not null, returns a new copy of e, and schedules e for removal + * + * @param e a non-null edge + * @return a non-null edge + */ + @Requires("v != null") + @Ensures("result != null") + private BaseEdge processEdgeToRemove(final SeqVertex v, final BaseEdge e) { + if ( e == null ) { + // there's no edge, so we return a newly allocated one and don't schedule e for removal + // the weight must be 0 to preserve sum through the diamond + return new BaseEdge(outer.isReferenceNode(v), 0); + } else { + // schedule edge for removal, and return a freshly allocated one for our graph to use + edgesToRemove.add(e); + return new BaseEdge(e); + } + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java index cd27c7183..8f682d474 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java @@ -68,9 +68,10 @@ public class BaseVertexUnitTest extends BaseTest { new 
BaseVertex((byte[])null); } - @Test(expectedExceptions = IllegalArgumentException.class) + @Test() public void testCreationEmptySeq() { - new BaseVertex(new byte[0]); + final BaseVertex v = new BaseVertex(new byte[0]); + Assert.assertTrue(v.isEmpty(), "Version with length == 0 should be empty"); } @Test diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java index 83a4f4c50..6b6826e45 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java @@ -51,7 +51,6 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -181,7 +180,9 @@ public class SeqGraphUnitTest extends BaseTest { @Test(dataProvider = "IsDiamondData", enabled = true) public void testIsDiamond(final SeqGraph graph, final SeqVertex v, final boolean isRootOfDiamond) { - Assert.assertEquals(graph.isRootOfDiamond(v), isRootOfDiamond); + final SeqGraph.MergeDiamonds merger = graph.new MergeDiamonds(); + merger.setDontModifyGraphEvenIfPossible(); + Assert.assertEquals(merger.tryToTransform(v), isRootOfDiamond); } @DataProvider(name = "MergingData") @@ -267,7 +268,7 @@ public class SeqGraphUnitTest extends BaseTest { tests.add(new Object[]{graph.clone(), expected.clone()}); } - { + { // all the nodes -> lots of merging and motion of nodes final SeqGraph all = new SeqGraph(); all.addVertices(pre1, pre2, top, middle1, middle2, bottom, tail1, tail2); all.addEdges(pre1, top, middle1, bottom, tail1); @@ -277,9 +278,13 @@ public class SeqGraphUnitTest extends BaseTest { final SeqVertex newMiddle1 = new SeqVertex("G"); final SeqVertex newMiddle2 = new SeqVertex("T"); 
final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString()); - expected.addVertices(pre1, pre2, top, newMiddle1, newMiddle2, newBottom, tail1, tail2); - expected.addEdges(pre1, top, newMiddle1, newBottom, tail1); - expected.addEdges(pre2, top, newMiddle2, newBottom, tail2); + final SeqVertex newTop = new SeqVertex("A"); + final SeqVertex newTopDown1 = new SeqVertex("G"); + final SeqVertex newTopDown2 = new SeqVertex("C"); + final SeqVertex newTopBottomMerged = new SeqVertex("TA"); + expected.addVertices(newTop, newTopDown1, newTopDown2, newTopBottomMerged, newMiddle1, newMiddle2, newBottom, tail1, tail2); + expected.addEdges(newTop, newTopDown1, newTopBottomMerged, newMiddle1, newBottom, tail1); + expected.addEdges(newTop, newTopDown2, newTopBottomMerged, newMiddle2, newBottom, tail2); tests.add(new Object[]{all.clone(), expected.clone()}); } @@ -309,7 +314,12 @@ public class SeqGraphUnitTest extends BaseTest { @Test(dataProvider = "MergingData", enabled = true) public void testMerging(final SeqGraph graph, final SeqGraph expected) { final SeqGraph merged = (SeqGraph)graph.clone(); - merged.simplifyGraph(); + merged.simplifyGraph(1); +// if ( ! 
SeqGraph.graphEquals(merged, expected) ) { +// graph.printGraph(new File("graph.dot"), 0); +// merged.printGraph(new File("merged.dot"), 0); +// expected.printGraph(new File("expected.dot"), 0); +// } Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); } @@ -332,13 +342,9 @@ public class SeqGraphUnitTest extends BaseTest { graph.addEdge(mid1, bot, new BaseEdge(true, 1)); final SeqGraph expected = new SeqGraph(); - expected.addVertices(top, mid1, bot); - expected.addEdge(top, mid1, new BaseEdge(true, 2)); - expected.addEdge(mid1, bot, new BaseEdge(true, 2)); - + expected.addVertex(new SeqVertex("AACTC")); final SeqGraph actual = ((SeqGraph)graph.clone()); - actual.mergeBranchingNodes(); - - Assert.assertTrue(BaseGraph.graphEquals(actual, expected)); + actual.simplifyGraph(); + Assert.assertTrue(BaseGraph.graphEquals(actual, expected), "Wrong merging result after complete merging"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java new file mode 100644 index 000000000..52ab36064 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java @@ -0,0 +1,253 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class SharedVertexSequenceSplitterUnitTest extends BaseTest { + private final static boolean PRINT_GRAPHS = false; + + @DataProvider(name = "PrefixSuffixData") + public Object[][] makePrefixSuffixData() { + List tests = new ArrayList(); + + tests.add(new Object[]{Arrays.asList("A", "C"), 0, 0}); + tests.add(new Object[]{Arrays.asList("C", "C"), 1, 0}); + tests.add(new Object[]{Arrays.asList("ACT", "AGT"), 1, 1}); + tests.add(new Object[]{Arrays.asList("ACCT", "AGT"), 1, 1}); + tests.add(new Object[]{Arrays.asList("ACT", "ACT"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACT"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACTG"), 3, 0}); + tests.add(new Object[]{Arrays.asList("ACTA", "ACTGA"), 3, 1}); + tests.add(new Object[]{Arrays.asList("GCTGA", "ACTGA"), 0, 4}); + + tests.add(new Object[]{Arrays.asList("A", "C", "A"), 0, 0}); + tests.add(new Object[]{Arrays.asList("A", "A", "A"), 
1, 0}); + tests.add(new Object[]{Arrays.asList("A", "AA", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("A", "ACA", "A"), 1, 0}); + tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACT"), 2, 1}); + tests.add(new Object[]{Arrays.asList("ACT", "ACAT", "ACGT"), 2, 1}); + tests.add(new Object[]{Arrays.asList("AAAT", "AAA", "CAAA"), 0, 0}); + tests.add(new Object[]{Arrays.asList("AACTTT", "AAGTTT", "AAGCTTT"), 2, 3}); + tests.add(new Object[]{Arrays.asList("AAA", "AAA", "CAAA"), 0, 3}); + tests.add(new Object[]{Arrays.asList("AAA", "AAA", "AAA"), 3, 0}); + + tests.add(new Object[]{Arrays.asList("AC", "ACA", "AC"), 2, 0}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testPrefixSuffix(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final List bytes = new ArrayList(); + int min = Integer.MAX_VALUE; + for ( final String s : strings ) { + bytes.add(s.getBytes()); + min = Math.min(min, s.length()); + } + + final int actualPrefixLen = SharedVertexSequenceSplitter.compPrefixLen(bytes, min); + Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test"); + + final int actualSuffixLen = SharedVertexSequenceSplitter.compSuffixLen(bytes, min - actualPrefixLen); + Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test"); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testPrefixSuffixVertices(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final List v = new ArrayList(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + + final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); + final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); + + final Pair result = SharedVertexSequenceSplitter.commonPrefixAndSuffixOfVertices(v); + Assert.assertEquals(result.getFirst().getSequenceString(), expectedPrefix, "Failed suffix test"); + 
Assert.assertEquals(result.getSecond().getSequenceString(), expectedSuffix, "Failed suffix test"); + + Assert.assertEquals(result.getFirst().isEmpty(), expectedPrefix.isEmpty()); + Assert.assertEquals(result.getSecond().isEmpty(), expectedSuffix.isEmpty()); + } + + @Test(dataProvider = "PrefixSuffixData") + public void testSplitter(final List strings, int expectedPrefixLen, int expectedSuffixLen) { + final SeqGraph graph = new SeqGraph(); + + final List v = new ArrayList(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + + graph.addVertices(v.toArray(new SeqVertex[]{})); + + final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); + final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + splitter.split(); +// splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".dot"), 0); + + Assert.assertEquals(splitter.prefixV.getSequenceString(), expectedPrefix); + Assert.assertEquals(splitter.suffixV.getSequenceString(), expectedSuffix); + + Assert.assertTrue(splitter.splitGraph.outDegreeOf(splitter.prefixV) <= strings.size()); + Assert.assertEquals(splitter.splitGraph.inDegreeOf(splitter.prefixV), 0); + + Assert.assertTrue(splitter.splitGraph.inDegreeOf(splitter.suffixV) <= strings.size()); + Assert.assertEquals(splitter.splitGraph.outDegreeOf(splitter.suffixV), 0); + + for ( final SeqVertex mid : splitter.newMiddles ) { + Assert.assertNotNull(splitter.splitGraph.getEdge(splitter.prefixV, mid)); + Assert.assertNotNull(splitter.splitGraph.getEdge(mid, splitter.suffixV)); + } + } + + @DataProvider(name = "CompleteCycleData") + public Object[][] makeCompleteCycleData() { + List tests = new ArrayList(); + + for ( final boolean hasTop : Arrays.asList(true, false) ) { + for ( final boolean hasBot : Arrays.asList(true, false) ) { + if ( ! hasTop && ! 
hasBot ) continue; + tests.add(new Object[]{Arrays.asList("A", "A"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "AC"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "CA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AC", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AT", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATAA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("ATAACA", "ACA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAA", "AAA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "AAA"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CCCAAAAAA", "CCCAAA"), hasTop, hasBot}); + + tests.add(new Object[]{Arrays.asList("A", "A", "A"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "A", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("A", "C", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("AC", "C", "C"), hasTop, hasBot}); + tests.add(new Object[]{Arrays.asList("CA", "C", "C"), hasTop, hasBot}); + // all merged + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGA"), hasTop, hasBot}); + // prefix and suffix + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "ACA"), hasTop, hasBot}); + // 2 -> prefix, leave C + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGAC"), hasTop, hasBot}); + // 2 -> prefix, leave CCC + tests.add(new Object[]{Arrays.asList("AGA", "AGA", "AGACCC"), hasTop, hasBot}); + // 2 -> suffix, leave A/T + tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AAGA"), hasTop, hasBot}); + // 2 -> suffix, leave T, delete 1 + tests.add(new Object[]{Arrays.asList("TAGA", "TAGA", "AGA"), hasTop, hasBot}); + } + } + + return 
tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CompleteCycleData") + public void testSplitterCompleteCycle(final List strings, final boolean hasTop, final boolean hasBot) { + final SeqGraph graph = new SeqGraph(); + + int edgeWeight = 1; + final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null; + final SeqVertex bot = hasBot ? new SeqVertex("GGGGGGGG") : null; + final List v = new ArrayList(); + for ( final String s : strings ) { + v.add(new SeqVertex(s)); + } + graph.addVertices(v.toArray(new SeqVertex[]{})); + final SeqVertex first = v.get(0); + + if ( hasTop ) { + graph.addVertex(top); + for ( final SeqVertex vi : v ) + graph.addEdge(top, vi, new BaseEdge(vi == first, edgeWeight++)); + } + + if ( hasBot ) { + graph.addVertex(bot); + for ( final SeqVertex vi : v ) + graph.addEdge(vi, bot, new BaseEdge(vi == first, edgeWeight++)); + } + + final Set haplotypes = new HashSet(); + final List> originalPaths = new KBestPaths().getKBestPaths((SeqGraph)graph.clone()); + for ( final Path path : originalPaths ) + haplotypes.add(new String(path.getBases())); + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + splitter.split(); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".original.dot"), 0); + if ( PRINT_GRAPHS ) splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".split.dot"), 0); + splitter.updateGraph(top, bot); + if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0); + + final List> splitPaths = new KBestPaths().getKBestPaths(graph); + for ( final Path path : splitPaths ) { + final String h = new String(path.getBases()); + Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); + } + + if ( splitPaths.size() == originalPaths.size() ) { + for ( int i = 0; i < originalPaths.size(); i++ ) { + Assert.assertTrue(splitPaths.get(i).equalScoreAndSequence(originalPaths.get(i)), "Paths not equal " + 
splitPaths.get(i) + " vs. original " + originalPaths.get(i)); + } + } + } +} From 593d3469d4c0198316d1809535d0c6b691605751 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 15 Mar 2013 12:20:23 -0400 Subject: [PATCH 101/226] Refactored the het (polyploid) consensus creation in ReduceReads. * It is now cleaner and easier to test; added tests for newly implemented methods. * Many fixes to the logic to make it work * The most important change was that after triggering het compression we actually need to back it out if it creates reads that incorporated too many softclips at any one position (because they get unclipped). * There was also an off-by-one error in the general code that only manifested itself with het compression. * Removed support for creating a het consensus around deletions (which was broken anyways). * Mauricio gave his blessing for this. * Het compression now works only against known sites (with -known argument). * The user can pass in one or more VCFs with known SNPs (other variants are ignored). * If no known SNPs are provided het compression will automatically be disabled. * Added SAM tag to stranded (i.e. het compressed) reduced reads to distinguish their strandedness from normal reduced reads. * GATKSAMRecord now checks for this tag when determining whether or not the read is stranded. * This allows us to update the FisherStrand annotation to count het compressed reduced reads towards the FS calculation. * [It would have been nice to mark the normal reads as unstranded but then we wouldn't be backwards compatible.] * Updated integration tests accordingly with new het compressed bams (both for RR and UG). * In the process of fixing the FS annotation I noticed that SpanningDeletions wasn't handling RR properly, so I fixed it too. * Also, the test in the UG engine for determining whether there are too many overlapping deletions is updated to handle RR. 
* I added a special hook in the RR integration tests to additionally run the systematic coverage checking tool I wrote earlier. * AssessReducedCoverage is now run against all RR integration tests to ensure coverage is not lost from original to reduced bam. * This helped uncover a huge bug in the MultiSampleCompressor where it would drop reads from all but 1 sample (now fixed). * AssessReducedCoverage moved from private to protected for packaging reasons. * #resolve GSA-639 At this point, this commit encompasses most of what is needed for het compression to go live. There are still a few TODO items that I want to get in before the 2.5 release, but I will save those for a separate branch because as it is I feel bad for the person who needs to review all these changes (sorry, Mauricio). --- .../gatk/walkers/annotator/FisherStrand.java | 21 +- .../walkers/annotator/SpanningDeletions.java | 11 +- .../reducereads/HeaderElement.java | 61 +++- .../reducereads/MultiSampleCompressor.java | 43 ++- .../compression/reducereads/ReduceReads.java | 69 +++- .../reducereads/ReduceReadsStash.java | 16 +- .../reducereads/SingleSampleCompressor.java | 42 ++- .../reducereads/SlidingWindow.java | 323 +++++++++++------- .../reducereads/SyntheticRead.java | 31 +- .../genotyper/UnifiedGenotyperEngine.java | 4 +- .../walkers/qc/AssessReducedCoverage.java | 175 ++++++++++ .../reducereads/HeaderElementUnitTest.java | 34 +- .../ReduceReadsIntegrationTest.java | 128 +++++-- .../reducereads/ReduceReadsUnitTest.java | 105 +++++- .../reducereads/SlidingWindowUnitTest.java | 156 ++++++++- .../reducereads/SyntheticReadUnitTest.java | 2 +- ...dGenotyperReducedReadsIntegrationTest.java | 6 +- .../sting/utils/sam/GATKSAMRecord.java | 8 +- .../coverage/CallableLociIntegrationTest.java | 2 +- .../utils/sam/GATKSAMRecordUnitTest.java | 1 - 20 files changed, 978 insertions(+), 260 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java diff 
--git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 7960a3ce2..eb42325e1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -313,10 +313,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat } private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { - // ignore reduced reads because they are always on the forward strand! - // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test - if ( read.isReducedRead() ) - return; final boolean matchesRef = allele.equals(ref, true); final boolean matchesAlt = allele.equals(alt, true); @@ -325,12 +321,17 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat final int row = matchesRef ? 0 : 1; if ( read.isStrandless() ) { - // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 - // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even - // if the read is only seen once, because it's a merged read or other) - final int toAdd = Math.max(representativeCount / 2, 1); - table[row][0] += toAdd; - table[row][1] += toAdd; + + // ignore strandless reduced reads because they are always on the forward strand! 
+ if ( !read.isReducedRead() ) { + + // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 + // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even + // if the read is only seen once, because it's a merged read or other) + final int toAdd = Math.max(representativeCount / 2, 1); + table[row][0] += toAdd; + table[row][1] += toAdd; + } } else { // a normal read with an actual strand final boolean isFW = !read.getReadNegativeStrandFlag(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index c3a0618ef..dd57c8ac6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -53,6 +53,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -88,10 +89,12 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn int deletions = 0; int depth = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - final ReadBackedPileup pileup = context.getBasePileup(); - deletions += pileup.getNumberOfDeletions(); - depth += pileup.getNumberOfElements(); + for ( final PileupElement p : sample.getValue().getBasePileup() ) { + final 
int actualSampleDepth = p.getRepresentativeCount(); + depth += actualSampleDepth; + if ( p.isDeletion() ) + deletions += actualSampleDepth; + } } Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 1cd9c1bc0..3532a74fb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -50,6 +50,10 @@ import it.unimi.dsi.fastutil.ints.IntArrayList; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + /** * The element that describes the header of the sliding window. @@ -264,29 +268,56 @@ public class HeaderElement { } /** - * Calculates the number of haplotypes necessary to represent this site. + * Calculates the number of alleles necessary to represent this site. * * @param minVariantProportion the minimum proportion to call a site variant. - * @return the number of alleles necessary to represent this site. + * @param allowDeletions should we allow deletions? + * @return the number of alleles necessary to represent this site or -1 if allowDeletions is false and there are a sufficient number of them */ - public int getNumberOfAlleles(final double minVariantProportion) { + public int getNumberOfAlleles(final double minVariantProportion, final boolean allowDeletions) { + final List alleles = getAlleles(minVariantProportion, allowDeletions); + return alleles == null ? -1 : alleles.size(); + } + + /** + * Calculates the alleles necessary to represent this site. 
+ * + * @param minVariantProportion the minimum proportion to call a site variant. + * @param allowDeletions should we allow deletions? + * @return the list of alleles necessary to represent this site or null if allowDeletions is false and there are a sufficient number of them + */ + public List getAlleles(final double minVariantProportion, final boolean allowDeletions) { final int totalBaseCount = consensusBaseCounts.totalCount(); - if (totalBaseCount == 0) - return 0; + if ( totalBaseCount == 0 ) + return Collections.emptyList(); - final int minBaseCountForRelevantAlleles = (int)(minVariantProportion * totalBaseCount); + final int minBaseCountForRelevantAlleles = Math.max(1, (int)(minVariantProportion * totalBaseCount)); - int nAlleles = 0; - for ( BaseIndex base : BaseIndex.values() ) { + final List alleles = new ArrayList(4); + for ( final BaseIndex base : BaseIndex.values() ) { final int baseCount = consensusBaseCounts.countOfBase(base); - // don't consider this allele if the count is 0 - if ( baseCount == 0 ) - continue; - - if ( baseCount >= minBaseCountForRelevantAlleles ) - nAlleles++; + if ( baseCount >= minBaseCountForRelevantAlleles ) { + if ( !allowDeletions && base == BaseIndex.D ) + return null; + alleles.add(base); + } } - return nAlleles; + return alleles; + } + + /* + * Checks whether there are a significant number of softclips. + * + * @param minVariantProportion the minimum proportion to consider something significant. 
+ * @return true if there are significant softclips, false otherwise + */ + public boolean hasSignificantSoftclips(final double minVariantProportion) { + final int totalBaseCount = consensusBaseCounts.totalCount(); + if ( totalBaseCount == 0 ) + return false; + + final int minBaseCountForSignificance = Math.max(1, (int)(minVariantProportion * totalBaseCount)); + return nSoftClippedBases >= minBaseCountForSignificance; } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 2f377bee8..42873964d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -46,9 +46,11 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import com.google.java.contract.Ensures; import it.unimi.dsi.fastutil.objects.*; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -97,43 +99,62 @@ public class MultiSampleCompressor { final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy, - final boolean allowPolyploidReduction) { + final ReduceReads.DownsampleStrategy downsampleStrategy) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, 
minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, allowPolyploidReduction)); + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } } - public ObjectSet addAlignment(GATKSAMRecord read) { + /** + * Add an alignment to the compressor + * + * @param read the read to be added + * @param knownSnpPositions the set of known SNP positions + * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) + */ + public ObjectSet addAlignment(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) { String sampleName = read.getReadGroup().getSample(); SingleSampleCompressor compressor = compressorsPerSample.get(sampleName); if ( compressor == null ) throw new ReviewedStingException("No compressor for sample " + sampleName); - Pair, CompressionStash> readsAndStash = compressor.addAlignment(read); + Pair, CompressionStash> readsAndStash = compressor.addAlignment(read, knownSnpPositions); ObjectSet reads = readsAndStash.getFirst(); CompressionStash regions = readsAndStash.getSecond(); - reads.addAll(closeVariantRegionsInAllSamples(regions)); + reads.addAll(closeVariantRegionsInAllSamples(regions, knownSnpPositions)); return reads; } - public ObjectSet close() { + /** + * Properly closes the compressor. 
+ * + * @param knownSnpPositions the set of known SNP positions + * @return A non-null set/list of all reads generated + */ + @Ensures("result != null") + public ObjectSet close(final ObjectSortedSet knownSnpPositions) { ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); for ( SingleSampleCompressor sample : compressorsPerSample.values() ) { - Pair, CompressionStash> readsAndStash = sample.close(); - reads = readsAndStash.getFirst(); + Pair, CompressionStash> readsAndStash = sample.close(knownSnpPositions); + reads.addAll(readsAndStash.getFirst()); } return reads; } - private ObjectSet closeVariantRegionsInAllSamples(CompressionStash regions) { + /** + * Finalizes current variant regions. + * + * @param knownSnpPositions the set of known SNP positions + * @return A non-null set/list of all reads generated + */ + private ObjectSet closeVariantRegionsInAllSamples(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); if (!regions.isEmpty()) { for (SingleSampleCompressor sample : compressorsPerSample.values()) { - reads.addAll(sample.closeVariantRegions(regions)); + reads.addAll(sample.closeVariantRegions(regions, knownSnpPositions)); } } return reads; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 62410d191..5e9429284 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -54,9 +54,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMProgramRecord; import net.sf.samtools.util.SequenceUtil; -import org.broadinstitute.sting.commandline.Argument; -import 
org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -75,6 +73,10 @@ import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.Collections; +import java.util.List; /** @@ -147,10 +149,12 @@ public class ReduceReads extends ReadWalker, Redu private byte minTailQuality = 2; /** - * Allow the experimental polyploid-based reduction capabilities of this tool + * Any number of VCF files representing known SNPs to be used for the experimental polyploid-based reduction. + * Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored. + * Note that polyploid ("het") compression will work only when a single SNP is present in a consensus window. 
*/ - @Argument(fullName = "allow_polyploid_reduction", shortName = "polyploid", doc = "", required = false) - private boolean USE_POLYPLOID_REDUCTION = false; + @Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false) + public List> known = Collections.emptyList(); /** * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals @@ -249,6 +253,8 @@ public class ReduceReads extends ReadWalker, Redu ObjectSortedSet intervalList; + final ObjectSortedSet knownSnpPositions = new ObjectAVLTreeSet(); + // IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam"; @@ -359,8 +365,22 @@ public class ReduceReads extends ReadWalker, Redu for (GATKSAMRecord mappedRead : mappedReads) System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd()); - return mappedReads; + // add the SNPs to the list of known positions + populateKnownSNPs(metaDataTracker); + return mappedReads; + } + + /* + * Add the positions of known SNPs to the set so that we can keep track of it + * + * @param metaDataTracker the ref meta data tracker + */ + protected void populateKnownSNPs(final RefMetaDataTracker metaDataTracker) { + for ( final VariantContext vc : metaDataTracker.getValues(known) ) { + if ( vc.isSNP() ) + knownSnpPositions.add(getToolkit().getGenomeLocParser().createGenomeLoc(vc)); + } } /** @@ -373,7 +393,7 @@ public class ReduceReads extends ReadWalker, Redu */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, 
minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, USE_POLYPLOID_REDUCTION)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } /** @@ -405,7 +425,7 @@ public class ReduceReads extends ReadWalker, Redu if (debugLevel == 1) System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd()); - for (GATKSAMRecord compressedRead : stash.compress(readReady)) + for (GATKSAMRecord compressedRead : stash.compress(readReady, knownSnpPositions)) outputRead(compressedRead); // We only care about maintaining the link between read pairs if they are in the same variant @@ -422,6 +442,10 @@ public class ReduceReads extends ReadWalker, Redu firstRead = false; } + // reduce memory requirements by removing old positions + if ( !mappedReads.isEmpty() ) + clearStaleKnownPositions(mappedReads.get(0)); + return stash; } @@ -434,13 +458,38 @@ public class ReduceReads extends ReadWalker, Redu public void onTraversalDone(ReduceReadsStash stash) { // output any remaining reads in the compressor - for (GATKSAMRecord read : stash.close()) + for (GATKSAMRecord read : stash.close(knownSnpPositions)) outputRead(read); if (nwayout) writerToUse.close(); } + /** + * Removes known positions that are no longer relevant for use with het compression. 
+ * + * @param read the current read, used for checking whether there are stale positions we can remove + */ + protected void clearStaleKnownPositions(final GATKSAMRecord read) { + // nothing to clear if empty + if ( knownSnpPositions.isEmpty() ) + return; + + // not ready to be cleared until we encounter a read from a different contig + final int contigIndexOfRead = read.getReferenceIndex(); + if ( knownSnpPositions.first().getContigIndex() == contigIndexOfRead ) + return; + + // because we expect most elements to be stale, it's not going to be efficient to remove them one at a time + final ObjectAVLTreeSet goodLocs = new ObjectAVLTreeSet(); + for ( final GenomeLoc loc : knownSnpPositions ) { + if ( loc.getContigIndex() == contigIndexOfRead ) + goodLocs.add(loc); + } + knownSnpPositions.clear(); + knownSnpPositions.addAll(goodLocs); + } + /** * Hard clips away all parts of the read that doesn't agree with the intervals selected. * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java index 0a446bab7..52c5f0903 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import it.unimi.dsi.fastutil.objects.ObjectSortedSet; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -106,11 +108,12 @@ public class ReduceReadsStash { /** * sends the read to the MultiSampleCompressor * - * @param read the read to be compressed + * @param read the read to be compressed + * 
@param knownSnpPositions the set of known SNP positions * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) */ - public Iterable compress(GATKSAMRecord read) { - return compressor.addAlignment(read); + public Iterable compress(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) { + return compressor.addAlignment(read, knownSnpPositions); } /** @@ -125,18 +128,19 @@ public class ReduceReadsStash { /** * Close the stash, processing all remaining reads in order * + * @param knownSnpPositions the set of known SNP positions * @return a list of all the reads produced by the SlidingWindow machinery) */ - public Iterable close() { + public Iterable close(final ObjectSortedSet knownSnpPositions) { LinkedList result = new LinkedList(); // compress all the stashed reads (in order) for (GATKSAMRecord read : outOfOrderReads) - for (GATKSAMRecord compressedRead : compressor.addAlignment(read)) + for (GATKSAMRecord compressedRead : compressor.addAlignment(read, knownSnpPositions)) result.add(compressedRead); // output any remaining reads from the compressor - for (GATKSAMRecord read : compressor.close()) + for (GATKSAMRecord read : compressor.close(knownSnpPositions)) result.add(read); return result; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 42db83c04..db1e0baaf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -46,7 +46,9 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import com.google.java.contract.Ensures; import it.unimi.dsi.fastutil.objects.*; +import 
org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -64,7 +66,6 @@ public class SingleSampleCompressor { final private double minIndelProportionToTriggerVariant; final private int minBaseQual; final private ReduceReads.DownsampleStrategy downsampleStrategy; - final private boolean allowPolyploidReduction; private SlidingWindow slidingWindow; private int slidingWindowCounter; @@ -77,8 +78,7 @@ public class SingleSampleCompressor { final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy, - final boolean allowPolyploidReduction) { + final ReduceReads.DownsampleStrategy downsampleStrategy) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; @@ -87,10 +87,16 @@ public class SingleSampleCompressor { this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; this.minBaseQual = minBaseQual; this.downsampleStrategy = downsampleStrategy; - this.allowPolyploidReduction = allowPolyploidReduction; } - public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) { + /** + * Add an alignment to the compressor + * + * @param read the read to be added + * @param knownSnpPositions the set of known SNP positions + * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) + */ + public Pair, CompressionStash> addAlignment( final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions ) { ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); CompressionStash stash = new CompressionStash(); int readOriginalStart = read.getUnclippedStart(); @@ -101,14 +107,14 @@ public class SingleSampleCompressor { 
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window // close the current sliding window - Pair, CompressionStash> readsAndStash = slidingWindow.close(); + Pair, CompressionStash> readsAndStash = slidingWindow.close(knownSnpPositions); reads = readsAndStash.getFirst(); stash = readsAndStash.getSecond(); slidingWindow = null; // so we create a new one on the next if } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), allowPolyploidReduction); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); slidingWindowCounter++; } @@ -116,12 +122,26 @@ public class SingleSampleCompressor { return new Pair, CompressionStash>(reads, stash); } - public Pair, CompressionStash> close() { - return (slidingWindow != null) ? slidingWindow.close() : emptyPair; + /** + * Properly closes the compressor. + * + * @param knownSnpPositions the set of known SNP positions + * @return A non-null set/list of all reads generated + */ + @Ensures("result != null") + public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) { + return (slidingWindow != null) ? slidingWindow.close(knownSnpPositions) : emptyPair; } - public ObjectSet closeVariantRegions(CompressionStash regions) { - return slidingWindow == null ? 
ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions); + /** + * Finalizes current variant regions. + * + * @param knownSnpPositions the set of known SNP positions + * @return A non-null set/list of all reads generated + */ + @Ensures("result != null") + public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { + return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions, knownSnpPositions); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 11e023b9b..8a80c5570 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -57,6 +57,7 @@ import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; @@ -65,10 +66,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import java.util.Comparator; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.ListIterator; +import java.util.*; /** @@ -109,8 +107,6 @@ public class SlidingWindow { protected ReduceReads.DownsampleStrategy downsampleStrategy; private boolean hasIndelQualities; - private boolean allowPolyploidReductionInGeneral; - private static 
CompressionStash emptyRegions = new CompressionStash(); /** @@ -154,7 +150,7 @@ public class SlidingWindow { this.readsInWindow = new ObjectAVLTreeSet(); } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, boolean allowPolyploidReduction) { + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; @@ -188,8 +184,6 @@ public class SlidingWindow { this.downsampleStrategy = downsampleStrategy; this.hasIndelQualities = hasIndelQualities; - - this.allowPolyploidReductionInGeneral = allowPolyploidReduction; } /** @@ -403,12 +397,12 @@ public class SlidingWindow { * @param header the window header * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus - * @param isNegativeStrand should the synthetic read be represented as being on the negative strand? + * @param strandType the strandedness that the synthetic read should be represented as having * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated. 
*/ @Requires({"start >= 0 && (end >= start || end == 0)"}) @Ensures("result != null") - protected ObjectArrayList addToSyntheticReads(LinkedList header, int start, int end, boolean isNegativeStrand) { + protected ObjectArrayList addToSyntheticReads(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { ObjectArrayList reads = new ObjectArrayList(); if (start < end) { ListIterator headerElementIterator = header.listIterator(start); @@ -422,22 +416,22 @@ public class SlidingWindow { reads.addAll(finalizeAndAdd(ConsensusType.FILTERED)); int endOfConsensus = findNextNonConsensusElement(header, start, end); - addToRunningConsensus(header, start, endOfConsensus, isNegativeStrand); + addToRunningConsensus(header, start, endOfConsensus, strandType); if (endOfConsensus <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start)); - reads.addAll(addToSyntheticReads(header, endOfConsensus, end, isNegativeStrand)); + reads.addAll(addToSyntheticReads(header, endOfConsensus, end, strandType)); } else if (headerElement.hasFilteredData()) { reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS)); int endOfFilteredData = findNextNonFilteredDataElement(header, start, end); - reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand)); + reads.addAll(addToFilteredData(header, start, endOfFilteredData, strandType)); if (endOfFilteredData <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start)); - reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, isNegativeStrand)); + reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, strandType)); } else if (headerElement.isEmpty()) { reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); @@ -446,7 +440,7 @@ public class SlidingWindow { if (endOfEmptyData <= start) throw new ReviewedStingException(String.format("next start 
is <= current start: (%d <= %d)", endOfEmptyData, start)); - reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, isNegativeStrand)); + reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, strandType)); } else throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start)); @@ -558,16 +552,16 @@ public class SlidingWindow { * @param header the window header * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus - * @param isNegativeStrand should the synthetic read be represented as being on the negative strand? + * @param strandType the strandedness that the synthetic read should be represented as having * @return a non-null list of GATKSAMRecords representing finalized filtered consensus data. Empty list if no consensus was generated. */ @Requires({"start >= 0 && (end >= start || end == 0)"}) @Ensures("result != null") - private ObjectArrayList addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) { + private ObjectArrayList addToFilteredData(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { ObjectArrayList result = new ObjectArrayList(); if (filteredDataConsensus == null) - filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand); + filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType); ListIterator headerElementIterator = header.listIterator(start); for (int index = start; index < end; index++) { @@ -583,7 +577,7 @@ public class SlidingWindow { if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != 
headerElement.getLocation() ) { result.add(finalizeFilteredDataConsensus()); - filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, isNegativeStrand); + filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, strandType); } genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS()); @@ -601,12 +595,12 @@ public class SlidingWindow { * @param header the window header * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus - * @param isNegativeStrand should the synthetic read be represented as being on the negative strand? + * @param strandType the strandedness that the synthetic read should be represented as having */ @Requires({"start >= 0 && (end >= start || end == 0)"}) - private void addToRunningConsensus(LinkedList header, int start, int end, boolean isNegativeStrand) { + private void addToRunningConsensus(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { if (runningConsensus == null) - runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand); + runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType); Iterator headerElementIterator = header.listIterator(start); for (int index = start; index < end; index++) { @@ -642,29 +636,39 @@ public class SlidingWindow { * * @param start the first window header index in the variant region (inclusive) * 
@param stop the last window header index of the variant region (inclusive) - * @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here? + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here * @return a non-null list of all reads contained in the variant region */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected ObjectList compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { + protected ObjectList compressVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { ObjectList allReads = new ObjectArrayList(); // Try to compress into a polyploid consensus - int hetRefPosition = -1; - final Object[] header = windowHeader.toArray(); + // Optimization: don't bother if there are no known SNPs + final int hetRefPosition = knownSnpPositions.isEmpty() ? -1 : findSinglePolyploidCompressiblePosition(start, stop); - if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) - hetRefPosition = findSinglePolyploidCompressiblePosition(header, start, stop); + boolean successfullyCreatedPolyploidConsensus = false; - // Try to compress the variant region; note that using the hetRefPosition protects us from trying to compress - // variant regions that are created by insertions (since we can't confirm here that they represent the same allele) - if ( hetRefPosition != -1 ) { - allReads = createPolyploidConsensus(start, stop, ((HeaderElement) header[hetRefPosition]).getLocation()); + // Note that using the hetRefPosition protects us from trying to compress variant regions that are created by + // insertions (which we don't want because we can't confirm that they represent the same allele). + // Also, we only allow polyploid consensus creation at known sites. 
+ if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) { + + // try to create the polyploid consensus + final ObjectList polyploidReads = createPolyploidConsensus(start, stop, hetRefPosition); + + // if successful we are good to go! + if ( polyploidReads != null ) { + allReads.addAll(polyploidReads); + successfullyCreatedPolyploidConsensus = true; + } } - // Return all reads that overlap the variant region and remove them from the window header entirely - // also remove all reads preceding the variant region (since they will be output as consensus right after compression - else { + + // if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them + // from the window header entirely; also remove all reads preceding the variant region (since they will be output + // as consensus right after compression) + if ( !successfullyCreatedPolyploidConsensus ) { final int refStart = windowHeader.get(start).getLocation(); final int refStop = windowHeader.get(stop).getLocation(); @@ -678,35 +682,50 @@ public class SlidingWindow { toRemove.add(read); } } - removeReadsFromWindow(toRemove); + + // remove all used reads + for ( final GATKSAMRecord read : toRemove ) + readsInWindow.remove(read); } return allReads; } + /** + * Determines whether the given position match one of the known sites + * + * @param targetPosition the position of the het site + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here + * @return true if the targetPosition matches a known SNP position, false otherwise + */ + @Requires({"targetPosition >= 1 && knownSnpPositions != null"}) + protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet knownSnpPositions) { + final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition); + return 
knownSnpPositions.contains(targetLoc); + } + /* * Finds the het variant position located within start and stop (inclusive) if one exists. * - * @param header the header element array * @param start the first header index in the region to check (inclusive) * @param stop the last header index of the region to check (inclusive) * @return the window header index of the single het position or -1 if either none or more than one exists */ - @Requires("header != null && start >= 0 && (stop >= start || stop == 0)") - protected int findSinglePolyploidCompressiblePosition(final Object[] header, final int start, final int stop) { + @Requires("start >= 0 && (stop >= start || stop == 0)") + protected int findSinglePolyploidCompressiblePosition(final int start, final int stop) { int hetRefPosition = -1; for ( int i = start; i <= stop; i++ ) { - final int nAlleles = ((HeaderElement) header[i]).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT); + final int nAlleles = windowHeader.get(i).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false); - // we will only work on diploid cases because we just don't want to handle/test other scenarios - if ( nAlleles > 2 ) + // we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios + if ( nAlleles > 2 || nAlleles == -1 ) return -1; if ( nAlleles == 2 ) { // make sure that there is only 1 site in the region that contains more than one allele - if ( hetRefPosition >= 0 ) + if ( hetRefPosition != -1 ) return -1; hetRefPosition = i; @@ -716,21 +735,43 @@ public class SlidingWindow { return hetRefPosition; } + /* + * Checks whether there's a position in the header with a significant number of softclips. 
+ * + * @param header the window header to examine + * @param positionToSkip the global position to skip in the examination (use negative number if you don't want to make use of this argument) + * @return true if there exists a position with significant softclips, false otherwise + */ + @Requires("header != null") + protected boolean hasSignificantSoftclipPosition(final List header, final int positionToSkip) { + + for ( final HeaderElement headerElement : header ) { + + if ( headerElement.getLocation() == positionToSkip ) + continue; + + if ( headerElement.hasSignificantSoftclips(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT) ) + return true; + } + + return false; + } + /** * Finalizes a variant region, any adjacent synthetic reads. * * @param start the first window header index in the variant region (inclusive) * @param stop the last window header index of the variant region (inclusive) - * @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here? + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here * @return a non-null list of all reads contained in the variant region plus any adjacent synthetic reads */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected ObjectList closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { - ObjectList allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition); + protected ObjectList closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { + ObjectList allReads = compressVariantRegion(start, stop, knownSnpPositions); ObjectList result = (downsampleCoverage > 0) ? 
downsampleVariantRegion(allReads) : allReads; - result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); + result.addAll(addToSyntheticReads(windowHeader, 0, stop+1, SyntheticRead.StrandType.STRANDLESS)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); return result; // finalized reads will be downsampled if necessary @@ -739,10 +780,11 @@ public class SlidingWindow { /* * Finalizes the list of regions requested (and any regions preceding them) * - * @param regions the list of regions to finalize + * @param regions the list of regions to finalize + * @param knownSnpPositions the set of known SNP positions * @return a non-null set of reduced reads representing the finalized regions */ - public ObjectSet closeVariantRegions(final CompressionStash regions) { + public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); if ( !regions.isEmpty() ) { @@ -754,7 +796,7 @@ public class SlidingWindow { final int start = region.getStart() - windowHeaderStart; final int stop = region.getStop() - windowHeaderStart; - allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track + allReads.addAll(closeVariantRegion(start, stop, knownSnpPositions)); // We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions. 
// Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads @@ -772,6 +814,7 @@ public class SlidingWindow { if ( lastCleanedElement != null && lastCleanedElement.hasInsertionToTheRight() ) windowHeader.addFirst(new HeaderElement(lastCleanedElement.getLocation(), lastCleanedElement.numInsertionsToTheRight())); } + return allReads; } @@ -804,10 +847,11 @@ public class SlidingWindow { * regions that still exist regardless of being able to fulfill the * context size requirement in the end. * + * @param knownSnpPositions the set of known SNP positions * @return A non-null set/list of all reads generated */ @Ensures("result != null") - public Pair, CompressionStash> close() { + public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) { // mark variant regions ObjectSet finalizedReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); CompressionStash regions = new CompressionStash(); @@ -816,10 +860,10 @@ public class SlidingWindow { if (!windowHeader.isEmpty()) { markSites(getStopLocation(windowHeader) + 1); regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions); - finalizedReads = closeVariantRegions(regions); + finalizedReads = closeVariantRegions(regions, knownSnpPositions); if (!windowHeader.isEmpty()) { - finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false)); + finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS)); finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up } } @@ -863,86 +907,135 @@ public class SlidingWindow { return finalizedRead; } + // define this so that we can use Java generics below + private static class HeaderElementList extends LinkedList {} + /** - * Finalizes a variant region, any adjacent synthetic reads. 
+ * Finalizes a variant region for point mutations, and any adjacent synthetic reads. Indel sites are not supported. * - * @param start the first window header index in the variant region (inclusive) + * @param start the first window header index of the variant region (inclusive) * @param stop the last window header index of the variant region (inclusive) - * @param hetRefPosition reference position (in global coordinates) of the het site - * @return a non-null list of all reads contained in the variant region as a polyploid consensus + * @param hetRefPosition window header index of the het site; MUST NOT BE AN INDEL SITE! + * @return a list of all reads contained in the variant region as a polyploid consensus, or null if not possible */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - @Ensures("result != null") - private ObjectList createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) { - // we will create two (positive strand, negative strand) headers for each contig - ObjectList> headersPosStrand = new ObjectArrayList>(); - ObjectList> headersNegStrand = new ObjectArrayList>(); - ObjectList hetReads = new ObjectArrayList(); - Byte2IntMap haplotypeHeaderMap = new Byte2IntArrayMap(2); - int currentHaplotype = 0; - int refStart = windowHeader.get(start).getLocation(); - int refStop = windowHeader.get(stop).getLocation(); - ObjectList toRemove = new ObjectArrayList(); - for (GATKSAMRecord read : readsInWindow) { - int haplotype; + protected ObjectList createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) { + // we will create two (positive strand, negative strand) headers for each haplotype + final HeaderElementList[] headersPosStrand = new HeaderElementList[2]; + final HeaderElementList[] headersNegStrand = new HeaderElementList[2]; - // check if the read is either before or inside the variant region - if (read.getSoftStart() <= refStop) { - // check if the read is inside the variant region - if 
(read.getMappingQuality() >= MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) { - // check if the read contains the het site - if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) { - int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); - // TODO -- THIS IS A HUGE BUG AS IT WILL NOT WORK FOR DELETIONS; see commented out unit test - byte base = read.getReadBases()[readPos]; - byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos]; + final int refStart = windowHeader.get(start).getLocation(); + final int refStop = windowHeader.get(stop).getLocation(); + final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation(); - // check if base passes the filters! - if (qual >= MIN_BASE_QUAL_TO_COUNT) { - // check which haplotype this read represents and take the index of it from the list of headers - if (haplotypeHeaderMap.containsKey(base)) { - haplotype = haplotypeHeaderMap.get(base); - } - // create new lists if this haplotype has not been seen yet - else { - haplotype = currentHaplotype; - haplotypeHeaderMap.put(base, currentHaplotype); - headersPosStrand.add(new LinkedList()); - headersNegStrand.add(new LinkedList()); - currentHaplotype++; - } - LinkedList header = read.getReadNegativeStrandFlag() ? 
headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype); - // add to the polyploid header + // initialize the mapping from base (allele) to header + final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2); + for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false) ) { + final int currentIndex = alleleHeaderMap.size(); + if ( currentIndex > 1 ) + throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus"); + + alleleHeaderMap.put(allele.b, currentIndex); + headersPosStrand[currentIndex] = new HeaderElementList(); + headersNegStrand[currentIndex] = new HeaderElementList(); + } + + // sanity check that we saw 2 alleles + if ( alleleHeaderMap.size() != 2 ) + throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size()); + + final ObjectList toRemoveFromReadCache = new ObjectArrayList(); + final ObjectList toRemoveFromHeader = new ObjectArrayList(); + + for ( final GATKSAMRecord read : readsInWindow ) { + + // if the read falls after the region, just skip it for now (we'll get to it later) + if ( read.getSoftStart() > refStop ) + continue; + + // if the read falls before the region, remove it + if ( read.getSoftEnd() < refStart ) { + toRemoveFromReadCache.add(read); + continue; + } + + // check whether the read spans the het site + if ( read.getSoftStart() <= globalHetRefPosition && read.getSoftEnd() >= globalHetRefPosition ) { + + // make sure it meets the minimum mapping quality requirement (if not, we'll remove it and not use it for the consensuses) + if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) { + + // where on the read is the het position? 
+ final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); + + // this is safe because indels are not supported + final byte base = read.getReadBases()[readPosOfHet]; + final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet]; + + // make sure that the base passes filters (if not, we'll remove it and not use it for the consensuses) + if ( qual >= MIN_BASE_QUAL_TO_COUNT ) { + + // check which allele this read represents + final Integer allele = alleleHeaderMap.get(base); + + // ignore the read if it represents a base that's not part of the consensus + if ( allele != null ) { + // add to the appropriate polyploid header + final LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; addToHeader(header, read); - // remove from the standard header so that we don't double count it - removeFromHeader(windowHeader, read); } } } - // we remove all reads before and inside the variant region from the window - toRemove.add(read); + // remove from the standard header so that we don't double count it + toRemoveFromHeader.add(read); } + + // we remove all reads falling inside the variant region from the window + toRemoveFromReadCache.add(read); } - for (LinkedList header : headersPosStrand) { - if (header.size() > 0) - hetReads.addAll(addToSyntheticReads(header, 0, header.size(), false)); - if (runningConsensus != null) - hetReads.add(finalizeRunningConsensus()); + // sanity check that no new "variant region" exists on just a single consensus strand + // due to softclips now that we've broken everything out into their component parts + for ( final LinkedList header : headersPosStrand ) { + if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) ) + return null; } - for (LinkedList header : headersNegStrand) { - if (header.size() > 0) - hetReads.addAll(addToSyntheticReads(header, 0, header.size(), true)); - if 
(runningConsensus != null) - hetReads.add(finalizeRunningConsensus()); + for ( final LinkedList header : headersNegStrand ) { + if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) ) + return null; } - removeReadsFromWindow(toRemove); + // create the polyploid synthetic reads + final ObjectList hetReads = new ObjectArrayList(); + for ( final LinkedList header : headersPosStrand ) + finalizeHetConsensus(header, false, hetReads); + for ( final LinkedList header : headersNegStrand ) + finalizeHetConsensus(header, true, hetReads); + + // remove all used reads + for ( final GATKSAMRecord read : toRemoveFromReadCache ) + readsInWindow.remove(read); + for ( final GATKSAMRecord read : toRemoveFromHeader ) + removeFromHeader(windowHeader, read); return hetReads; } + /* + * Finalizes a particular het consensus for the given header representation + * + * @param header the list of header elements representing the header for the consensus + * @param isNegativeStrand does this header represent reads on the negative strand? + * @param result list in which to store results + */ + protected void finalizeHetConsensus(final LinkedList header, final boolean isNegativeStrand, final ObjectList result) { + if ( header.size() > 0 ) + result.addAll(addToSyntheticReads(header, 0, header.size(), isNegativeStrand ? 
SyntheticRead.StrandType.NEGATIVE : SyntheticRead.StrandType.POSITIVE)); + if ( runningConsensus != null ) + result.add(finalizeRunningConsensus()); + } + private void addToHeader(LinkedList header, GATKSAMRecord read) { updateHeaderCounts(header, read, false); } @@ -1101,11 +1194,5 @@ public class SlidingWindow { } } } - - private void removeReadsFromWindow (final ObjectList readsToRemove) { - for (final GATKSAMRecord read : readsToRemove) { - readsInWindow.remove(read); - } - } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java index 451e50286..b1ac19f50 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -76,9 +76,17 @@ import java.util.Iterator; * @since 8/26/11 */ public class SyntheticRead { - // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce - // memory footprint. - // TODO: better name + + /** + * The types of strandedness for synthetic reads + */ + public enum StrandType { + POSITIVE, + NEGATIVE, + STRANDLESS + } + + // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint. 
private static class SingleBaseInfo { byte baseIndexOrdinal; // enum BaseIndex.ordinal byte count; @@ -134,7 +142,7 @@ public class SyntheticRead { private String readName; private int refStart; private boolean hasIndelQualities = false; - private boolean isNegativeStrand = false; + private StrandType strandType = StrandType.STRANDLESS; /** * Full initialization of the running consensus if you have all the information and are ready to @@ -147,7 +155,7 @@ public class SyntheticRead { * @param readName the read's name * @param refStart the alignment start (reference based) */ - public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) { final int initialCapacity = 10000; basesCountsQuals = new ObjectArrayList(initialCapacity); mappingQuality = 0.0; @@ -159,10 +167,10 @@ public class SyntheticRead { this.readName = readName; this.refStart = refStart; this.hasIndelQualities = hasIndelQualities; - this.isNegativeStrand = isNegativeRead; + this.strandType = strandType; } - public SyntheticRead(ObjectArrayList bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(ObjectArrayList bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, 
StrandType strandType) { basesCountsQuals = new ObjectArrayList(bases.size()); for (int i = 0; i < bases.size(); ++i) { basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i))); @@ -175,7 +183,7 @@ public class SyntheticRead { this.readName = readName; this.refStart = refStart; this.hasIndelQualities = hasIndelQualities; - this.isNegativeStrand = isNegativeRead; + this.strandType = strandType; } /** @@ -216,8 +224,11 @@ public class SyntheticRead { read.setReferenceIndex(contigIndex); read.setReadPairedFlag(false); read.setReadUnmappedFlag(false); - read.setReadNegativeStrandFlag(isNegativeStrand); - read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions) + if ( strandType != StrandType.STRANDLESS ) { + read.setAttribute(GATKSAMRecord.REDUCED_READ_STRANDED_TAG, '1'); // must come before next line + read.setReadNegativeStrandFlag(strandType == StrandType.NEGATIVE); + } + read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions) read.setAlignmentStart(refStart); read.setReadName(readName); read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 4259dbdb6..4e13e0d9d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -599,9 +599,9 @@ public class UnifiedGenotyperEngine { int numDeletions = 0; for ( final PileupElement p : rawContext.getBasePileup() ) { if ( p.isDeletion() ) - numDeletions++; + numDeletions += p.getRepresentativeCount(); } - if ( ((double) numDeletions) / ((double) 
rawContext.getBasePileup().getNumberOfElements()) > UAC.MAX_DELETION_FRACTION ) { + if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) { return null; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java new file mode 100644 index 000000000..2f8295008 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java @@ -0,0 +1,175 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.HashSet; +import java.util.Set; + +/** + * Emits intervals present in either the original or reduced bam but not the other. + * + *

        Input

        + *

        + * The original and reduced BAM files. + *

        + * + *

        Output

        + *

        + * A list of intervals present in one bam but not the other. + *

        + * + *

        Examples

        + *
        + * java -Xmx2g -jar GenomeAnalysisTK.jar \
        + *   -I:original original.bam \
        + *   -I:reduced reduced.bam \
        + *   -R ref.fasta \
        + *   -T AssessReducedCoverage \
        + *   -o output.intervals
        + * 
        + * + * @author ebanks + */ +@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) +@Hidden +public class AssessReducedCoverage extends LocusWalker implements TreeReducible { + + private static final String original = "original"; + private static final String reduced = "reduced"; + + @Output + protected PrintStream out; + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) + public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; + + public void initialize() {} + + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + if ( tracker == null ) + return null; + + final Set tags = getAllTags(context.getBasePileup()); + return (tags.contains(original) && !tags.contains(reduced)) || + (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? 
ref.getLocus() : null; + } + + private Set getAllTags(final ReadBackedPileup pileup) { + + final Set tags = new HashSet(10); + + for ( final PileupElement p : pileup ) { + if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) + tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); + } + + return tags; + } + + public void onTraversalDone(GenomeLoc sum) { + if ( sum != null ) + out.println(sum); + } + + public GenomeLoc reduceInit() { + return null; + } + + public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { + if ( lhs == null ) + return rhs; + + if ( rhs == null ) + return lhs; + + // if contiguous, just merge them + if ( lhs.contiguousP(rhs) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); + + // otherwise, print the lhs and start over with the rhs + out.println(lhs); + return rhs; + } + + public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { + if ( value == null ) + return sum; + + if ( sum == null ) + return value; + + // if contiguous, just merge them + if ( sum.contiguousP(value) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); + + // otherwise, print the sum and start over with the value + out.println(sum); + return value; + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java index c48c7cdc7..2f744e914 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java @@ -53,6 +53,7 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; 
+import java.util.Arrays; import java.util.List; public class HeaderElementUnitTest extends BaseTest { @@ -136,10 +137,12 @@ public class HeaderElementUnitTest extends BaseTest { private class AllelesTest { public final int[] counts; public final double proportion; + public final boolean allowDeletions; - private AllelesTest(final int[] counts, final double proportion) { + private AllelesTest(final int[] counts, final double proportion, final boolean allowDeletions) { this.counts = counts; this.proportion = proportion; + this.allowDeletions = allowDeletions; } } @@ -150,12 +153,16 @@ public class HeaderElementUnitTest extends BaseTest { final int[] counts = new int[]{ 0, 5, 10, 15, 20 }; final double [] proportions = new double[]{ 0.0, 0.05, 0.10, 0.50, 1.0 }; - for ( final int count1 : counts ) { - for ( final int count2 : counts ) { - for ( final int count3 : counts ) { - for ( final int count4 : counts ) { - for ( final double proportion : proportions ) { - tests.add(new Object[]{new AllelesTest(new int[]{count1, count2, count3, count4}, proportion)}); + for ( final int countA : counts ) { + for ( final int countC : counts ) { + for ( final int countG : counts ) { + for ( final int countT : counts ) { + for ( final int countD : counts ) { + for ( final double proportion : proportions ) { + for ( final boolean allowDeletions : Arrays.asList(true, false) ) { + tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, proportion, allowDeletions)}); + } + } } } } @@ -170,24 +177,27 @@ public class HeaderElementUnitTest extends BaseTest { HeaderElement headerElement = new HeaderElement(1000, 0); for ( int i = 0; i < test.counts.length; i++ ) { - BaseIndex base = BaseIndex.values()[i]; + final BaseIndex base = BaseIndex.values()[i]; for ( int j = 0; j < test.counts[i]; j++ ) headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false); } - final int nAllelesSeen = 
headerElement.getNumberOfAlleles(test.proportion); - final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion); + final int nAllelesSeen = headerElement.getNumberOfAlleles(test.proportion, test.allowDeletions); + final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion, test.allowDeletions); Assert.assertEquals(nAllelesSeen, nAllelesExpected); } - private static int calculateExpectedAlleles(final int[] counts, final double proportion) { + private static int calculateExpectedAlleles(final int[] counts, final double proportion, final boolean allowDeletions) { double total = 0.0; for ( final int count : counts ) { total += count; } - final int minCount = (int)(proportion * total); + final int minCount = Math.max(1, (int)(proportion * total)); + + if ( !allowDeletions && counts[BaseIndex.D.index] >= minCount ) + return -1; int result = 0; for ( final int count : counts ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 0cbd537ed..de95b5e9a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -47,12 +47,16 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.collections.Pair; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; +import java.util.List; public class ReduceReadsIntegrationTest extends WalkerTest { final static String REF = b37KGReference; + final static String DBSNP = b37dbSNP132; final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; final String 
DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam"; final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam"; @@ -67,48 +71,128 @@ public class ReduceReadsIntegrationTest extends WalkerTest { final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam"; final String INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam"; - private void RRTest(String testName, String args, String md5) { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s "; - WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5)); + final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e"; + + protected Pair, List> executeTest(final String name, final WalkerTestSpec spec) { + final Pair, List> result = super.executeTest(name, spec); + + // perform some Reduce Reads specific testing now + if ( result != null ) { + + // generate a new command-line based on the old one + spec.disableImplicitArgs(); + final String[] originalArgs = spec.getArgsWithImplicitArgs().split(" "); + + final StringBuilder newArgs = new StringBuilder(); + for ( int i = 0; i < originalArgs.length; i++ ) { + final String arg = originalArgs[i]; + if ( arg.equals("-T") ) { + newArgs.append("-T AssessReducedCoverage "); + } else if ( arg.startsWith("-I") ) { + newArgs.append("-I:original "); + newArgs.append(originalArgs[++i]); + newArgs.append(" "); + } else if ( arg.equals("-R") || arg.equals("-L") ) { + newArgs.append(arg); + newArgs.append(" "); + newArgs.append(originalArgs[++i]); + newArgs.append(" "); + } + } + for ( final File file : result.getFirst() ) { + newArgs.append("-I:reduced "); + newArgs.append(file.getAbsolutePath()); + newArgs.append(" "); + } + newArgs.append("-o %s"); + + super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(newArgs.toString(), Arrays.asList(emptyFileMd5))); + } + + return result; + } + + protected 
Pair, List> executeTestWithoutAdditionalRRTests(final String name, final WalkerTestSpec spec) { + return super.executeTest(name, spec); + } + + private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) { + String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? " -known " + DBSNP : "") + " "; + WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5)); executeTest(testName, spec); } @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "16d97a47b8dbfae4ea64fbdf522b693c"); + RRTest("testDefaultCompression ", L, "538362abd504200800145720b23c98ce", false); } @Test(enabled = true) - public void testInsertionsAtEdgeOfConsensus() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; - executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("f7a9a27c5eaf791b67a768fff960a9e1"))); + public void testDefaultCompressionWithKnowns() { + RRTest("testDefaultCompressionWithKnowns ", L, "79cdbd997196957af63f46353cff710b", true); } + private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; + @Test(enabled = true) public void testMultipleIntervals() { - String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "8886ba383e21883241b386882e8e5063"); + RRTest("testMultipleIntervals ", intervals, "6733b25e87e3fce5753cf7936ccf934f", false); + } + + @Test(enabled = true) + public void testMultipleIntervalsWithKnowns() { + 
RRTest("testMultipleIntervalsWithKnowns ", intervals, "99e2a79befc71eaadb4197c66a0d6df8", true); } @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "54253f25d363852a1182aff33e500b92"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e3b7e14655973c8950d7fec96321e483", false); + } + + @Test(enabled = true) + public void testHighCompressionWithKnowns() { + RRTest("testHighCompressionWithKnowns ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "30a7ed079b3a41ed63e520260fa6afe3", true); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "1d7d2d28900db57dad65a8beef64b8cb"); + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", false); + } + + @Test(enabled = true) + public void testLowCompressionWithKnowns() { + RRTest("testLowCompressionWithKnowns ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", true); } @Test(enabled = true) public void testIndelCompression() { - RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f58ae2154e0e5716be0e850b7605856e"); + final String md5 = "f58ae2154e0e5716be0e850b7605856e"; + RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false); + RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true); } @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bfe0693aea74634f1035a9bd11302517"))); + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bfe0693aea74634f1035a9bd11302517"))); 
+ } + + @Test(enabled = true) + public void testCoReduction() { + String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("930ec2e2c3b62bec7a2425a82c64f022"))); + } + + @Test(enabled = true) + public void testCoReductionWithKnowns() { + String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; + executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fe7c9fd35e50a828e0f38a7ae25b60a7"))); + } + + @Test(enabled = true) + public void testInsertionsAtEdgeOfConsensus() { + String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; + executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4445db7aeddaf2f1d86e1af0cdc74c8"))); } /** @@ -122,7 +206,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("f118e83c394d21d901a24230379864fc"))); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("f118e83c394d21d901a24230379864fc"))); } /** @@ -132,26 +216,20 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; - executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bd5198a3e21034887b741faaaa3964bf"))); - } - - 
@Test(enabled = true) - public void testCoReduction() { - String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("81312c31b9910a42bff6acb5167592ab"))); + // we expect to lose coverage due to the downsampling so don't run the systematic coverage test + executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bd5198a3e21034887b741faaaa3964bf"))); } /** - * Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will + * Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will * break the current hard-clipping routine that protects reduce reads from such reads. */ @Test(enabled = true) public void testReadOffContig() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("b4dc66445ddf5f467f67860bed023ef8"))); + executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4dc66445ddf5f467f67860bed023ef8"))); } - /** * Confirm that if both ends of pair are in same variant region, compressed names of both ends of pair are the same. 
*/ @@ -159,7 +237,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testPairedReadsInVariantRegion() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + " -o %s --downsample_coverage 250 -dcov 50 "; - executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("9bed260b6245f5ff47db8541405504aa"))); + executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9bed260b6245f5ff47db8541405504aa"))); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java index b9399bb1b..15b79b78a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java @@ -49,12 +49,29 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import net.sf.samtools.SAMFileHeader; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; 
+import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; +import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Random; @@ -96,7 +113,7 @@ public class ReduceReadsUnitTest extends BaseTest { /** * Test the read name compression functionality */ - @Test(dataProvider = "ReadNameProvider") + @Test(dataProvider = "ReadNameProvider", enabled = false) public void testReadNameCompression(final String name, final boolean alreadySeen) { GATKSAMRecord read = GATKSAMRecord.createRandomRead(1); read.setReadName(name); @@ -108,4 +125,90 @@ public class ReduceReadsUnitTest extends BaseTest { Assert.assertTrue(hash.containsKey(name)); } + + ///////////////////////////////////////////////////////////////////////////// + //// This section tests the functionality related to known SNP positions //// + ///////////////////////////////////////////////////////////////////////////// + + private static SAMFileHeader header; + private static GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 100); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + @DataProvider(name = "PopulateKnownsProvider") + public Object[][] populateKnownsProvider() { + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + final Allele AC = Allele.create("AC"); + + final VariantContext snp_1_10 = new VariantContextBuilder("known", "chr1", 10, 10, Arrays.asList(A, C)).make(); + final VariantContext snp_1_10_2 = new VariantContextBuilder("known", "chr1", 10, 10, Arrays.asList(A, G)).make(); + final 
VariantContext snp_1_20 = new VariantContextBuilder("known", "chr1", 20, 20, Arrays.asList(A, C)).make(); + final VariantContext snp_1_30 = new VariantContextBuilder("known", "chr1", 30, 30, Arrays.asList(A, C)).make(); + final VariantContext snp_2_10 = new VariantContextBuilder("known", "chr2", 10, 10, Arrays.asList(A, C)).make(); + final VariantContext snp_3_10 = new VariantContextBuilder("known", "chr3", 10, 10, Arrays.asList(A, C)).make(); + final VariantContext indel_1_40 = new VariantContextBuilder("known", "chr1", 40, 40, Arrays.asList(A, AC)).make(); + final VariantContext indel_2_40 = new VariantContextBuilder("known", "chr2", 40, 40, Arrays.asList(A, AC)).make(); + + final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "foo1", 0, 1, 1); + final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "foo2", 1, 1, 1); + final GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "foo3", 2, 1, 1); + + final ObjectArrayList tests = new ObjectArrayList(); + + // test single + tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10))}); + + // test multiple at one position + tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_10_2))}); + + // test multiple + tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); + + // test indel not used + tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_1_40))}); + tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_2_40))}); + + // test read clears + tests.add(new Object[]{3, 0, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), 
makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); + tests.add(new Object[]{4, 1, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))}); + tests.add(new Object[]{3, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); + tests.add(new Object[]{4, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))}); + tests.add(new Object[]{4, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_3_10))}); + tests.add(new Object[]{5, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10), makeRefMetaDataTracker(snp_3_10))}); + + return tests.toArray(new Object[][]{}); + } + + private final RefMetaDataTracker makeRefMetaDataTracker(final Feature feature) { + final List x = new ArrayList(); + x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, feature, "known")); + final RODRecordList rods = new RODRecordListImpl("known", x, genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd())); + return new RefMetaDataTracker(Arrays.asList(rods)); + } + + @Test(dataProvider = "PopulateKnownsProvider") + public void testPopulateKnowns(final int expectedSizeBeforeClear, final int expectedSizeAfterClear, final GATKSAMRecord read, final List trackers) { + final ReduceReads rr = new ReduceReads(); + RodBinding.resetNameCounter(); + rr.known = Arrays.>asList(new RodBinding(VariantContext.class, "known")); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + rr.setToolkit(engine); + + for ( final 
RefMetaDataTracker tracker : trackers ) + rr.populateKnownSNPs(tracker); + Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeBeforeClear); + + rr.clearStaleKnownPositions(read); + Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeAfterClear); + } + } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java index 054f7aa15..f081b9f8a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -46,9 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import it.unimi.dsi.fastutil.objects.ObjectList; -import it.unimi.dsi.fastutil.objects.ObjectSet; +import it.unimi.dsi.fastutil.objects.*; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; @@ -317,6 +315,7 @@ public class SlidingWindowUnitTest extends BaseTest { private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295); private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309); private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310); + private static final GenomeLoc loc312 = new UnvalidatingGenomeLoc("1", 0, 1000312, 1000312); private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100); @DataProvider(name = "ConsensusCreation") @@ -325,10 +324,11 @@ public class SlidingWindowUnitTest extends BaseTest { // test high quality reads and bases tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 1, 1)}); - 
tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 9, 5)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 9, 6)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 11, 11)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc312), false, false, 11, 8)}); // test low quality reads tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 1, 1)}); @@ -349,8 +349,7 @@ public class SlidingWindowUnitTest extends BaseTest { tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 3, 3)}); // test I/D operators - // TODO -- uncomment this test when the deletion bug is fixed! - // tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 5)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 9)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 11, 11)}); @@ -364,23 +363,66 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(dataProvider = "ConsensusCreation", enabled = true) public void testConsensusCreationTest(ConsensusCreationTest test) { - // test WITHOUT het compression allowed - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false); + final ObjectAVLTreeSet knownSNPs = new 
ObjectAVLTreeSet(); + + // test WITHOUT het compression + SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(); + Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads); - // test WITH het compression allowed - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, true); + // test WITH het compression + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); - result = slidingWindow.close(); + for ( int i = 0; i < 1200; i++ ) + knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); + result = slidingWindow.close(knownSNPs); Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); } + @Test + public void testConsensusCreationForMultiallelic() { + + final int totalNumReads = 7; + final ObjectList myReads = new ObjectArrayList(totalNumReads); + + for ( int i = 0; i < totalNumReads; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setMappingQuality(30); + read.setReadNegativeStrandFlag(false); + + final char base = i < totalNumReads - 2 ? 'A' : ( i == totalNumReads - 2 ? 
'C' : 'G'); + read.setReadBases(Utils.dupBytes((byte) base, readLength)); + + myReads.add(read); + } + + final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); + + // test WITHOUT het compression + SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty + + Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all + + // test WITH het compression + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + for ( int i = 0; i < readLength; i++ ) + knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); + result = slidingWindow.close(knownSNPs); + + Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all + } + /////////////////////////////////////////////////////////// //// This section tests the downsampling functionality //// @@ -398,7 +440,7 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(dataProvider = "Downsampling", enabled = true) public void testDownsamplingTest(final int dcov) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); final ObjectList result = slidingWindow.downsampleVariantRegion(basicReads); Assert.assertEquals(result.size(), 
Math.min(dcov, basicReads.size())); @@ -446,10 +488,10 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(dataProvider = "ConsensusQuals", enabled = true) public void testConsensusQualsTest(QualsTest test) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); - final Pair, CompressionStash> result = slidingWindow.close(); + final Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet()); Assert.assertEquals(result.getFirst().size(), 1); final GATKSAMRecord read = result.getFirst().iterator().next(); @@ -515,7 +557,7 @@ public class SlidingWindowUnitTest extends BaseTest { read.setBaseQualities(Utils.dupBytes((byte) 30, readLength)); read.setMappingQuality(30); - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); int newIndex = slidingWindow.createNewHeaderElements(windowHeader, read, start); Assert.assertEquals(newIndex, start > 0 ? 
start : 0); @@ -559,7 +601,7 @@ public class SlidingWindowUnitTest extends BaseTest { read.setMappingQuality(30); // add the read - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start); for ( int i = 0; i < start; i++ ) Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); @@ -573,4 +615,84 @@ public class SlidingWindowUnitTest extends BaseTest { for ( int i = 0; i < currentHeaderLength; i++ ) Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); } + + + ////////////////////////////////////////////////////////////////////////////////// + //// This section tests functionality related to polyploid consensus creation //// + ////////////////////////////////////////////////////////////////////////////////// + + @DataProvider(name = "MatchesKnownProvider") + public Object[][] matchesKnownProvider() { + + final ObjectArrayList tests = new ObjectArrayList(); + + // test no knowns + tests.add(new Object[]{new ObjectAVLTreeSet(), loc290.getStart(), false}); + + final ObjectSortedSet knownSnpPositions = new ObjectAVLTreeSet(); + knownSnpPositions.add(loc290); + knownSnpPositions.add(loc295); + knownSnpPositions.add(loc310); + + // test overlap + tests.add(new Object[]{knownSnpPositions, loc290.getStart(), true}); + tests.add(new Object[]{knownSnpPositions, loc295.getStart(), true}); + tests.add(new Object[]{knownSnpPositions, loc310.getStart(), true}); + tests.add(new Object[]{knownSnpPositions, loc309.getStart(), false}); + + return tests.toArray(new Object[][]{}); + } + + 
@Test(dataProvider = "MatchesKnownProvider") + public void testMatchesKnown(final ObjectSortedSet knownSnpPositions, final int targetLoc, final boolean expectedResult) { + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10); + Assert.assertEquals(slidingWindow.matchesKnownPosition(targetLoc, knownSnpPositions), expectedResult); + } + + @DataProvider(name = "SignificantSoftclipsProvider") + public Object[][] SignificantSoftclipsTestData() { + List tests = new ArrayList(); + + for ( final int indexWithSoftclips : Arrays.asList(-1, 0, 5, 9) ) { + for ( final int indexToSkip : Arrays.asList(-1, 0, 5, 9) ) { + tests.add(new Object[]{indexWithSoftclips, indexToSkip}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SignificantSoftclipsProvider", enabled = true) + public void significantSoftclipsTest(final int indexWithSoftclips, final int indexToSkip) { + + // set up the window header + final int currentHeaderStart = 100; + final int currentHeaderLength = 10; + final LinkedList windowHeader = new LinkedList(); + for ( int i = 0; i < currentHeaderLength; i++ ) + windowHeader.add(new HeaderElement(currentHeaderStart + i)); + + // set up the normal read + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, currentHeaderLength); + read.setReadBases(Utils.dupBytes((byte) 'A', currentHeaderLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, currentHeaderLength)); + read.setMappingQuality(30); + + // add the read + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); + slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0); + + // set up and add a soft-clipped read if requested + if ( indexWithSoftclips != -1 ) { + final GATKSAMRecord softclippedRead = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, 
currentHeaderStart + indexWithSoftclips, 1); + softclippedRead.setReadBases(new byte[]{(byte) 'A'}); + softclippedRead.setBaseQualities(new byte[]{(byte) 30}); + softclippedRead.setMappingQuality(30); + softclippedRead.setCigarString("1S"); + slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips); + } + + final boolean result = slidingWindow.hasSignificantSoftclipPosition(windowHeader, currentHeaderStart + indexToSkip); + Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java index 570b797ca..6886568e8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java @@ -77,7 +77,7 @@ public void testBaseCounts() { new TestRead(bases, quals, new byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})}; for (TestRead testRead : testReads) { - SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false); + SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, artificialSAMHeader, artificialGATKRG, 
artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, SyntheticRead.StrandType.STRANDLESS); Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts()); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index b5fe79993..0620f15df 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -69,18 +69,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "866c19ba60862ad1569d88784423ec8c"); + testReducedCalling("SNP", "b424779c6609cb727a675bdd301290e6"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "3e01f990c7a7c25fd9e42be559ca2942"); + testReducedCalling("INDEL", "9a702e7a85465f6c42d6c1828aee6c38"); } private void testReducedCalling(final String model, final String md5) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1, + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, Arrays.asList(md5)); executeTest("test calling on a ReducedRead BAM with " + model, spec); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c5f9f606b..01f39a67b 
100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -52,6 +52,7 @@ import java.util.*; public class GATKSAMRecord extends BAMRecord { // ReduceReads specific attribute tags public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool + public static final String REDUCED_READ_STRANDED_TAG = "RS"; // marks a stranded synthetic read produced by the ReduceReads tool public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end @@ -74,7 +75,7 @@ public class GATKSAMRecord extends BAMRecord { private int softEnd = UNINITIALIZED; private Integer adapterBoundary = null; - private boolean isStrandlessRead = false; + private Boolean isStrandlessRead = null; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; @@ -158,6 +159,9 @@ public class GATKSAMRecord extends BAMRecord { * @return true if this read doesn't have meaningful strand information */ public boolean isStrandless() { + if ( isStrandlessRead == null ) { + isStrandlessRead = isReducedRead() && getCharacterAttribute(REDUCED_READ_STRANDED_TAG) == null; + } return isStrandlessRead; } @@ -175,7 +179,7 @@ public class GATKSAMRecord extends BAMRecord { } @Override - public void setReadNegativeStrandFlag(boolean flag) { + public void setReadNegativeStrandFlag(final boolean flag) { if ( isStrandless() ) throw new IllegalStateException("Cannot set the strand of a strandless read"); super.setReadNegativeStrandFlag(flag); diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java index 6472a10bb..c07bf171a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java @@ -72,7 +72,7 @@ public class CallableLociIntegrationTest extends WalkerTest { public void testWithReducedRead() { String gatk_args = reduceReadArgs + " -L 20:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, - Arrays.asList("684069ffe94a1175051066ed53f0fd9d", "ebc310cf734d98e26d2d83e16b1144d1")); + Arrays.asList("69fc303c888fd1fa2937b9518dc82f9e", "f512a85c373087ce03a24ab0f98522c0")); executeTest("CallableLoci with ReducedRead", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 38840fab1..18a501b51 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -64,7 +64,6 @@ public class GATKSAMRecordUnitTest extends BaseTest { for (int i = 0; i < reducedRead.getReadLength(); i++) { Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); } - Assert.assertEquals(reducedRead.isStrandless(), false, "Reduced reads don't have meaningful strandedness information"); } @Test From 464e65ea96161bcd6a213c16f22c8622dea0fc3a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 23 Mar 2013 14:10:54 -0400 Subject: [PATCH 102/226] Disable error correcting kmers by default in the HC -- The error correction 
algorithm can break the reference graph in some cases by error correcting us into a bad state for the reference sequence. Because we know that the error correction algorithm isn't ideal, and worse, doesn't actually seem to improve the calling itself on chr20, I've simply disabled error correction by default and allowed it to be turned on with a hidden argument. -- In the process I've changed a bit the assembly interface, moving some common arguments us into the LocalAssemblyEngine, which are turned on/off via setter methods. -- Went through the updated arguments in the HC to be @Hidden and @Advanced as appropriate -- Don't write out an errorcorrected graph when debugging and error correction isn't enabled --- .../haplotypecaller/DeBruijnAssembler.java | 44 +++++++------------ .../haplotypecaller/HaplotypeCaller.java | 25 +++++++++-- .../haplotypecaller/LocalAssemblyEngine.java | 42 ++++++++++++++++-- 3 files changed, 77 insertions(+), 34 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 6aec9c7a5..c219fab00 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -65,7 +65,6 @@ import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.io.PrintStream; import java.util.*; /** @@ -83,7 +82,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25; - public static 
final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; private static final int GRAPH_KMER_STEP = 6; // Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode @@ -94,30 +92,23 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private final boolean debug; private final boolean debugGraphTransformations; - private final PrintStream graphWriter; private final int minKmer; - private final byte minBaseQualityToUseInAssembly; private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; - private int PRUNE_FACTOR = 2; protected DeBruijnAssembler() { - this(false, -1, null, 11, DEFAULT_MIN_BASE_QUALITY_TO_USE); + this(false, -1, 11); } public DeBruijnAssembler(final boolean debug, final int debugGraphTransformations, - final PrintStream graphWriter, - final int minKmer, - final byte minBaseQualityToUseInAssembly) { + final int minKmer) { super(); this.debug = debug; this.debugGraphTransformations = debugGraphTransformations > 0; this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; - this.graphWriter = graphWriter; this.minKmer = minKmer; - this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; } /** @@ -126,19 +117,15 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { * @param refHaplotype reference haplotype object * @param fullReferenceWithPadding byte array holding the reference sequence with padding * @param refLoc GenomeLoc object corresponding to the reference sequence with padding - * @param PRUNE_FACTOR prune kmers from the graph if their weight is <= this value * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode * @return a non-empty list of all the haplotypes that are produced during assembly */ @Ensures({"result.contains(refHaplotype)"}) - public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int 
PRUNE_FACTOR, final List activeAllelesToGenotype ) { + public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype ) { if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); } if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); } - if( PRUNE_FACTOR < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } - - // set the pruning factor for this run of the assembly engine - this.PRUNE_FACTOR = PRUNE_FACTOR; + if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } // create the graphs final List graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype ); @@ -170,13 +157,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready - if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), PRUNE_FACTOR); - graph = graph.errorCorrect(); - if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); + + if ( shouldErrorCorrectKmers() ) { + graph = graph.errorCorrect(); + if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor); + } final SeqGraph seqGraph = toSeqGraph(graph); - 
if( seqGraph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference + if ( seqGraph != null ) { // if the graph contains interesting variation from the reference sanityCheckReferenceGraph(seqGraph, refHaplotype); graphs.add(seqGraph); @@ -192,19 +182,19 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) { final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), PRUNE_FACTOR); - seqGraph.pruneGraph(PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + seqGraph.pruneGraph(pruneFactor); seqGraph.removeVerticesNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), pruneFactor); seqGraph.simplifyGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), pruneFactor); // if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef // might blow up because there's no reference source node if ( seqGraph.vertexSet().size() == 1 ) return seqGraph; seqGraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), PRUNE_FACTOR); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), pruneFactor); return seqGraph; } @@ -295,7 +285,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { continue; } - graph.printGraph(graphWriter, false, PRUNE_FACTOR); + graph.printGraph(graphWriter, false, pruneFactor); if ( 
debugGraphTransformations ) break; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 81ff3dfbd..5849b5a0e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -171,6 +171,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png * */ + @Advanced @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) protected StingSAMFileWriter bamWriter = null; private HaplotypeBAMWriter haplotypeBAMWriter; @@ -178,12 +179,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem /** * The type of BAM output we want to see. */ + @Advanced @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. 
*/ + @Advanced @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; @@ -191,6 +194,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; + @Advanced @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; @@ -217,6 +221,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) protected boolean includeUnmappedReads = false; + @Advanced @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) protected boolean USE_ALLELES_TRIGGER = false; @@ -232,6 +237,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false) protected boolean dontGenotype = false; + @Hidden + @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) + protected boolean errorCorrectKmers = false; + /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. * dbSNP is not used in any way for the calculations themselves. @@ -246,6 +255,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * Records that are filtered in the comp track will be ignored. * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). */ + @Advanced @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) public List> comps = Collections.emptyList(); public List> getCompRodBindings() { return comps; } @@ -258,6 +268,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem /** * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations. */ + @Advanced @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) protected List annotationsToUse = new ArrayList(Arrays.asList(new String[]{"ClippingRankSumTest"})); @@ -265,6 +276,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, * so annotations will be excluded even if they are explicitly included with the other options. 
*/ + @Advanced @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); @@ -277,12 +289,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @ArgumentCollection private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); + @Advanced @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; + @Advanced @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) protected int debugGraphTransformations = -1; + @Hidden @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) protected boolean useLowQualityBasesForAssembly = false; @@ -388,8 +403,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); } - final byte minBaseQualityToUseInAssembly = useLowQualityBasesForAssembly ? 
(byte)1 : DeBruijnAssembler.DEFAULT_MIN_BASE_QUALITY_TO_USE; - assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, graphWriter, minKmer, minBaseQualityToUseInAssembly ); + // setup the assembler + assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, minKmer); + assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); + if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); + if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); @@ -520,7 +539,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! 
final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 3efa342b1..c31405872 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -51,6 +51,7 @@ import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.variant.variantcontext.VariantContext; +import java.io.PrintStream; import java.util.List; /** @@ -59,13 +60,46 @@ import java.util.List; * Date: Mar 14, 2011 */ public abstract class LocalAssemblyEngine { + public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; - public enum ASSEMBLER { - SIMPLE_DE_BRUIJN + protected PrintStream graphWriter = null; + protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE; + protected int pruneFactor = 2; + protected boolean errorCorrectKmers = false; + + protected LocalAssemblyEngine() { } + + public int getPruneFactor() { + return pruneFactor; } - protected LocalAssemblyEngine() { + public void setPruneFactor(int pruneFactor) { + this.pruneFactor = pruneFactor; } - public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, List activeAllelesToGenotype); + public boolean shouldErrorCorrectKmers() { + return errorCorrectKmers; + } + + public void setErrorCorrectKmers(boolean errorCorrectKmers) { + this.errorCorrectKmers = errorCorrectKmers; + } + + public PrintStream getGraphWriter() { + return graphWriter; + } + + public void setGraphWriter(PrintStream 
graphWriter) { + this.graphWriter = graphWriter; + } + + public byte getMinBaseQualityToUseInAssembly() { + return minBaseQualityToUseInAssembly; + } + + public void setMinBaseQualityToUseInAssembly(byte minBaseQualityToUseInAssembly) { + this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly; + } + + public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List activeAllelesToGenotype); } From 1917d55dc228f450cabd669b1038e8dce861584f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Mar 2013 22:58:25 -0400 Subject: [PATCH 103/226] Bugfix for DeBruijnAssembler: don't fail when read length > haplotype length -- The previous version would generate graphs that had no reference bases at all in the situation where the reference haplotype was < the longer read length, which would cause the kmer size to exceed the reference haplotype length. Now return immediately with a null graph when this occurs as opposed to continuing and eventually causing an error --- .../gatk/walkers/haplotypecaller/DeBruijnAssembler.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index c219fab00..6343d79ef 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -216,11 +216,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) protected DeBruijnGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnGraph graph = new DeBruijnGraph(KMER_LENGTH); // 
First pull kmers from the reference haplotype and add them to the graph - //logger.info("Adding reference sequence to graph " + refHaplotype.getBaseString()); final byte[] refSequence = refHaplotype.getBases(); if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; @@ -232,12 +230,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return null; } } + } else { + // not enough reference sequence to build a kmer graph of this length, return null + return null; } // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { - //if ( ! read.getReadName().equals("H06JUADXX130110:1:1213:15422:11590")) continue; - //logger.info("Adding read " + read + " with sequence " + read.getReadString()); final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced From 2472828e1c464f43d59beca9c7fe26ad4a533a2b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Mar 2013 23:42:44 -0400 Subject: [PATCH 104/226] HC bug fixes: no longer create reference graphs with cycles -- Though not intended, it was possible to create reference graphs with cycles in the case where you started the graph with a homopolymer of length > the kmer. The previous test would fail to catch this case. Now its not possible -- Lots of code cleanup and refactoring in this push. Split the monolithic createGraphFromSequences into simple calls to addReferenceKmersToGraph and addReadKmersToGraph which themselves share lower level functions like addKmerPairFromSeqToGraph. -- Fix performance problem with reduced reads and the HC, where we were calling add kmer pair for each count in the reduced read, instead of just calling it once with a multiplicity of count. 
-- Refactor addKmersToGraph() to use things like addOrUpdateEdge, now the code is very clear --- .../haplotypecaller/DeBruijnAssembler.java | 108 ++++++++++++------ .../haplotypecaller/DeBruijnGraph.java | 39 ++++--- .../DeBruijnAssemblerUnitTest.java | 4 +- 3 files changed, 99 insertions(+), 52 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 6343d79ef..9c6c255df 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -154,7 +154,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { continue; if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); - DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, debug); + DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype); if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object // do a series of steps to clean up the raw assembly graph to make it analysis-ready if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); @@ -189,10 +189,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { seqGraph.simplifyGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), pruneFactor); - // if we've assembled just to the reference, just leave now otherwise removePathsNotConnectedToRef - // might blow up because there's no reference source node - if ( seqGraph.vertexSet().size() == 1 ) - return seqGraph; + // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. 
Can + // happen in cases where for example the reference somehow manages to acquire a cycle, or + // where the entire assembly collapses back into the reference sequence. + if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null ) + return null; + seqGraph.removePathsNotConnectedToRef(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), pruneFactor); @@ -214,64 +216,102 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } - @Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"}) - protected DeBruijnGraph createGraphFromSequences( final List reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) { - final DeBruijnGraph graph = new DeBruijnGraph(KMER_LENGTH); + @Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"}) + protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype ) { + final DeBruijnGraph graph = new DeBruijnGraph(kmerLength); // First pull kmers from the reference haplotype and add them to the graph - final byte[] refSequence = refHaplotype.getBases(); - if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) { - final int kmersInSequence = refSequence.length - KMER_LENGTH + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true, 1) ) { - if( DEBUG ) { - System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping"); - } - return null; - } - } - } else { - // not enough reference sequence to build a kmer graph of this length, return null + if ( ! 
addReferenceKmersToGraph(graph, refHaplotype.getBases()) ) + // something went wrong, so abort right now with a null graph return null; - } + + // now go through the graph already seeded with the reference sequence and add the read kmers to it + if ( ! addReadKmersToGraph(graph, reads) ) + // some problem was detected adding the reads to the graph, return null to indicate we failed + return null; + + graph.cleanNonRefPaths(); + return graph; + } + + /** + * Add the high-quality kmers from the reads to the graph + * + * @param graph a graph to add the read kmers to + * @param reads a non-null list of reads whose kmers we want to add to the graph + * @return true if we successfully added the read kmers to the graph without corrupting it in some way + */ + protected boolean addReadKmersToGraph(final DeBruijnGraph graph, final List reads) { + final int kmerLength = graph.getKmerSize(); // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced - if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { - final int kmersInSequence = sequence.length - KMER_LENGTH + 1; + if( sequence.length > kmerLength + KMER_OVERLAP ) { + final int kmersInSequence = sequence.length - kmerLength + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { // if the qualities of all the bases in the kmers are high enough boolean badKmer = false; - for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) { + for( int jjj = iii; jjj < iii + kmerLength + 1; jjj++) { if( qualities[jjj] < minBaseQualityToUseInAssembly ) { badKmer = true; break; } } if( !badKmer ) { + // how many observations of this kmer have we seen? 
A normal read counts for 1, but + // a reduced read might imply a higher multiplicity for our the edge int countNumber = 1; if( read.isReducedRead() ) { // compute mean number of reduced read counts in current kmer span // precise rounding can make a difference with low consensus counts - countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + KMER_LENGTH)); + // TODO -- optimization: should extend arrayMax function to take start stop values + countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + kmerLength)); } - final byte[] kmer1 = Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH); - final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH); - - for( int kkk=0; kkk < countNumber; kkk++ ) { - graph.addKmersToGraph(kmer1, kmer2, false, 1); - } + graph.addKmerPairFromSeqToGraph(sequence, iii, false, countNumber); } } } } - graph.cleanNonRefPaths(); - return graph; + // always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way + return true; + } + + /** + * Add the kmers from the reference sequence to the DeBruijnGraph + * + * @param graph the graph to add the reference kmers to. 
Must be empty + * @param refSequence the reference sequence from which we'll get our kmers + * @return true if we succeeded in creating a good graph from the reference sequence, false otherwise + */ + protected boolean addReferenceKmersToGraph(final DeBruijnGraph graph, final byte[] refSequence) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( graph.vertexSet().size() != 0 ) throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + graph.vertexSet().size() + " vertices in it already: " + graph); + if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null"); + + + final int kmerLength = graph.getKmerSize(); + if( refSequence.length < kmerLength + KMER_OVERLAP ) { + // not enough reference sequence to build a kmer graph of this length, return null + return false; + } + + final int kmersInSequence = refSequence.length - kmerLength + 1; + for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + graph.addKmerPairFromSeqToGraph(refSequence, iii, true, 1); + } + + // we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices + if ( graph.vertexSet().size() != kmersInSequence ) { + if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping"); + return false; + } + + return true; } protected void printGraphs(final List graphs) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java index 0e20c311b..fd8581254 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java @@ -126,30 +126,37 @@ public class DeBruijnGraph extends BaseGraph { * @param kmer1 the source kmer 
for the edge * @param kmer2 the target kmer for the edge * @param isRef true if the added edge is a reference edge - * @return will return false if trying to add a reference edge which creates a cycle in the assembly graph */ - public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) { + public void addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) { if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); } if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); } - final int numVertexBefore = vertexSet().size(); final DeBruijnVertex v1 = new DeBruijnVertex( kmer1 ); - addVertex(v1); final DeBruijnVertex v2 = new DeBruijnVertex( kmer2 ); - addVertex(v2); - if( isRef && vertexSet().size() == numVertexBefore ) { return false; } + final BaseEdge toAdd = new BaseEdge(isRef, multiplicity); - final BaseEdge targetEdge = getEdge(v1, v2); - if ( targetEdge == null ) { - addEdge(v1, v2, new BaseEdge( isRef, multiplicity )); - } else { - if( isRef ) { - targetEdge.setIsRef( true ); - } - targetEdge.setMultiplicity(targetEdge.getMultiplicity() + multiplicity); - } - return true; + addVertices(v1, v2); + addOrUpdateEdge(v1, v2, toAdd); + } + + /** + * Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this + * graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). 
The + * edge between them is added with isRef and multiplicity + * + * @param sequence a sequence of bases from which we want to extract a pair of kmers + * @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize() + * @param isRef should the edge between the two kmers be a reference edge? + * @param multiplicity what's the multiplicity of the edge between these two kmers + */ + public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final boolean isRef, final int multiplicity ) { + if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); + if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start); + if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length); + final byte[] kmer1 = Arrays.copyOfRange(sequence, start, start + getKmerSize()); + final byte[] kmer2 = Arrays.copyOfRange(sequence, start + 1, start + 1 + getKmerSize()); + addKmersToGraph(kmer1, kmer2, isRef, multiplicity); } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index 663d619a8..cce623b76 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -72,8 +72,8 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { public void testReferenceCycleGraph() { String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC"; String noCycle = 
"ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC"; - final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), false); - final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), false); + final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true)); + final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true)); Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation."); Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation."); From ad04fdb23313ae9439887d829b236fde18c87c84 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 24 Mar 2013 14:41:21 -0400 Subject: [PATCH 105/226] PerReadAlleleLikelihoodMap getMostLikelyAllele returns an MostLikelyAllele objects now -- This new functionality allows the client to make decisions about how to handle non-informative reads, rather than having a single enforced constant that isn't really appropriate for all users. The previous functionality is maintained now and used by all of the updated pieces of code, except the BAM writers, which now emit reads to display to their best allele, regardless of whether this is particularly informative or not. That way you can see all of your data realigned to the new HC structure, rather than just those that are specifically informative. -- This all makes me concerned that the informative thresholding isn't appropriately used in the annotations themselves. There are many cases where nearby variation makes specific reads non-informative about one event, due to not being informative about the second. 
For example, suppose you have two SNPs A/B and C/D that are in the same active region but separated by more than the read length of the reads. All reads would be non-informative as no read provides information about the full combination of 4 haplotypes, as they reads only span a single event. In this case our annotations will all fall apart, returning their default values. Added a JIRA to address this (should be discussed in group meeting) --- .../annotator/BaseQualityRankSumTest.java | 13 +- .../annotator/ClippingRankSumTest.java | 9 +- .../annotator/DepthPerAlleleBySample.java | 9 +- .../gatk/walkers/annotator/FisherStrand.java | 5 +- .../annotator/MappingQualityRankSumTest.java | 9 +- .../walkers/annotator/ReadPosRankSumTest.java | 9 +- .../genotyper/MostLikelyAlleleUnitTest.java | 114 ++++++++++++++++ .../utils/genotyper/MostLikelyAllele.java | 127 ++++++++++++++++++ .../genotyper/PerReadAlleleLikelihoodMap.java | 30 ++--- .../AllHaplotypeBAMWriter.java | 52 +++---- .../CalledHaplotypeBAMWriter.java | 52 +++---- 11 files changed, 333 insertions(+), 96 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 04f9e87c7..a3a9e50e9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import 
org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -90,13 +91,13 @@ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnot } for (Map el : alleleLikelihoodMap.getLikelihoodMapValues()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el); + if (! a.isInformative()) continue; // read is non-informative - if (a.isReference()) - refQuals.add(-10.0*(double)el.get(a)); - else if (allAlleles.contains(a)) - altQuals.add(-10.0*(double)el.get(a)); + if (a.getMostLikelyAllele().isReference()) + refQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele())); + else if (allAlleles.contains(a.getMostLikelyAllele())) + altQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele())); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index 90ca5c667..366512119 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -84,12 +85,12 @@ public class ClippingRankSumTest extends RankSumTest { for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if (a.isNoCall()) + final MostLikelyAllele a = 
PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (! a.isInformative()) continue; // read is non-informative - if (a.isReference()) + if (a.getMostLikelyAllele().isReference()) refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); - else if (allAlleles.contains(a)) + else if (allAlleles.contains(a.getMostLikelyAllele())) altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 9f90a1308..1cf91f181 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; @@ -141,12 +142,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa } for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { final GATKSAMRecord read = el.getKey(); - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (! 
a.isInformative() ) continue; // read is non-informative - if (!vc.getAlleles().contains(a)) + if (!vc.getAlleles().contains(a.getMostLikelyAllele())) continue; // sanity check - shouldn't be needed - alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); + alleleCounts.put(a.getMostLikelyAllele(), alleleCounts.get(a.getMostLikelyAllele()) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); } final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 7960a3ce2..e4dcaec54 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -273,10 +274,10 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el 
: maps.getLikelihoodReadMap().entrySet()) { - final Allele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - updateTable(table, mostLikelyAllele, read, ref, alt, representativeCount); + updateTable(table, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index b30df04a8..3873138a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; @@ -92,13 +93,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn return; } for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); // BUGBUG: There needs to be a comparable isUsableBase check here - if (a.isNoCall()) + if (! 
a.isInformative()) continue; // read is non-informative - if (a.isReference()) + if (a.getMostLikelyAllele().isReference()) refQuals.add((double)el.getKey().getMappingQuality()); - else if (allAlleles.contains(a)) + else if (allAlleles.contains(a.getMostLikelyAllele())) altQuals.add((double)el.getKey().getMappingQuality()); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 182a9226f..6ce4aab49 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -51,6 +51,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -107,8 +108,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio } for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if (a.isNoCall()) + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (! 
a.isInformative() ) continue; // read is non-informative final GATKSAMRecord read = el.getKey(); @@ -123,9 +124,9 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio if (readPos > numAlignedBases / 2) readPos = numAlignedBases - (readPos + 1); - if (a.isReference()) + if (a.getMostLikelyAllele().isReference()) refQuals.add((double)readPos); - else if (allAlleles.contains(a)) + else if (allAlleles.contains(a.getMostLikelyAllele())) altQuals.add((double)readPos); } } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java new file mode 100644 index 000000000..cf077392b --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java @@ -0,0 +1,114 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.genotyper; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.variant.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class MostLikelyAlleleUnitTest extends BaseTest { + final Allele a = Allele.create("A"); + + @Test + public void testBasicCreation() { + final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1; + MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); + Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second); + + Assert.assertEquals(mla.isInformative(), true); + Assert.assertEquals(mla.isInformative(10), false); + Assert.assertEquals(mla.isInformative(0), true); + Assert.assertEquals(mla.getAlleleIfInformative(), a); + Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(0), a); + } + + @Test + public void testNotDefaultInformative() { + final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2); + MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + Assert.assertEquals(mla.isInformative(), false); + Assert.assertEquals(mla.isInformative(10), false); + Assert.assertEquals(mla.isInformative(0), true); + Assert.assertEquals(mla.getAlleleIfInformative(), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(0), a); + } + + @Test + public void testCreationNoGoodSecond() { + MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, Double.NEGATIVE_INFINITY); + Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); + Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); + + 
Assert.assertEquals(mla.isInformative(), true); + Assert.assertEquals(mla.isInformative(10), true); + Assert.assertEquals(mla.isInformative(0), true); + Assert.assertEquals(mla.getAlleleIfInformative(), a); + Assert.assertEquals(mla.getAlleleIfInformative(10), a); + Assert.assertEquals(mla.getAlleleIfInformative(0), a); + } + + @Test + public void testCreationNoAllele() { + MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY); + Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL); + Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY); + Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); + + Assert.assertEquals(mla.isInformative(), false); + Assert.assertEquals(mla.isInformative(10), false); + Assert.assertEquals(mla.isInformative(0), false); + Assert.assertEquals(mla.getAlleleIfInformative(), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(10), Allele.NO_CALL); + Assert.assertEquals(mla.getAlleleIfInformative(0), Allele.NO_CALL); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java new file mode 100644 index 000000000..e12fb546c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java @@ -0,0 +1,127 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this 
permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.genotyper; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.Allele; + +/** + * Stores the most likely and second most likely alleles, along with a threshold + * for assuming computing that a read is informative. + * + * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD + * then the most likely allele is set to "no call", and isInformative will return false. This constant can be + * overridden simply by using one of the version of these calls that accepts informative threshold as an argument. + * + * For convenience, there are functions called getAlleleIfInformative that return either the most likely allele, or + * NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD of one another. 
+ * + * By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the + * corresponding key + * + * User: depristo + * Date: 3/24/13 + * Time: 1:39 PM + */ +public final class MostLikelyAllele { + public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2; + + final Allele mostLikely; + final double log10LikelihoodOfMostLikely; + final double log10LikelihoodOfSecondBest; + + /** + * Create a new MostLikelyAllele + * + * If there's a meaningful most likely allele, allele should be a real allele. If none can be determined, + * mostLikely should be a NO_CALL allele. + * + * @param mostLikely the most likely allele + * @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele + * @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available) + */ + public MostLikelyAllele(Allele mostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { + if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null"); + if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) ) + throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely); + if ( log10LikelihoodOfSecondBest != Double.NEGATIVE_INFINITY && ! 
MathUtils.goodLog10Probability(log10LikelihoodOfSecondBest) ) + throw new IllegalArgumentException("log10LikelihoodOfSecondBest must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfSecondBest); + if ( log10LikelihoodOfMostLikely < log10LikelihoodOfSecondBest ) + throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest); + + this.mostLikely = mostLikely; + this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely; + this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest; + } + + public Allele getMostLikelyAllele() { + return mostLikely; + } + + public double getLog10LikelihoodOfMostLikely() { + return log10LikelihoodOfMostLikely; + } + + public double getLog10LikelihoodOfSecondBest() { + return log10LikelihoodOfSecondBest; + } + + /** + * @see #isInformative(double) with threshold of INFORMATIVE_LIKELIHOOD_THRESHOLD + */ + public boolean isInformative() { + return isInformative(INFORMATIVE_LIKELIHOOD_THRESHOLD); + } + + /** + * Was this allele selected from an object that was specifically informative about the allele? 
+ * + * The calculation that implements this is whether the likelihood of the most likely allele is larger + * than the second most likely by at least the log10ThresholdForInformative + * + * @return true if so, false if not + */ + public boolean isInformative(final double log10ThresholdForInformative) { + return getLog10LikelihoodOfMostLikely() - getLog10LikelihoodOfSecondBest() > log10ThresholdForInformative; + } + + /** + * @see #getAlleleIfInformative(double) with threshold of INFORMATIVE_LIKELIHOOD_THRESHOLD + */ + public Allele getAlleleIfInformative() { + return getAlleleIfInformative(INFORMATIVE_LIKELIHOOD_THRESHOLD); + } + + /** + * Get the most likely allele if isInformative(log10ThresholdForInformative) is true, or NO_CALL otherwise + * + * @param log10ThresholdForInformative a log10 threshold to determine if the most likely allele was informative + * @return a non-null allele + */ + public Allele getAlleleIfInformative(final double log10ThresholdForInformative) { + return isInformative(log10ThresholdForInformative) ? getMostLikelyAllele() : Allele.NO_CALL; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 5e010db67..02618100d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -41,10 +41,6 @@ import java.util.*; * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. 
*/ public class PerReadAlleleLikelihoodMap { - - - public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2; - protected List alleles; protected Map> likelihoodReadMap; @@ -119,9 +115,9 @@ public class PerReadAlleleLikelihoodMap { for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { // do not remove reduced reads! if ( !entry.getKey().isReducedRead() ) { - final Allele bestAllele = getMostLikelyAllele(entry.getValue()); - if ( bestAllele != Allele.NO_CALL ) - alleleReadMap.get(bestAllele).add(entry.getKey()); + final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); + if ( bestAllele.isInformative() ) + alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey()); } } @@ -194,32 +190,25 @@ public class PerReadAlleleLikelihoodMap { /** * Given a map from alleles to likelihoods, find the allele with the largest likelihood. - * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD - * then the most likely allele is set to "no call" + * * @param alleleMap - a map from alleles to likelihoods - * @return - the most likely allele, or NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD - * of one another. By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the - * corresponding key + * @return - a MostLikelyAllele object */ @Ensures("result != null") - public static Allele getMostLikelyAllele( final Map alleleMap ) { + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap ) { return getMostLikelyAllele(alleleMap, null); } /** * Given a map from alleles to likelihoods, find the allele with the largest likelihood. 
- * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD - * then the most likely allele is set to "no call" * * @param alleleMap - a map from alleles to likelihoods * @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best. * this is useful for the case where you've selected a subset of the alleles that * the reads have been computed for further analysis. If null totally ignored - * @return - the most likely allele, or NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD - * of one another. By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the - * corresponding key + * @return - a MostLikelyAllele object */ - public static Allele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null"); double maxLike = Double.NEGATIVE_INFINITY; double prevMaxLike = Double.NEGATIVE_INFINITY; @@ -237,7 +226,8 @@ public class PerReadAlleleLikelihoodMap { prevMaxLike = el.getValue(); } } - return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? 
mostLikelyAllele : Allele.NO_CALL ); + + return new MostLikelyAllele(mostLikelyAllele, maxLike, prevMaxLike); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java index 46ffd43b6..f6fa44ac5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java @@ -1,27 +1,27 @@ /* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ package org.broadinstitute.sting.utils.haplotypeBAMWriter; @@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.variant.variantcontext.Allele; @@ -71,9 +72,8 @@ class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { // next, output the interesting reads for each sample aligned against the appropriate haplotype for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); - if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); + final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart()); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java index a33ed809a..aae00c3ea 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -1,33 +1,34 @@ /* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this 
software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ package org.broadinstitute.sting.utils.haplotypeBAMWriter; import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.variant.variantcontext.Allele; @@ -77,9 +78,8 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { if ( entry.getKey().getMappingQuality() > 0 ) { - final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); - if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); + final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart()); } } } From 12475cc0276b27d4cf890779b96716c2f7113754 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 24 Mar 2013 14:42:48 -0400 Subject: [PATCH 106/226] Display the active MappingQualityFilter if mmq > 0 in the HaplotypeCaller --- .../HCMappingQualityFilter.java | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java index 3892ffe27..21b66986a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCMappingQualityFilter.java @@ -1,32 +1,34 @@ /* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.ReadFilter; /** @@ -35,9 +37,17 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter; * @author mdepristo */ public class HCMappingQualityFilter extends ReadFilter { + private final static Logger logger = Logger.getLogger(HCMappingQualityFilter.class); + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for analysis with the HaplotypeCaller", required = false) public int MIN_MAPPING_QUALTY_SCORE = 20; + @Override + public void initialize(GenomeAnalysisEngine engine) { + if ( MIN_MAPPING_QUALTY_SCORE > 0 ) + logger.info("Filtering out reads with MAPQ < " + MIN_MAPPING_QUALTY_SCORE); + } + public boolean filterOut(SAMRecord rec) { return (rec.getMappingQuality() < MIN_MAPPING_QUALTY_SCORE); } From 78c672676b25b46a6a302703a7e77afb3f0ff996 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 24 Mar 2013 17:20:30 -0400 Subject: [PATCH 107/226] Bugfix for pruning and removing non-reference edges in graph -- Previous algorithms were applying pruneGraph inappropriately on the raw sequence graph (where each vertex is a single base). This results in overpruning of the graph, as prunegraph really relied on the zipping of linear chains (and the sharing of weight this provides) to avoid over-pruning the graph. Probably we should think hard about this. This commit fixes this logic, so we zip the graph between pruning -- In this process ID's a fundamental problem with how we were trimming away vertices that occur on a path from the reference source to sink. 
In fact, we were leaving in any vertex that happened to be accessible from source, any vertices in cycles, and any vertex that wasn't the absolute end of a chain going to a sink. The new algorithm fixes all of this, using a BaseGraphIterator that's a general approach to walking the base graph. Other routines that use the same traversal idiom refactored to use this iterator. Added unit tests for all of these capabilities. -- Created new BaseGraphIterator, which abstracts common access patterns to graph, and use this where appropriate --- .../walkers/haplotypecaller/BaseGraph.java | 128 ++++++++++-------- .../haplotypecaller/BaseGraphIterator.java | 120 ++++++++++++++++ .../haplotypecaller/DeBruijnAssembler.java | 15 +- .../haplotypecaller/BaseGraphUnitTest.java | 86 ++++++++++-- 4 files changed, 279 insertions(+), 70 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphIterator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index c3f371ec7..80e5148db 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -336,9 +336,18 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph PRUNE_FACTOR ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? 
"style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); -// } if( edge.isRef() ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); } - //if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); } } for( final T v : vertexSet() ) { @@ -436,6 +441,30 @@ public class BaseGraph extends DefaultDirectedGraph edgesToRemove = new ArrayList(); + for( final BaseEdge e : edgeSet() ) { + if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor + edgesToRemove.add(e); + } + } + removeAllEdges(edgesToRemove); + + removeSingletonOrphanVertices(); + } + + /** + * Remove all vertices in the graph that have in and out degree of 0 + */ + protected void removeSingletonOrphanVertices() { // Run through the graph and clean up singular orphaned nodes final List verticesToRemove = new LinkedList(); for( final T v : vertexSet() ) { @@ -446,65 +475,54 @@ public class BaseGraph extends DefaultDirectedGraph toRemove = new HashSet(vertexSet()); + + final T refV = getReferenceSourceVertex(); + if ( refV != null ) { + for ( final T v : new BaseGraphIterator(this, refV, true, true) ) { + toRemove.remove(v); + } + } + + removeAllVertices(toRemove); + } + + /** + * Remove all vertices in the graph that aren't on a path from the reference source vertex to the reference sink vertex + * + * More aggressive reference pruning algorithm than removeVerticesNotConnectedToRefRegardlessOfEdgeDirection, + * as it requires vertices to not only be connected by a series of directed edges but also prunes away + * paths that do not also meet eventually with the reference sink vertex + */ protected void removePathsNotConnectedToRef() { if ( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) { throw new IllegalStateException("Graph must have ref source and sink 
vertices"); } + // get the set of vertices we can reach by going forward from the ref source + final Set onPathFromRefSource = new HashSet(vertexSet().size()); + for ( final T v : new BaseGraphIterator(this, getReferenceSourceVertex(), false, true) ) { + onPathFromRefSource.add(v); + } + + // get the set of vertices we can reach by going backward from the ref sink + final Set onPathFromRefSink = new HashSet(vertexSet().size()); + for ( final T v : new BaseGraphIterator(this, getReferenceSinkVertex(), true, false) ) { + onPathFromRefSink.add(v); + } + + // we want to remove anything that's not in both the sink and source sets final Set verticesToRemove = new HashSet(vertexSet()); - final DepthFirstIterator dfi = new DepthFirstIterator(this, getReferenceSourceVertex()); - while ( dfi.hasNext() ) { - final T accessibleFromRefSource = dfi.next(); - // we also want to prune all sinks that aren't the reference sink - if ( ! isNonRefSink(accessibleFromRefSource) ) - verticesToRemove.remove(accessibleFromRefSource); - } - + onPathFromRefSource.retainAll(onPathFromRefSink); + verticesToRemove.removeAll(onPathFromRefSource); removeAllVertices(verticesToRemove); } - protected void pruneGraph( final int pruneFactor ) { - final List edgesToRemove = new ArrayList(); - for( final BaseEdge e : edgeSet() ) { - if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor - edgesToRemove.add(e); - } - } - removeAllEdges(edgesToRemove); - - // Run through the graph and clean up singular orphaned nodes - final List verticesToRemove = new ArrayList(); - for( final T v : vertexSet() ) { - if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { - verticesToRemove.add(v); - } - } - - removeAllVertices(verticesToRemove); - } - - public void removeVerticesNotConnectedToRef() { - final HashSet toRemove = new HashSet(vertexSet()); - final HashSet visited = new HashSet(); - - final LinkedList toVisit = new LinkedList(); - 
final T refV = getReferenceSourceVertex(); - if ( refV != null ) { - toVisit.add(refV); - while ( ! toVisit.isEmpty() ) { - final T v = toVisit.pop(); - if ( ! visited.contains(v) ) { - toRemove.remove(v); - visited.add(v); - for ( final T prev : incomingVerticesOf(v) ) toVisit.add(prev); - for ( final T next : outgoingVerticesOf(v) ) toVisit.add(next); - } - } - - removeAllVertices(toRemove); - } - } - /** * Semi-lenient comparison of two graphs, truing true if g1 and g2 have similar structure * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphIterator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphIterator.java new file mode 100644 index 000000000..8841f835c --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphIterator.java @@ -0,0 +1,120 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; + +/** + * General iterator that can iterate over all vertices in a BaseGraph, following either + * incoming, outgoing edge (as well as both or none) edges. Supports traversal of graphs + * with cycles and other crazy structures. Will only ever visit each vertex once. The + * order in which the vertices are visited is undefined. + * + * User: depristo + * Date: 3/24/13 + * Time: 4:41 PM + */ +public class BaseGraphIterator implements Iterator, Iterable { + final HashSet visited = new HashSet(); + final LinkedList toVisit = new LinkedList(); + final BaseGraph graph; + final boolean followIncomingEdges, followOutgoingEdges; + + /** + * Create a new BaseGraphIterator starting its traversal at start + * + * Note that if both followIncomingEdges and followOutgoingEdges are false, we simply return the + * start vertex + * + * @param graph the graph to iterator over. Cannot be null + * @param start the vertex to start at. Cannot be null + * @param followIncomingEdges should we follow incoming edges during our + * traversal? (goes backward through the graph) + * @param followOutgoingEdges should we follow outgoing edges during out traversal? + */ + public BaseGraphIterator(final BaseGraph graph, final T start, + final boolean followIncomingEdges, final boolean followOutgoingEdges) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( start == null ) throw new IllegalArgumentException("start cannot be null"); + if ( ! 
graph.containsVertex(start) ) throw new IllegalArgumentException("start " + start + " must be in graph but it isn't"); + this.graph = graph; + this.followIncomingEdges = followIncomingEdges; + this.followOutgoingEdges = followOutgoingEdges; + + toVisit.add(start); + } + + @Override + public Iterator iterator() { + return this; + } + + @Override + public boolean hasNext() { + return ! toVisit.isEmpty(); + } + + @Override + public T next() { + final T v = toVisit.pop(); + + if ( ! visited.contains(v) ) { + visited.add(v); + if ( followIncomingEdges ) for ( final T prev : graph.incomingVerticesOf(v) ) toVisit.add(prev); + if ( followOutgoingEdges ) for ( final T next : graph.outgoingVerticesOf(v) ) toVisit.add(next); + } + + return v; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Doesn't implement remove"); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 9c6c255df..e9961519c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -183,11 +183,18 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) { final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + + // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive + seqGraph.zipLinearChains(); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); + + // now go through and prune the graph, removing vertices no longer connected to the reference chain seqGraph.pruneGraph(pruneFactor); 
- seqGraph.removeVerticesNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.pruned.dot"), pruneFactor); + seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); + + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor); seqGraph.simplifyGraph(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.merged.dot"), pruneFactor); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor); // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can // happen in cases where for example the reference somehow manages to acquire a cycle, or @@ -196,7 +203,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return null; seqGraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.refcleaned.dot"), pruneFactor); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.refcleaned.dot"), pruneFactor); return seqGraph; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java index 463e861b1..db4127ddb 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java @@ -49,22 +49,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeMethod; -import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import scala.actors.threadpool.Arrays; import java.io.File; -import java.util.Collections; -import java.util.HashSet; -import 
java.util.Set; +import java.util.*; -/** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 3/15/13 - * Time: 3:36 PM - * To change this template use File | Settings | File Templates. - */ public class BaseGraphUnitTest extends BaseTest { SeqGraph graph; SeqVertex v1, v2, v3, v4, v5; @@ -105,6 +95,80 @@ public class BaseGraphUnitTest extends BaseTest { assertVertexSetEquals(graph.incomingVerticesOf(v5), v4); } + @Test + public void testRemoveSingletonOrphanVertices() throws Exception { + // all vertices in graph are connected + final List kept = new LinkedList(graph.vertexSet()); + final SeqVertex rm1 = new SeqVertex("CAGT"); + final SeqVertex rm2 = new SeqVertex("AGTC"); + graph.addVertices(rm1, rm2); + Assert.assertEquals(graph.vertexSet().size(), kept.size() + 2); + final BaseEdge rm12e = new BaseEdge(false, 1); + graph.addEdge(rm1, rm2, rm12e); + + final SeqGraph original = (SeqGraph)graph.clone(); + graph.removeSingletonOrphanVertices(); + Assert.assertTrue(BaseGraph.graphEquals(original, graph), "Graph with disconnected component but edges between components shouldn't be modified"); + + graph.removeEdge(rm12e); // now we should be able to remove rm1 and rm2 + graph.removeSingletonOrphanVertices(); + Assert.assertTrue(graph.vertexSet().containsAll(kept)); + Assert.assertFalse(graph.containsVertex(rm1)); + Assert.assertFalse(graph.containsVertex(rm2)); + } + + @Test + public void testRemovePathsNotConnectedToRef() throws Exception { + final SeqGraph graph = new SeqGraph(); + + SeqVertex src = new SeqVertex("A"); + SeqVertex end = new SeqVertex("A"); + SeqVertex g1 = new SeqVertex("C"); + SeqVertex g2 = new SeqVertex("G"); + SeqVertex g3 = new SeqVertex("T"); + SeqVertex g4 = new SeqVertex("AA"); + SeqVertex g5 = new SeqVertex("AA"); + SeqVertex g6 = new SeqVertex("AA"); + SeqVertex g8 = new SeqVertex("AA"); + SeqVertex g7 = new SeqVertex("AA"); + SeqVertex b1 = new SeqVertex("CC"); + SeqVertex b2 = new SeqVertex("GG"); + SeqVertex b3 = new 
SeqVertex("TT"); + SeqVertex b4 = new SeqVertex("AAA"); + SeqVertex b5 = new SeqVertex("CCC"); + SeqVertex b6 = new SeqVertex("GGG"); + SeqVertex b7 = new SeqVertex("AAAA"); + SeqVertex b8 = new SeqVertex("GGGG"); + SeqVertex b9 = new SeqVertex("CCCC"); + + graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8); + graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end); + graph.addEdges(src, g1, g5, g6, g7, end); + graph.addEdges(src, g1, g5, g8, g7, end); + graph.addEdges(src, g1, g3, end); + + // the current state of the graph is the good one + final SeqGraph good = (SeqGraph)graph.clone(); + + // now add the bads to the graph + graph.addVertices(b1, b2, b3, b4, b5, b6, b7, b8, b9); + graph.addEdges(src, b1); // source -> b1 is dead + graph.addEdges(b6, src); // x -> source is bad + graph.addEdges(g4, b2); // off random vertex is bad + graph.addEdges(g3, b3, b4); // two vertices that don't connect to end are bad + graph.addEdges(end, b5); // vertex off end is bad + graph.addEdges(g3, b7, b8, b7); // cycle is bad + graph.addEdges(g3, b9, b9); // self-cycle is bad + + final boolean debug = true; + if ( debug ) good.printGraph(new File("expected.dot"), 0); + if ( debug ) graph.printGraph(new File("bad.dot"), 0); + graph.removePathsNotConnectedToRef(); + if ( debug ) graph.printGraph(new File("actual.dot"), 0); + + Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); + } + @Test public void testPrintEmptyGraph() throws Exception { final File tmp = File.createTempFile("tmp", "dot"); From a97576384d7953109de2a02c71b49e978f075738 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Mar 2013 18:29:16 -0400 Subject: [PATCH 108/226] Fix bug in the HC not respecting the requested pruning --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 1 + 1 file changed, 1 insertion(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 5849b5a0e..7e2211502 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -406,6 +406,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // setup the assembler assemblyEngine = new DeBruijnAssembler( DEBUG, debugGraphTransformations, minKmer); assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); + assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); From b1b615b668777b63247be224f9ef696b065e01f9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Mar 2013 18:31:23 -0400 Subject: [PATCH 109/226] BaseGraph shouldn't implement getEdge -- no idea why I added this --- .../gatk/walkers/haplotypecaller/BaseGraph.java | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java index 80e5148db..9872a370b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java @@ -569,20 +569,6 @@ public class BaseGraph extends DefaultDirectedGraph Date: Mon, 25 Mar 2013 18:38:12 -0400 Subject: [PATCH 110/226] Increase max cigar elements from SW before failing path creation to 20 from 6 -- This allows more diversity in paths, which is sometimes necessary when we cannot simply graphs that have large bubbles --- .../sting/gatk/walkers/haplotypecaller/Path.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java index 4adfe6612..269adcc22 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java @@ -68,6 +68,8 @@ import java.util.*; * */ class Path { + private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20; + // the last vertex seen in the path private final T lastVertex; @@ -357,7 +359,7 @@ class Path { } final Cigar swCigar = swConsensus.getCigar(); - if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference + if( swCigar.numCigarElements() > MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW ) { // this bubble is too divergent from the reference returnCigar.add(new CigarElement(1, CigarOperator.N)); } else { for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) { @@ -399,9 +401,15 @@ class Path { */ public boolean equalScoreAndSequence(final Path other) { if ( other == null ) throw new IllegalArgumentException("other cannot be null"); + return getScore() == other.getScore() && equalSequence(other); + } - if ( getScore() != other.getScore() ) - return false; + /** + * Tests that this and other have the same vertices in the same order with the same seq + * @param other the other path to consider. 
Cannot be null + * @return true if this and path are equal, false otherwise + */ + public boolean equalSequence(final Path other) { final List mine = getVertices(); final List yours = other.getVertices(); if ( mine.size() == yours.size() ) { // hehehe From 66910b036c628e858defb8acde4ea26bfec31d0a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 25 Mar 2013 18:37:25 -0400 Subject: [PATCH 111/226] Added new and improved suffix and node merging algorithms -- These new algorithms are more powerful than the restricted diamond merging algoriths, in that they can merge nodes with multiple incoming and outgoing edges. Together the splitter + merger algorithms will correctly merge many more cases than the original headless and tailless diamond merger. -- Refactored haplotype caller infrastructure into graphs package, code cleanup -- Cleanup new merging / splitting algorithms, with proper docs and unit tests -- Fix bug in zipping of linear chains. Because the multiplicity can be 0, protect ourselves with a max function call -- Fix BaseEdge.max unit test -- Add docs and some more unit tests -- Move error correct from DeBruijnGraph to DeBruijnAssembler -- Replaced uses of System.out.println with logger.info -- Don't make multiplicity == 0 nodes look like they should be pruned -- Fix toString of Path --- .../haplotypecaller/DeBruijnAssembler.java | 49 ++++- .../haplotypecaller/GenotypingEngine.java | 20 +- .../haplotypecaller/HaplotypeCaller.java | 4 +- .../LikelihoodCalculationEngine.java | 6 +- .../{ => graphs}/BaseEdge.java | 6 +- .../{ => graphs}/BaseGraph.java | 25 +-- .../{ => graphs}/BaseGraphIterator.java | 2 +- .../{ => graphs}/BaseVertex.java | 2 +- .../graphs/CommonSuffixSplitter.java | 182 ++++++++++++++++++ .../{ => graphs}/DeBruijnGraph.java | 38 +--- .../{ => graphs}/DeBruijnVertex.java | 2 +- .../{ => graphs}/KBestPaths.java | 2 +- .../haplotypecaller/{ => graphs}/Path.java | 7 +- .../{ => graphs}/SeqGraph.java | 78 +++++++- .../{ => 
graphs}/SeqVertex.java | 3 +- .../graphs/SharedSequenceMerger.java | 138 +++++++++++++ .../SharedVertexSequenceSplitter.java | 47 +---- .../walkers/haplotypecaller/graphs/Utils.java | 138 +++++++++++++ .../DeBruijnAssemblerUnitTest.java | 1 + .../DeBruijnAssemblyGraphUnitTest.java | 1 + .../{ => graphs}/BaseEdgeUnitTest.java | 18 +- .../{ => graphs}/BaseGraphUnitTest.java | 61 +++++- .../{ => graphs}/BaseVertexUnitTest.java | 2 +- .../graphs/CommonSuffixMergerUnitTest.java | 160 +++++++++++++++ .../graphs/CommonSuffixSplitterUnitTest.java | 113 +++++++++++ .../{ => graphs}/DeBruijnVertexUnitTest.java | 2 +- .../{ => graphs}/KBestPathsUnitTest.java | 2 +- .../{ => graphs}/SeqGraphUnitTest.java | 48 ++--- .../{ => graphs}/SeqVertexUnitTest.java | 3 +- .../SharedVertexSequenceSplitterUnitTest.java | 6 +- 30 files changed, 1001 insertions(+), 165 deletions(-) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseEdge.java (98%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseGraph.java (97%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseGraphIterator.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseVertex.java (99%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/DeBruijnGraph.java (88%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/DeBruijnVertex.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/KBestPaths.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/Path.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ 
=> graphs}/SeqGraph.java (86%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqVertex.java (99%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SharedVertexSequenceSplitter.java (91%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseEdgeUnitTest.java (92%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseGraphUnitTest.java (86%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/BaseVertexUnitTest.java (99%) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/DeBruijnVertexUnitTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/KBestPathsUnitTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqGraphUnitTest.java (97%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SeqVertexUnitTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{ => graphs}/SharedVertexSequenceSplitterUnitTest.java (98%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 
e9961519c..198abeac8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -53,6 +53,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -160,7 +161,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); if ( shouldErrorCorrectKmers() ) { - graph = graph.errorCorrect(); + graph = errorCorrect(graph); if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor); } @@ -189,6 +190,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); // now go through and prune the graph, removing vertices no longer connected to the reference chain + // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight + // edges to maintain graph connectivity. 
seqGraph.pruneGraph(pruneFactor); seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); @@ -203,7 +206,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return null; seqGraph.removePathsNotConnectedToRef(); - if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.refcleaned.dot"), pruneFactor); + if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); return seqGraph; } @@ -321,6 +324,39 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return true; } + /** + * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers + * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason + * we cannot actually do the error correction + */ + public DeBruijnGraph errorCorrect(final DeBruijnGraph graph) { + final KMerErrorCorrector corrector = new KMerErrorCorrector(graph.getKmerSize(), 1, 1, 5); // TODO -- should be static variables + + for( final BaseEdge e : graph.edgeSet() ) { + for ( final byte[] kmer : Arrays.asList(graph.getEdgeSource(e).getSequence(), graph.getEdgeTarget(e).getSequence())) { + // TODO -- need a cleaner way to deal with the ref weight + corrector.addKmer(kmer, e.isRef() ? 
1000 : e.getMultiplicity()); + } + } + + if ( corrector.computeErrorCorrectionMap() ) { + final DeBruijnGraph correctedGraph = new DeBruijnGraph(graph.getKmerSize()); + + for( final BaseEdge e : graph.edgeSet() ) { + final byte[] source = corrector.getErrorCorrectedKmer(graph.getEdgeSource(e).getSequence()); + final byte[] target = corrector.getErrorCorrectedKmer(graph.getEdgeTarget(e).getSequence()); + if ( source != null && target != null ) { + correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); + } + } + + return correctedGraph; + } else { + // the error correction wasn't possible, simply return this graph + return graph; + } + } + protected void printGraphs(final List graphs) { final int writeFirstGraphWithSizeSmallerThan = 50; @@ -366,6 +402,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { for( final SeqGraph graph : graphs ) { for ( final Path path : new KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { +// logger.info("Found path " + path); Haplotype h = new Haplotype( path.getBases() ); if( !returnHaplotypes.contains(h) ) { final Cigar cigar = path.calculateCigar(); @@ -421,13 +458,13 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if( debug ) { if( returnHaplotypes.size() > 1 ) { - System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); + logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against."); } else { - System.out.println("Found only the reference haplotype in the assembly graph."); + logger.info("Found only the reference haplotype in the assembly graph."); } for( final Haplotype h : returnHaplotypes ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + 
h.getScore() ); + logger.info( h.toString() ); + logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() ); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index cc9d94b1b..0d6e29fe9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -51,6 +51,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; @@ -66,6 +67,7 @@ import java.io.PrintStream; import java.util.*; public class GenotypingEngine { + private final static Logger logger = Logger.getLogger(GenotypingEngine.class); private final boolean DEBUG; private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; @@ -168,15 +170,15 @@ public class GenotypingEngine { // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); int count = 0; - if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } + if( DEBUG ) { logger.info("=== Best Haplotypes ==="); } for( final Haplotype h : haplotypes ) { // Walk along the alignment and turn any difference from the reference into an event h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); if( !in_GGA_mode ) { 
startPosKeySet.addAll(h.getEventMap().keySet()); } if( DEBUG ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); - System.out.println( ">> Events = " + h.getEventMap()); + logger.info(h.toString()); + logger.info("> Cigar = " + h.getCigar()); + logger.info(">> Events = " + h.getEventMap()); } } @@ -261,7 +263,7 @@ public class GenotypingEngine { final Map> alleleMapper = createAlleleMapper(mergeMap, eventMapper); if( DEBUG ) { - System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); + logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); //System.out.println("Event/haplotype allele mapping = " + alleleMapper); } @@ -500,9 +502,9 @@ public class GenotypingEngine { if( isBiallelic ) { final double R2 = calculateR2LD( Math.pow(10.0, x11), Math.pow(10.0, x12), Math.pow(10.0, x21), Math.pow(10.0, x22) ); if( DEBUG ) { - System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2)); - System.out.println("-- " + thisVC); - System.out.println("-- " + nextVC); + logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2)); + logger.info("-- " + thisVC); + logger.info("-- " + nextVC); } if( R2 > MERGE_EVENTS_R2_THRESHOLD ) { @@ -528,7 +530,7 @@ public class GenotypingEngine { if(!containsStart) { startPosKeySet.remove(thisStart); } if(!containsNext) { startPosKeySet.remove(nextStart); } - if( DEBUG ) { System.out.println("====> " + mergedVC); } + if( DEBUG ) { logger.info("====> " + mergedVC); } mapWasUpdated = true; break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 7e2211502..ca105fe03 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -581,7 +581,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem haplotypeBAMWriter.writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, bestHaplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); } - if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } + if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } return 1; // One active region was processed during this map call } @@ -614,7 +614,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- private void finalizeActiveRegion( final ActiveRegion activeRegion ) { - if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } + if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } final List finalizedReadList = new ArrayList(); final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); activeRegion.clearReads(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 87b488b3e..51483c53f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -62,6 +63,7 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.*; public class LikelihoodCalculationEngine { + private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class); private static final double LOG_ONE_HALF = -Math.log10(2.0); private final byte constantGCP; @@ -256,14 +258,14 @@ public class LikelihoodCalculationEngine { } } if( maxElement == Double.NEGATIVE_INFINITY ) { break; } - if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } + if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY; if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); } if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); } } - if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } + if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } final List bestHaplotypes = new ArrayList(); for( final int hIndex : bestHaplotypesIndexList ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java similarity index 98% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java index 07a6629d7..6076626f6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import java.io.Serializable; import java.util.Collection; @@ -69,7 +69,7 @@ public class BaseEdge { * @param multiplicity the number of observations of this edge */ public BaseEdge(final boolean isRef, final int multiplicity) { - if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0"); + if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0 but got " + multiplicity); this.multiplicity = multiplicity; this.isRef = isRef; @@ -176,7 +176,7 @@ public class BaseEdge { } /** - * Return a new edge that the max of this and edge + * Return a new edge whose multiplicity is the max of this and edge, and isRef is or of this and edge * * isRef is simply the or of this and edge * multiplicity is the max diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java similarity index 97% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 9872a370b..1d294e591 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; @@ -53,7 +53,6 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DefaultDirectedGraph; -import org.jgrapht.traverse.DepthFirstIterator; import java.io.File; import java.io.FileNotFoundException; @@ -221,15 +220,6 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? 
"style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];"); if( edge.isRef() ) { graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); } @@ -414,7 +404,12 @@ public class BaseGraph extends DefaultDirectedGraph extends DefaultDirectedGraph edgesToRemove = new ArrayList(); for( final BaseEdge e : edgeSet() ) { if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor @@ -499,7 +494,7 @@ public class BaseGraph extends DefaultDirectedGraph toMerge = graph.incomingVerticesOf(v); + if ( toMerge.size() < 2 ) + // Can only split at least 2 vertices + return false; + else if ( ! safeToSplit(graph, v, toMerge) ) { + return false; + } else { + final SeqVertex suffixVTemplate = commonSuffix(toMerge); + if ( suffixVTemplate.isEmpty() ) { + return false; + } else { + final List edgesToRemove = new LinkedList(); + +// graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." 
+ counter + ".dot"), 0); + for ( final SeqVertex mid : toMerge ) { + // create my own copy of the suffix + final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence()); + graph.addVertex(suffixV); + final SeqVertex prefixV = mid.withoutSuffix(suffixV.getSequence()); + final BaseEdge out = graph.outgoingEdgeOf(mid); + + final SeqVertex incomingTarget; + if ( prefixV == null ) { + // this node is entirely explained by suffix + incomingTarget = suffixV; + } else { + incomingTarget = prefixV; + graph.addVertex(prefixV); + graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0)); + edgesToRemove.add(out); + } + + graph.addEdge(suffixV, graph.getEdgeTarget(out), new BaseEdge(out)); + + for ( final BaseEdge in : graph.incomingEdgesOf(mid) ) { + graph.addEdge(graph.getEdgeSource(in), incomingTarget, new BaseEdge(in)); + edgesToRemove.add(in); + } + } + + graph.removeAllVertices(toMerge); + graph.removeAllEdges(edgesToRemove); +// graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0); + + return true; + } + } + } + +// private static int counter = 0; + + /** + * Can we safely split up the vertices in toMerge? + * + * @param graph a graph + * @param bot a vertex whose incoming vertices we want to split + * @param toMerge the set of vertices we'd be splitting up + * @return true if we can safely split up toMerge + */ + private boolean safeToSplit(final SeqGraph graph, final SeqVertex bot, final Collection toMerge) { + for ( final SeqVertex m : toMerge ) { + final Set outs = graph.outgoingEdgesOf(m); + if ( m == bot || outs.size() != 1 || ! graph.outgoingVerticesOf(m).contains(bot) ) + // m == bot => don't allow cycles in the graph + return false; + } + + return true; + } + + /** + * Return the longest suffix of bases shared among all provided vertices + * + * For example, if the vertices have sequences AC, CC, and ATC, this would return + * a single C. However, for ACC and TCC this would return CC. 
And for AC and TG this + * would return null; + * + * @param middleVertices a non-empty set of vertices + * @return a single vertex that contains the common suffix of all middle vertices + */ + @Requires("!middleVertices.isEmpty()") + protected static SeqVertex commonSuffix(final Collection middleVertices) { + final List kmers = Utils.getKmers(middleVertices); + final int min = Utils.minKmerLength(kmers); + final int suffixLen = Utils.compSuffixLen(kmers, min); + final byte[] kmer = kmers.get(0); + final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length); + return new SeqVertex(suffix); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java similarity index 88% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java index fd8581254..109598029 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java @@ -44,9 +44,10 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerErrorCorrector; import java.util.Arrays; import java.util.HashMap; @@ -88,39 +89,6 @@ public class DeBruijnGraph extends BaseGraph { } } - /** - * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers - * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason - * we cannot actually do the error correction - */ - protected DeBruijnGraph errorCorrect() { - final KMerErrorCorrector corrector = new KMerErrorCorrector(getKmerSize(), 1, 1, 5); // TODO -- should be static variables - - for( final BaseEdge e : edgeSet() ) { - for ( final byte[] kmer : Arrays.asList(getEdgeSource(e).getSequence(), getEdgeTarget(e).getSequence())) { - // TODO -- need a cleaner way to deal with the ref weight - corrector.addKmer(kmer, e.isRef() ? 
1000 : e.getMultiplicity()); - } - } - - if ( corrector.computeErrorCorrectionMap() ) { - final DeBruijnGraph correctedGraph = new DeBruijnGraph(getKmerSize()); - - for( final BaseEdge e : edgeSet() ) { - final byte[] source = corrector.getErrorCorrectedKmer(getEdgeSource(e).getSequence()); - final byte[] target = corrector.getErrorCorrectedKmer(getEdgeTarget(e).getSequence()); - if ( source != null && target != null ) { - correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); - } - } - - return correctedGraph; - } else { - // the error correction wasn't possible, simply return this graph - return this; - } - } - /** * Add edge to assembly graph connecting the two kmers * @param kmer1 the source kmer for the edge @@ -168,7 +136,7 @@ public class DeBruijnGraph extends BaseGraph { * @return a newly allocated SequenceGraph */ @Ensures({"result != null"}) - protected SeqGraph convertToSequenceGraph() { + public SeqGraph convertToSequenceGraph() { final SeqGraph seqGraph = new SeqGraph(getKmerSize()); final Map vertexMap = new HashMap(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java index 0a2c26ca4..4d9441efe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java index 0724729a8..1dc712c67 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.common.collect.MinMaxPriorityQueue; import com.google.java.contract.Ensures; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index 269adcc22..50ca91d41 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. 
This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; @@ -67,7 +67,7 @@ import java.util.*; * Time: 2:34 PM * */ -class Path { +public class Path { private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20; // the last vertex seen in the path @@ -163,8 +163,9 @@ class Path { boolean first = true; for ( final T v : getVertices() ) { if ( first ) { - b.append(" -> "); first = false; + } else { + b.append(" -> "); } b.append(v.getSequenceString()); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java similarity index 86% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index da24a06a4..400b5c7ee 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -44,10 +44,12 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.apache.commons.lang.ArrayUtils; +import java.io.File; +import java.util.HashSet; import java.util.Set; /** @@ -57,6 +59,7 @@ import java.util.Set; * @since 03/2013 */ public class SeqGraph extends BaseGraph { + private final static boolean PRINT_SIMPLIFY_GRAPHS = false; private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5; /** @@ -97,9 +100,16 @@ public class SeqGraph extends BaseGraph { //logger.info("simplifyGraph iteration " + i); // iterate until we haven't don't anything useful didSomeWork = false; - //printGraph(new File("simplifyGraph." + i + ".dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".dot"), 0); didSomeWork |= new MergeDiamonds().transformUntilComplete(); didSomeWork |= new MergeTails().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".diamonds_and_tails.dot"), 0); + + didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".split_suffix.dot"), 0); + didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".merge_suffix.dot"), 0); + didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); didSomeWork |= zipLinearChains(); i++; @@ -109,7 +119,7 @@ public class SeqGraph extends BaseGraph { /** * Zip up all of the simple linear chains present in this graph. 
*/ - protected boolean zipLinearChains() { + public boolean zipLinearChains() { boolean foundOne = false; while( zipOneLinearChain() ) { // just keep going until zipOneLinearChain says its done @@ -137,13 +147,16 @@ public class SeqGraph extends BaseGraph { final Set outEdges = outgoingEdgesOf(outgoingVertex); final Set inEdges = incomingEdgesOf(incomingVertex); + final BaseEdge singleOutEdge = outEdges.isEmpty() ? null : outEdges.iterator().next(); + final BaseEdge singleInEdge = inEdges.isEmpty() ? null : inEdges.iterator().next(); + if( inEdges.size() == 1 && outEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + singleInEdge.setMultiplicity( singleInEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) ); + singleOutEdge.setMultiplicity( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) ); } else if( inEdges.size() == 1 ) { - inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + singleInEdge.setMultiplicity( Math.max(singleInEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) ); } else if( outEdges.size() == 1 ) { - outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) ); + singleOutEdge.setMultiplicity( Math.max( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) ); } final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); @@ -297,6 +310,57 @@ public class SeqGraph extends BaseGraph { } } + /** + * Merge headless configurations: + * + * Performs the transformation: + * + * { x + S_i + y -> Z } + * + * goes to: + * + * { x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. 
+ * + * Differs from the diamond transform in that no top node is required + */ + protected class MergeCommonSuffices extends VertexBasedTransformer { + @Override + boolean tryToTransform(final SeqVertex bottom) { + return new SharedSequenceMerger().merge(SeqGraph.this, bottom); + } + } + + /** + * Merge headless configurations: + * + * Performs the transformation: + * + * { x + S_i + y -> Z } + * + * goes to: + * + * { x -> S_i -> y -> Z } + * + * for all nodes that match this configuration. + * + * Differs from the diamond transform in that no top node is required + */ + protected class SplitCommonSuffices extends VertexBasedTransformer { + final Set alreadySplit = new HashSet(); + + @Override + boolean tryToTransform(final SeqVertex bottom) { + if ( alreadySplit.contains(bottom) ) + return false; + else { + alreadySplit.add(bottom); + return new CommonSuffixSplitter().split(SeqGraph.this, bottom); + } + } + } + /** * Merge headless configurations: * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java index 523312dcf..cfc2abfdc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java @@ -44,11 +44,10 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.Utils; - import java.util.Arrays; /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java new file mode 100644 index 000000000..1c53f2332 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java @@ -0,0 +1,138 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.apache.commons.lang.ArrayUtils; + +import java.util.*; + +/** + * Merges the incoming vertices of a vertex V of a graph + * + * Looks at the vertices that are incoming to V (i.e., have an outgoing edge connecting to V). 
If + * they all have the same sequence, merges them into the sequence of V, and updates the graph + * as appropriate + * + * User: depristo + * Date: 3/22/13 + * Time: 8:31 AM + */ +public class SharedSequenceMerger { + public SharedSequenceMerger() { } + + /** + * Attempt to merge the incoming vertices of v + * + * @param graph the graph containing the vertex v + * @param v the vertex whose incoming vertices we want to merge + * @return true if some useful merging was done, false otherwise + */ + public boolean merge(final SeqGraph graph, final SeqVertex v) { + if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex " + v); + + final Set prevs = graph.incomingVerticesOf(v); + if ( ! canMerge(graph, v, prevs) ) + return false; + else { +// graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0); + + final List edgesToRemove = new LinkedList(); + final byte[] prevSeq = prevs.iterator().next().getSequence(); + final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence())); + graph.addVertex(newV); + + for ( final SeqVertex prev : prevs ) { + for ( final BaseEdge prevIn : graph.incomingEdgesOf(prev) ) { + graph.addEdge(graph.getEdgeSource(prevIn), newV, new BaseEdge(prevIn)); + edgesToRemove.add(prevIn); + } + } + + for ( final BaseEdge e : graph.outgoingEdgesOf(v) ) { + graph.addEdge(newV, graph.getEdgeTarget(e), new BaseEdge(e)); + } + + graph.removeAllVertices(prevs); + graph.removeVertex(v); + graph.removeAllEdges(edgesToRemove); + +// graph.printGraph(new File("csm." + counter++ + "." 
+ v.getSequenceString() + "_post.dot"), 0); + + return true; + } + } + + //private static int counter = 0; + + /** + * Can we safely merge the incoming vertices of v + * + * @param graph the graph containing v and incomingVertices + * @param v the vertex we want to merge into + * @param incomingVertices the incoming vertices of v + * @return true if we can safely merge incomingVertices + */ + private boolean canMerge(final SeqGraph graph, final SeqVertex v, final Collection incomingVertices) { + if ( incomingVertices.isEmpty() ) + return false; + + final SeqVertex first = incomingVertices.iterator().next(); + for ( final SeqVertex prev : incomingVertices) { + if ( ! prev.seqEquals(first) ) + return false; + final Collection prevOuts = graph.outgoingVerticesOf(prev); + if ( prevOuts.size() != 1 ) + return false; + if ( prevOuts.iterator().next() != v ) + return false; + } + + return true; + } + +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java similarity index 91% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java index e0501da52..9834653a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; @@ -266,8 +266,8 @@ public class SharedVertexSequenceSplitter { min = Math.min(min, v.getSequence().length); } - final int prefixLen = compPrefixLen(kmers, min); - final int suffixLen = compSuffixLen(kmers, min - prefixLen); + final int prefixLen = Utils.compPrefixLen(kmers, min); + final int suffixLen = Utils.compSuffixLen(kmers, min - prefixLen); final byte[] kmer = kmers.get(0); final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen); @@ -275,47 +275,6 @@ public class SharedVertexSequenceSplitter { return new Pair(new SeqVertex(prefix), new SeqVertex(suffix)); } - /** - * Compute the maximum shared prefix length of list of bytes. - * - * @param listOfBytes a list of bytes with at least one element - * @param minLength the min. length among all byte[] in listOfBytes - * @return the number of shared bytes common at the start of all bytes - */ - @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) - @Ensures("result >= 0") - protected static int compPrefixLen(final List listOfBytes, final int minLength) { - for ( int i = 0; i < minLength; i++ ) { - final byte b = listOfBytes.get(0)[i]; - for ( int j = 1; j < listOfBytes.size(); j++ ) { - if ( b != listOfBytes.get(j)[i] ) - return i; - } - } - - return minLength; - } - - /** - * Compute the maximum shared suffix length of list of bytes. - * - * @param listOfBytes a list of bytes with at least one element - * @param minLength the min. 
length among all byte[] in listOfBytes - * @return the number of shared bytes common at the end of all bytes - */ - @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) - @Ensures("result >= 0") - protected static int compSuffixLen(final List listOfBytes, final int minLength) { - for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) { - final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1]; - for ( int j = 1; j < listOfBytes.size(); j++ ) { - if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] ) - return suffixLen; - } - } - return minLength; - } - /** * Helper function that returns an edge that we should use for splitting * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java new file mode 100644 index 000000000..8cb272925 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java @@ -0,0 +1,138 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Utility functions used in the graphs package + * + * User: depristo + * Date: 3/25/13 + * Time: 9:42 PM + */ +final class Utils { + private Utils() {} + + /** + * Compute the maximum shared prefix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. length among all byte[] in listOfBytes + * @return the number of shared bytes common at the start of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compPrefixLen(final List listOfBytes, final int minLength) { + for ( int i = 0; i < minLength; i++ ) { + final byte b = listOfBytes.get(0)[i]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[i] ) + return i; + } + } + + return minLength; + } + + /** + * Compute the maximum shared suffix length of list of bytes. + * + * @param listOfBytes a list of bytes with at least one element + * @param minLength the min. 
length among all byte[] in listOfBytes + * @return the number of shared bytes common at the end of all bytes + */ + @Requires({"listOfBytes.size() >= 1", "minLength >= 0"}) + @Ensures("result >= 0") + protected static int compSuffixLen(final List listOfBytes, final int minLength) { + for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) { + final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1]; + for ( int j = 1; j < listOfBytes.size(); j++ ) { + if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] ) + return suffixLen; + } + } + return minLength; + } + + /** + * Get the list of kmers as byte[] from the vertices in the graph + * + * @param vertices a collection of vertices + * @return a list of their kmers in order of the iterator on vertices + */ + protected static List getKmers(final Collection vertices) { + final List kmers = new ArrayList(vertices.size()); + for ( final SeqVertex v : vertices ) { + kmers.add(v.getSequence()); + } + return kmers; + } + + /** + * Get the minimum length of a collection of byte[] + * + * @param kmers a list of kmers whose .length min we want + * @return the min of the kmers, if kmers is empty the result is 0 + */ + protected static int minKmerLength(final Collection kmers) { + if ( kmers == null ) throw new IllegalArgumentException("kmers cannot be null"); + + if ( kmers.isEmpty() ) return 0; + int min = Integer.MAX_VALUE; + for ( final byte[] kmer : kmers ) { + min = Math.min(min, kmer.length); + } + return min; + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index cce623b76..86d331dae 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -56,6 +56,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java index 2b87cf61d..a13618dae 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java similarity index 92% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java index 3cc44c7de..7df6ee6c8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseEdgeUnitTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdgeUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -102,4 +102,20 @@ public class BaseEdgeUnitTest extends BaseTest { Assert.assertEquals(edges.get(2), e2); Assert.assertEquals(edges.get(3), e1); } + + @Test + public void testMax() { + for ( final boolean firstIsRef : Arrays.asList(true, false) ) { + for ( final boolean secondIsRef : Arrays.asList(true, false) ) { + for ( final int firstMulti : Arrays.asList(1, 4) ) { + for ( final int secondMulti : Arrays.asList(2, 3) ) { + final BaseEdge expected = new BaseEdge(firstIsRef || secondIsRef, Math.max(firstMulti, secondMulti)); + final BaseEdge actual = new BaseEdge(firstIsRef, firstMulti).max(new BaseEdge(secondIsRef, secondMulti)); + Assert.assertEquals(actual.getMultiplicity(), expected.getMultiplicity()); + Assert.assertEquals(actual.isRef(), expected.isRef()); + } + } + } + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java similarity index 86% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java index db4127ddb..9737f72f5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseGraphUnitTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -160,7 +160,7 @@ public class BaseGraphUnitTest extends BaseTest { graph.addEdges(g3, b7, b8, b7); // cycle is bad graph.addEdges(g3, b9, b9); // self-cycle is bad - final boolean debug = true; + final boolean debug = false; if ( debug ) good.printGraph(new File("expected.dot"), 0); if ( debug ) graph.printGraph(new File("bad.dot"), 0); graph.removePathsNotConnectedToRef(); @@ -169,6 +169,63 @@ public class BaseGraphUnitTest extends BaseTest { Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); } + @Test + public void testRemoveVerticesNotConnectedToRefRegardlessOfEdgeDirection() throws Exception { + final SeqGraph graph = new SeqGraph(); + + SeqVertex src = new SeqVertex("A"); + SeqVertex end = new SeqVertex("A"); + SeqVertex g1 = new SeqVertex("C"); + SeqVertex g2 = new SeqVertex("G"); + SeqVertex g3 = new SeqVertex("T"); + SeqVertex g4 = new SeqVertex("AA"); + SeqVertex g5 = new SeqVertex("AA"); + SeqVertex g6 = new SeqVertex("AA"); + SeqVertex g8 = new SeqVertex("AA"); + SeqVertex g7 = new SeqVertex("AA"); + SeqVertex gPrev = new SeqVertex("AA"); + SeqVertex gPrev1 = new SeqVertex("AA"); + SeqVertex gPrev2 = new SeqVertex("AA"); + SeqVertex gAfter = new SeqVertex("AA"); + SeqVertex gAfter1 = new SeqVertex("AA"); + SeqVertex gAfter2 = new SeqVertex("AA"); + SeqVertex b1 = new SeqVertex("CC"); + SeqVertex b2 = new SeqVertex("GG"); + SeqVertex b3 = new SeqVertex("TT"); + 
SeqVertex b4 = new SeqVertex("AAA"); + SeqVertex b5 = new SeqVertex("CCC"); + SeqVertex b6 = new SeqVertex("GGG"); + + graph.addVertices(src, end, g1, g2, g3, g4, g5, g6, g7, g8, gPrev, gPrev1, gPrev2, gAfter, gAfter1, gAfter2); + graph.addEdges(new BaseEdge(true, 1), src, g1, g2, g4, end); + graph.addEdges(src, g1, g5, g6, g7, end); + graph.addEdges(src, g1, g5, g8, g7, end); + graph.addEdges(src, g1, g3, end); + + // these should be kept, but are in the wrong direction + graph.addEdges(gPrev, src); + graph.addEdges(gPrev1, gPrev2, src); + graph.addEdges(end, gAfter); + graph.addEdges(end, gAfter1, gAfter2); + + // the current state of the graph is the good one + final SeqGraph good = (SeqGraph)graph.clone(); + + // now add the bads to the graph + graph.addVertices(b1, b2, b3, b4, b5, b6); + graph.addEdges(b2, b3); // b2 -> b3 + graph.addEdges(b4, b5, b4); // cycle + graph.addEdges(b6, b6); // isolated self cycle + + final boolean debug = false; + if ( debug ) good.printGraph(new File("expected.dot"), 0); + if ( debug ) graph.printGraph(new File("bad.dot"), 0); + graph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); + if ( debug ) graph.printGraph(new File("actual.dot"), 0); + + Assert.assertTrue(BaseGraph.graphEquals(graph, good), "Failed to remove exactly the bad nodes"); + } + @Test public void testPrintEmptyGraph() throws Exception { final File tmp = File.createTempFile("tmp", "dot"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java index 8f682d474..859892e33 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/BaseVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertexUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java new file mode 100644 index 000000000..012add769 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -0,0 +1,160 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class CommonSuffixMergerUnitTest extends BaseTest { + private final static boolean PRINT_GRAPHS = true; + + @DataProvider(name = "CompleteCycleData") + public Object[][] makeCompleteCycleData() { + return makeSplitMergeData(-1); + } + + public static class SplitMergeData { + final SeqGraph graph; + final SeqVertex v; + final String commonSuffix; + + public SplitMergeData(SeqGraph graph, SeqVertex v, String commonSuffix) { + this.graph = graph; + this.v = v; + this.commonSuffix = commonSuffix; + } + } + + public static Object[][] makeSplitMergeData(final int maxTests) { + List tests = new ArrayList(); + + final List bases = Arrays.asList("A", "C", "G", "T"); + for ( final String commonSuffix : Arrays.asList("", "A", "AT") ) { + for ( final int nBots : Arrays.asList(0, 1, 2) ) { + for ( final int nMids : Arrays.asList(1, 2, 3) ) { + for ( int nTops = 0; nTops < nMids; nTops++ ) { + for ( int nTopConnections = 1; nTopConnections <= nMids; nTopConnections++ ) { + int multi = 1; + final SeqGraph graph = new SeqGraph(); + final SeqVertex v = new SeqVertex("GGGG"); + graph.addVertex(v); + + final LinkedList tops = new LinkedList(); + final LinkedList mids = new LinkedList(); + + for ( int i = 0; i < nMids; i++) { + final SeqVertex mid = new SeqVertex(bases.get(i) + commonSuffix); + graph.addVertex(mid); + graph.addEdge(mid, v, new BaseEdge(i == 0, multi++)); + mids.add(mid); + + tops.add(new SeqVertex(bases.get(i))); + } + + graph.addVertices(tops); + for ( final SeqVertex t : tops ) { + for ( int i = 0; i < nTopConnections; i++ ) { + graph.addEdge(t, mids.get(i), new BaseEdge(i == 0, multi++)); + } + } + + for ( int i = 0; i < nBots; i++ ) { + final SeqVertex bot = new 
SeqVertex(bases.get(i)); + graph.addVertex(bot); + graph.addEdge(v, bot, new BaseEdge(i == 0, multi++)); + + } + + tests.add(new Object[]{new SplitMergeData(graph, v, commonSuffix)}); + } + } + } + } + } + + final List toUse = maxTests == -1 ? tests : tests.subList(0, Math.min(tests.size(), maxTests)); + return toUse.toArray(new Object[][]{}); + } + + public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) { + try { + final Set haplotypes = new HashSet(); + final List> originalPaths = new KBestPaths().getKBestPaths(original); + for ( final Path path : originalPaths ) + haplotypes.add(new String(path.getBases())); + + final List> splitPaths = new KBestPaths().getKBestPaths(actual); + for ( final Path path : splitPaths ) { + final String h = new String(path.getBases()); + Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); + } + + if ( splitPaths.size() == originalPaths.size() ) { + for ( int i = 0; i < originalPaths.size(); i++ ) { + Assert.assertTrue(splitPaths.get(i).equalSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. 
original " + originalPaths.get(i)); + } + } + } catch ( AssertionError e ) { + if ( PRINT_GRAPHS ) original.printGraph(new File(String.format("%s.original.dot", name, actual.vertexSet().size())), 0); + if ( PRINT_GRAPHS ) actual.printGraph(new File(String.format("%s.actual.dot", name, actual.vertexSet().size())), 0); + throw e; + } + } + + @Test(dataProvider = "CompleteCycleData") + public void testMerging(final SplitMergeData data) { + final SeqGraph original = (SeqGraph)data.graph.clone(); + final SharedSequenceMerger splitter = new SharedSequenceMerger(); + splitter.merge(data.graph, data.v); + assertSameHaplotypes(String.format("suffixMerge.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java new file mode 100644 index 000000000..f03dc8762 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java @@ -0,0 +1,113 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class CommonSuffixSplitterUnitTest extends BaseTest { + @DataProvider(name = "SplitData") + public Object[][] makeSplitData() { + return CommonSuffixMergerUnitTest.makeSplitMergeData(-1); + } + + @Test(dataProvider = "SplitData") + public void testSplit(final CommonSuffixMergerUnitTest.SplitMergeData data) { + final boolean expectedMerge = ! 
data.commonSuffix.isEmpty() && data.graph.inDegreeOf(data.v) > 1; + + final SeqGraph original = (SeqGraph)data.graph.clone(); +// original.printGraph(new File("original.dot"), 0); + final CommonSuffixSplitter splitter = new CommonSuffixSplitter(); + final boolean succeed = splitter.split(data.graph, data.v); +// data.graph.printGraph(new File("actual.dot"), 0); + Assert.assertEquals(succeed, expectedMerge, "Not excepted merge success/fail result"); + if ( succeed ) { + Assert.assertEquals(data.graph.incomingVerticesOf(data.v).iterator().next().getSequenceString(), data.commonSuffix, "Common suffix not computed correctly"); + } + + CommonSuffixMergerUnitTest.assertSameHaplotypes(String.format("suffixSplit.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); + } + + @Test + public void testSplitPrevHaveMultipleEdges() { + final SeqGraph original = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("A"); + final SeqVertex v3 = new SeqVertex("A"); + final SeqVertex v4 = new SeqVertex("A"); + + original.addVertices(v1, v2, v3, v4); + original.addEdges(v1, v3); + + Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with only one vertex"); + + original.addEdges(v2, v3); + original.addEdges(v2, v4); + + Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with multiple outgoing edges from middle nodes"); + } + + @Test + public void testSplitNoCycles() { + final SeqGraph original = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("C"); + final SeqVertex v3 = new SeqVertex("C"); + final SeqVertex v4 = new SeqVertex("G"); + + original.addVertices(v1, v2, v3, v4); + original.addEdges(v1, v3, v4); + original.addEdges(v1, v2, v4); + + Assert.assertTrue(new CommonSuffixSplitter().split((SeqGraph)original.clone(), v4), "Should be able to split pre-cycle graph"); + + original.addEdges(v4, v4); 
+ Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java index dfbe50668..bdc8ab36d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertexUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.annotations.Test; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index 34b4ba912..d20a0f778 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java similarity index 97% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index 6b6826e45..cbd7b1063 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -1,50 +1,50 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. 
DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. 
INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. * IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -315,12 +315,16 @@ public class SeqGraphUnitTest extends BaseTest { public void testMerging(final SeqGraph graph, final SeqGraph expected) { final SeqGraph merged = (SeqGraph)graph.clone(); merged.simplifyGraph(1); -// if ( ! SeqGraph.graphEquals(merged, expected) ) { -// graph.printGraph(new File("graph.dot"), 0); -// merged.printGraph(new File("merged.dot"), 0); -// expected.printGraph(new File("expected.dot"), 0); -// } - Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); + try { + Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); + } catch (AssertionError e) { +// if ( ! SeqGraph.graphEquals(merged, expected) ) { +// graph.printGraph(new File("graph.dot"), 0); +// merged.printGraph(new File("merged.dot"), 0); +// expected.printGraph(new File("expected.dot"), 0); +// } + throw e; + } } // A -> ACT -> C [non-ref] diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java index ca38351cc..eab9dfc27 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SeqVertexUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertexUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -52,7 +52,6 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; public class SeqVertexUnitTest extends BaseTest { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java similarity index 98% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index 52ab36064..77857c367 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; @@ -98,10 +98,10 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { min = Math.min(min, s.length()); } - final int actualPrefixLen = SharedVertexSequenceSplitter.compPrefixLen(bytes, min); + final int actualPrefixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compPrefixLen(bytes, min); Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test"); - final int actualSuffixLen = SharedVertexSequenceSplitter.compSuffixLen(bytes, min - actualPrefixLen); + final int actualSuffixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compSuffixLen(bytes, min - actualPrefixLen); Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test"); } From 197d149495901a08ba07e87d57ed1af656af8bce Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 26 Mar 2013 21:00:04 -0400 Subject: [PATCH 112/226] Increase the maxNumHaplotypesInPopulation to 25 -- A somewhat arbitrary increase, and will need some evaluation but necessary to get good results on the AFR integrationtest. 
--- .../haplotypecaller/HaplotypeCaller.java | 21 ++++++++++++++++--- .../LikelihoodCalculationEngine.java | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index ca105fe03..c379b34dc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -204,7 +204,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Advanced @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) - protected int maxNumHaplotypesInPopulation = 13; + protected int maxNumHaplotypesInPopulation = 25; @Advanced @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) @@ -557,8 +557,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final List bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
- likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation ) : haplotypes ); + final List bestHaplotypes = selectBestHaplotypesForGenotyping(haplotypes, stratifiedReadMap); final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, bestHaplotypes, @@ -586,6 +585,22 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return 1; // One active region was processed during this map call } + /** + * Select the best N haplotypes according to their likelihoods, if appropriate + * + * @param haplotypes a list of haplotypes to consider + * @param stratifiedReadMap a map from samples -> read likelihoods + * @return the list of haplotypes to genotype + */ + protected List selectBestHaplotypesForGenotyping(final List haplotypes, final Map stratifiedReadMap) { + // TODO -- skip this calculation if the list of haplotypes is of size 2 (as we'll always use 2 for genotyping) + if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + return haplotypes; + } else { + return likelihoodCalculationEngine.selectBestHaplotypesFromPooledLikelihoods(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); + } + } + //--------------------------------------------------------------------------------------------------------------- // // reduce diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 51483c53f..5eaaba0dd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -231,7 +231,7 @@ public class LikelihoodCalculationEngine { 
@Requires({"haplotypes.size() > 0"}) @Ensures({"result.size() <= haplotypes.size()"}) - public List selectBestHaplotypes( final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation ) { + public List selectBestHaplotypesFromPooledLikelihoods(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { final int numHaplotypes = haplotypes.size(); final Set sampleKeySet = stratifiedReadMap.keySet(); From fde7d36926b1ace076d54a01086714c25ab704ed Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 26 Mar 2013 22:07:25 -0400 Subject: [PATCH 113/226] Updating md5s due to changes in assembly graph creation algorithms and default parameter --- ...omplexAndSymbolicVariantsIntegrationTest.java | 8 ++++---- .../HaplotypeCallerIntegrationTest.java | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 830152903..3aaffdeaa 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6dd29d6fec056419ab0fa03a7d43d85e"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9fa4d3c88fd9c0f23c7a3ddd3d24a8c"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -75,7 +75,7 @@ public class 
HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa // TODO -- need a better symbolic allele test @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "8225fb59b9fcbe767a473c9eb8b21537"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "e746a38765298acd716194aee4d93554"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -87,12 +87,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "84616464aed68f4d9bc9e08472eff9c0"); + "e8ffbfae3c1af5be02631a31f386a431"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "e2d1023b846bfac31b4f7a3a4b90d931"); + "c3a98b19efa7cb36fe5f5f2ab893ef56"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 1b98b2239..c5614d405 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "9859b136d05085b5ec0833035289106a"); + HCTest(CEUTRIO_BAM, "", "45856ad67bfe8d8bea45808d8258bcf1"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "27f660bf1c9a6ed7167d77022d401b73"); + HCTest(NA12878_BAM, "", "b6c93325f851ac358ea49260fb11b75c"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -85,7 +85,7 @@ public 
class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "e25fc2196401a16347e0c730dbcbe828"); + "4ca6b560d0569cdca400d3e50915e211"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "325d7d73e0bd86b6cb146b249eda959a"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5d06ec5502d3f157964bd7b275d6a0cb"); } @Test @@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("53a50dae68f0175ca3088dea1d3bb881")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec97a0a65890169358842e765ff8dd15")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("d3bc6adde8cd9514ae5c49cd366d5de4")); executeTest("HCTestStructuralIndels: ", spec); } @@ -140,7 +140,7 @@ public class 
HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("5280f1a50ca27d8e435da0bd5b26ae93")); + Arrays.asList("4adb833ed8af20224b76bba61e2b0d93")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -148,7 +148,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("addceb63f5bfa9f11e15335d5bf641e9")); + Arrays.asList("1704b0901c86f8f597d931222d5c8dd8")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } From 73d1c319bf8f0fa31fd29db34ca7deadf0dec240 Mon Sep 17 00:00:00 2001 From: Chris Hartl Date: Thu, 28 Mar 2013 23:25:28 -0400 Subject: [PATCH 114/226] Rarely-occurring logic bugfix for GenotypeConcordance, streamlining and testing of MathUtils Currently, the multi-allelic test is covering the following case: Eval A T,C Comp A C reciprocate this so that the reverse can be covered. Eval A C Comp A T,C And furthermore, modify ConcordanceMetrics to more properly handle the situation where multiple alternate alleles are available in the comp. It was possible for an eval C/C sample to match a comp T/T sample, so long as the C allele were also present in at least one other comp sample. This comes from the fact that "truth" reference alleles can be paired with *any* allele also present in the truth VCF, while truth het/hom var sites are restricted to having to match only the alleles present in the genotype. 
The reason that truth ref alleles are special case is as follows, imagine: Eval: A G,T 0/0 2/0 2/2 1/1 Comp: A C,T 0/0 1/0 0/0 0/0 Even though the alt allele of the comp is a C, the assessment of genotypes should be as follows: Sample1: ref called ref Sample2: alleles don't match (the alt allele of the comp was not assessed in eval) Sample3: ref called hom-var Sample4: alleles don't match (the alt allele of the eval was not assessed in comp) Before this change, Sample2 was evaluated as "het called het" (as the T allele in eval happens to also be in the comp record, just not in the comp sample). Thus: apply current logic to comp hom-refs, and the more restrictive logic ("you have to match an allele in the comp genotype") when the comp is not reference. Also in this commit,major refactoring and testing for MathUtils. A large number of methods were not used at all in the codebase, these methods were removed: - dotProduct(several types). logDotProduct is used extensively, but not the real-space version. - vectorSum - array shuffle, random subset - countOccurances (general forms, the char form is used in the codebase) - getNMaxElements - array permutation - sorted array permutation - compare floats - sum() (for integer arrays and lists). Final keyword was extensively added to MathUtils. The ratio() and percentage() methods were revised to error out with non-positive denominators, except in the case of 0/0 (which returns 0.0 (ratio), or 0.0% (percentage)). Random sampling code was updated to make use of the cleaner implementations of generating permutations in MathUtils (allowing the array permutation code to be retired). The PaperGenotyper still made use of one of these array methods, since it was the only walker it was migrated into the genotyper itself. 
In addition, more extensive tests were added for - logBinomialCoefficient (Newton's identity should always hold) - logFactorial - log10sumlog10 and its approximation All unit tests pass --- .../ConcordanceMetricsUnitTest.java | 21 + .../gatk/examples/GATKPaperGenotyper.java | 28 +- .../variantutils/ConcordanceMetrics.java | 28 +- .../broadinstitute/sting/utils/MathUtils.java | 719 +++--------------- .../sting/utils/MathUtilsUnitTest.java | 132 ++-- .../BandPassActivityProfileUnitTest.java | 11 +- 6 files changed, 263 insertions(+), 676 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java index 2e31f6725..bca912d63 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -203,6 +203,27 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],1); Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],0); Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0); + + // now flip them around + + eval = data.getSecond(); + truth = data.getFirst(); + codec = new VCFCodec(); + evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + metrics = new 
ConcordanceMetrics(evalHeader,compHeader); + metrics.update(eval,truth); + Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); + Assert.assertEquals(truth.getGenotype("test1_sample2").getType().ordinal(),2); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),1); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getTable()[1][2],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[1][2],0); + Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample3").getTable()[3][2],1); + Assert.assertEquals(metrics.getOverallGenotypeConcordance().getTable()[1][1],1); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH.ordinal()],0); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.EVAL_SUBSET_TRUTH.ordinal()],1); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_DO_NOT_MATCH.ordinal()],0); + Assert.assertEquals(metrics.getOverallSiteConcordance().getSiteConcordance()[ConcordanceMetrics.SiteConcordanceType.ALLELES_MATCH.ordinal()],0); } private Pair getData3() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java index 7b56852d3..07ec088cf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java @@ -40,7 +40,8 @@ import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.io.PrintStream; - +import java.util.Arrays; +import java.util.Comparator; /** * A simple Bayesian genotyper, that outputs a text based call format. 
Intended to be used only as an @@ -95,7 +96,7 @@ public class GATKPaperGenotyper extends LocusWalker implements Tre likelihoods[genotype.ordinal()] += Math.log10(p / genotype.toString().length()); } - Integer sortedList[] = MathUtils.sortPermutation(likelihoods); + Integer sortedList[] = sortPermutation(likelihoods); // create call using the best genotype (GENOTYPE.values()[sortedList[9]].toString()) // and calculate the LOD score from best - next best (9 and 8 in the sorted list, since the best likelihoods are closest to zero) @@ -110,6 +111,29 @@ public class GATKPaperGenotyper extends LocusWalker implements Tre return 0; } + private static Integer[] sortPermutation(final double[] A) { + class comparator implements Comparator { + public int compare(Integer a, Integer b) { + if (A[a.intValue()] < A[b.intValue()]) { + return -1; + } + if (A[a.intValue()] == A[b.intValue()]) { + return 0; + } + if (A[a.intValue()] > A[b.intValue()]) { + return 1; + } + return 0; + } + } + Integer[] permutation = new Integer[A.length]; + for (int i = 0; i < A.length; i++) { + permutation[i] = i; + } + Arrays.sort(permutation, new comparator()); + return permutation; + } + /** * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector * appropriately. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java index efb84edef..b3b4857b6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java @@ -102,15 +102,16 @@ public class ConcordanceMetrics { public void update(VariantContext eval, VariantContext truth) { overallSiteConcordance.update(eval,truth); Set alleleTruth = new HashSet(8); - alleleTruth.add(truth.getReference().getBaseString()); + String truthRef = truth.getReference().getBaseString(); + alleleTruth.add(truthRef); for ( Allele a : truth.getAlternateAlleles() ) { alleleTruth.add(a.getBaseString()); } for ( String sample : perSampleGenotypeConcordance.keySet() ) { Genotype evalGenotype = eval.getGenotype(sample); Genotype truthGenotype = truth.getGenotype(sample); - perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth); - overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth); + perSampleGenotypeConcordance.get(sample).update(evalGenotype,truthGenotype,alleleTruth,truthRef); + overallGenotypeConcordance.update(evalGenotype,truthGenotype,alleleTruth,truthRef); } } @@ -170,10 +171,14 @@ public class ConcordanceMetrics { } @Requires({"eval!=null","truth != null","truthAlleles != null"}) - public void update(Genotype eval, Genotype truth, Set truthAlleles) { - // this is slow but correct + public void update(Genotype eval, Genotype truth, Set truthAlleles, String truthRef) { + // this is slow but correct. + + // NOTE: a reference call in "truth" is a special case, the eval can match *any* of the truth alleles + // that is, if the reference base is C, and a sample is C/C in truth, A/C, A/A, T/C, T/T will + // all match, so long as A and T are alleles in the truth callset. 
boolean matchingAlt = true; - if ( eval.isCalled() && truth.isCalled() ) { + if ( eval.isCalled() && truth.isCalled() && truth.isHomRef() ) { // by default, no-calls "match" between alleles, so if // one or both sites are no-call or unavailable, the alt alleles match // otherwise, check explicitly: if the eval has an allele that's not ref, no-call, or present in truth @@ -181,6 +186,17 @@ public class ConcordanceMetrics { for ( Allele evalAllele : eval.getAlleles() ) { matchingAlt &= truthAlleles.contains(evalAllele.getBaseString()); } + } else if ( eval.isCalled() && truth.isCalled() ) { + // otherwise, the eval genotype has to match either the alleles in the truth genotype, or the truth reference allele + // todo -- this can be sped up by caching the truth allele sets + Set genoAlleles = new HashSet(3); + genoAlleles.add(truthRef); + for ( Allele truthGenoAl : truth.getAlleles() ) { + genoAlleles.add(truthGenoAl.getBaseString()); + } + for ( Allele evalAllele : eval.getAlleles() ) { + matchingAlt &= genoAlleles.contains(evalAllele.getBaseString()); + } } if ( matchingAlt ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 2459c1d36..ebbc3945f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -88,14 +88,14 @@ public class MathUtils { * @param max upper bound of the range * @return a random int >= min and <= max */ - public static int randomIntegerInRange( int min, int max ) { + public static int randomIntegerInRange( final int min, final int max ) { return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; } // A fast implementation of the Math.round() method. This method does not perform // under/overflow checking, so this shouldn't be used in the general case (but is fine // if one is already make those checks before calling in to the rounding). 
- public static int fastRound(double d) { + public static int fastRound(final double d) { return (d > 0.0) ? (int) (d + 0.5d) : (int) (d - 0.5d); } @@ -123,7 +123,7 @@ public class MathUtils { return approxSum; } - public static double approximateLog10SumLog10(double a, double b, double c) { + public static double approximateLog10SumLog10(final double a, final double b, final double c) { return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); } @@ -152,97 +152,53 @@ public class MathUtils { return big + MathUtils.jacobianLogTable[ind]; } - public static double sum(Collection numbers) { - return sum(numbers, false); - } - - public static double sum(Collection numbers, boolean ignoreNan) { - double sum = 0; - for (Number n : numbers) { - if (!ignoreNan || !Double.isNaN(n.doubleValue())) { - sum += n.doubleValue(); - } - } - - return sum; - } - - public static int nonNanSize(Collection numbers) { - int size = 0; - for (Number n : numbers) { - size += Double.isNaN(n.doubleValue()) ? 0 : 1; - } - - return size; - } - - public static double average(Collection x) { - return sum(x) / x.size(); - } - - public static double average(Collection numbers, boolean ignoreNan) { - if (ignoreNan) { - return sum(numbers, true) / nonNanSize(numbers); - } - else { - return sum(numbers, false) / nonNanSize(numbers); - } - } - - public static double variance(Collection numbers, Number mean, boolean ignoreNan) { - double mn = mean.doubleValue(); - double var = 0; - for (Number n : numbers) { - var += (!ignoreNan || !Double.isNaN(n.doubleValue())) ? 
(n.doubleValue() - mn) * (n.doubleValue() - mn) : 0; - } - if (ignoreNan) { - return var / (nonNanSize(numbers) - 1); - } - return var / (numbers.size() - 1); - } - - public static double variance(Collection numbers, Number mean) { - return variance(numbers, mean, false); - } - - public static double variance(Collection numbers, boolean ignoreNan) { - return variance(numbers, average(numbers, ignoreNan), ignoreNan); - } - - public static double variance(Collection numbers) { - return variance(numbers, average(numbers, false), false); - } - - public static double sum(double[] values) { + public static double sum(final double[] values) { double s = 0.0; for (double v : values) s += v; return s; } - public static long sum(int[] x) { + public static long sum(final int[] x) { long total = 0; for (int v : x) total += v; return total; } - public static int sum(byte[] x) { + public static int sum(final byte[] x) { int total = 0; for (byte v : x) total += (int)v; return total; } - /** - * Calculates the log10 cumulative sum of an array with log10 probabilities - * - * @param log10p the array with log10 probabilities - * @param upTo index in the array to calculate the cumsum up to - * @return the log10 of the cumulative sum - */ - public static double log10CumulativeSumLog10(double[] log10p, int upTo) { - return log10sumLog10(log10p, 0, upTo); + public static double percentage(int x, int base) { + return (base > 0 ? 
((double) x / (double) base) * 100.0 : 0); + } + + public static double ratio(final int num, final int denom) { + if ( denom > 0 ) { + return ((double) num)/denom; + } else { + if ( num == 0 && denom == 0) { + return 0.0; + } else { + throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } + } + + public static double ratio(final long num, final long denom) { + if ( denom > 0L ) { + return ((double) num)/denom; + } else { + if ( num == 0L && denom == 0L ) { + return 0.0; + } else { + throw new ReviewedStingException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } } /** @@ -251,18 +207,18 @@ public class MathUtils { * @param prRealSpace * @return */ - public static double[] toLog10(double[] prRealSpace) { + public static double[] toLog10(final double[] prRealSpace) { double[] log10s = new double[prRealSpace.length]; for (int i = 0; i < prRealSpace.length; i++) log10s[i] = Math.log10(prRealSpace[i]); return log10s; } - public static double log10sumLog10(double[] log10p, int start) { + public static double log10sumLog10(final double[] log10p, final int start) { return log10sumLog10(log10p, start, log10p.length); } - public static double log10sumLog10(double[] log10p, int start, int finish) { + public static double log10sumLog10(final double[] log10p,final int start,final int finish) { double sum = 0.0; double maxValue = arrayMax(log10p, finish); @@ -276,56 +232,42 @@ public class MathUtils { return Math.log10(sum) + maxValue; } - public static double sumDoubles(List values) { - double s = 0.0; - for (double v : values) - s += v; - return s; - } - - public static int sumIntegers(List values) { - int s = 0; - for (int v : values) - s += v; - return s; - } - - public static double sumLog10(double[] log10values) { + public static double sumLog10(final double[] log10values) { return Math.pow(10.0, log10sumLog10(log10values)); // double s 
= 0.0; // for ( double v : log10values) s += Math.pow(10.0, v); // return s; } - public static double log10sumLog10(double[] log10values) { + public static double log10sumLog10(final double[] log10values) { return log10sumLog10(log10values, 0); } - public static boolean wellFormedDouble(double val) { + public static boolean wellFormedDouble(final double val) { return !Double.isInfinite(val) && !Double.isNaN(val); } - public static double bound(double value, double minBoundary, double maxBoundary) { + public static double bound(final double value, final double minBoundary, final double maxBoundary) { return Math.max(Math.min(value, maxBoundary), minBoundary); } - public static boolean isBounded(double val, double lower, double upper) { + public static boolean isBounded(final double val, final double lower, final double upper) { return val >= lower && val <= upper; } - public static boolean isPositive(double val) { + public static boolean isPositive(final double val) { return !isNegativeOrZero(val); } - public static boolean isPositiveOrZero(double val) { + public static boolean isPositiveOrZero(final double val) { return isBounded(val, 0.0, Double.POSITIVE_INFINITY); } - public static boolean isNegativeOrZero(double val) { + public static boolean isNegativeOrZero(final double val) { return isBounded(val, Double.NEGATIVE_INFINITY, 0.0); } - public static boolean isNegative(double val) { + public static boolean isNegative(final double val) { return !isPositiveOrZero(val); } @@ -336,7 +278,7 @@ public class MathUtils { * @param b the second double value * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. 
*/ - public static byte compareDoubles(double a, double b) { + public static byte compareDoubles(final double a, final double b) { return compareDoubles(a, b, 1e-6); } @@ -348,7 +290,7 @@ public class MathUtils { * @param epsilon the precision within which two double values will be considered equal * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b, double epsilon) { + public static byte compareDoubles(final double a, final double b, final double epsilon) { if (Math.abs(a - b) < epsilon) { return 0; } @@ -358,42 +300,13 @@ public class MathUtils { return 1; } - /** - * Compares float values for equality (within 1e-6), or inequality. - * - * @param a the first float value - * @param b the second float value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. - */ - public static byte compareFloats(float a, float b) { - return compareFloats(a, b, 1e-6f); - } - - /** - * Compares float values for equality (within epsilon), or inequality. - * - * @param a the first float value - * @param b the second float value - * @param epsilon the precision within which two float values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. 
- */ - public static byte compareFloats(float a, float b, float epsilon) { - if (Math.abs(a - b) < epsilon) { - return 0; - } - if (a > b) { - return -1; - } - return 1; - } - - public static double NormalDistribution(double mean, double sd, double x) { + public static double NormalDistribution(final double mean, final double sd, final double x) { double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); return a * b; } - public static double binomialCoefficient(int n, int k) { + public static double binomialCoefficient(final int n, final int k) { return Math.pow(10, log10BinomialCoefficient(n, k)); } @@ -409,7 +322,7 @@ public class MathUtils { * @param p probability of success * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. */ - public static double binomialProbability(int n, int k, double p) { + public static double binomialProbability(final int n, final int k, final double p) { return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); } @@ -422,7 +335,7 @@ public class MathUtils { * @param probHit - probability of a successful hit * @return - returns the cumulative probability */ - public static double binomialCumulativeProbability(int start, int end, int total, double probHit) { + public static double binomialCumulativeProbability(final int start, final int end, final int total, final double probHit) { double cumProb = 0.0; double prevProb; BigDecimal probCache = BigDecimal.ZERO; @@ -454,7 +367,7 @@ public class MathUtils { * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed * @return the multinomial of the specified configuration. 
*/ - public static double multinomialCoefficient(int[] k) { + public static double multinomialCoefficient(final int[] k) { int n = 0; for (int xi : k) { n += xi; @@ -477,7 +390,7 @@ public class MathUtils { * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur * @return the multinomial probability of the specified configuration. */ - public static double multinomialProbability(int[] k, double[] p) { + public static double multinomialProbability(final int[] k, final double[] p) { if (p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); @@ -496,7 +409,7 @@ public class MathUtils { * @param x an byte[] of numbers * @return the RMS of the specified numbers. */ - public static double rms(byte[] x) { + public static double rms(final byte[] x) { if (x.length == 0) return 0.0; @@ -513,7 +426,7 @@ public class MathUtils { * @param x an int[] of numbers * @return the RMS of the specified numbers. */ - public static double rms(int[] x) { + public static double rms(final int[] x) { if (x.length == 0) return 0.0; @@ -530,7 +443,7 @@ public class MathUtils { * @param x a double[] of numbers * @return the RMS of the specified numbers. 
*/ - public static double rms(Double[] x) { + public static double rms(final Double[] x) { if (x.length == 0) return 0.0; @@ -541,7 +454,7 @@ public class MathUtils { return Math.sqrt(rms); } - public static double rms(Collection l) { + public static double rms(final Collection l) { if (l.size() == 0) return 0.0; @@ -560,7 +473,7 @@ public class MathUtils { return dist; } - public static double round(double num, int digits) { + public static double round(final double num, final int digits) { double result = num * Math.pow(10.0, (double) digits); result = Math.round(result); result = result / Math.pow(10.0, (double) digits); @@ -574,7 +487,7 @@ public class MathUtils { * @param takeLog10OfOutput if true, the output will be transformed back into log10 units * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed */ - public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput) { + public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput) { return normalizeFromLog10(array, takeLog10OfOutput, false); } @@ -587,7 +500,7 @@ public class MathUtils { * * @return */ - public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput, boolean keepInLogSpace) { + public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput, final boolean keepInLogSpace) { // for precision purposes, we need to add (or really subtract, since they're // all negative) the largest value; also, we need to convert to normal-space. 
double maxValue = arrayMax(array); @@ -630,7 +543,7 @@ public class MathUtils { * @param array the array to be normalized * @return a newly allocated array corresponding the normalized values in array */ - public static double[] normalizeFromLog10(double[] array) { + public static double[] normalizeFromLog10(final double[] array) { return normalizeFromLog10(array, false); } @@ -683,7 +596,7 @@ public class MathUtils { return maxElementIndex(array, array.length); } - public static int maxElementIndex(final int[] array, int endIndex) { + public static int maxElementIndex(final int[] array, final int endIndex) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -696,7 +609,7 @@ public class MathUtils { return maxI; } - public static int maxElementIndex(final byte[] array, int endIndex) { + public static int maxElementIndex(final byte[] array, final int endIndex) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -722,19 +635,19 @@ public class MathUtils { return array[maxElementIndex(array, endIndex)]; } - public static double arrayMin(double[] array) { + public static double arrayMin(final double[] array) { return array[minElementIndex(array)]; } - public static int arrayMin(int[] array) { + public static int arrayMin(final int[] array) { return array[minElementIndex(array)]; } - public static byte arrayMin(byte[] array) { + public static byte arrayMin(final byte[] array) { return array[minElementIndex(array)]; } - public static int minElementIndex(double[] array) { + public static int minElementIndex(final double[] array) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -747,7 +660,7 @@ public class MathUtils { return minI; } - public static int minElementIndex(byte[] array) { + public static int minElementIndex(final byte[] array) { if (array == null || array.length == 0) throw new 
IllegalArgumentException("Array cannot be null!"); @@ -760,7 +673,7 @@ public class MathUtils { return minI; } - public static int minElementIndex(int[] array) { + public static int minElementIndex(final int[] array) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -773,7 +686,7 @@ public class MathUtils { return minI; } - public static int arrayMaxInt(List array) { + public static int arrayMaxInt(final List array) { if (array == null) throw new IllegalArgumentException("Array cannot be null!"); if (array.size() == 0) @@ -785,19 +698,15 @@ public class MathUtils { return m; } - public static double arrayMaxDouble(List array) { - if (array == null) - throw new IllegalArgumentException("Array cannot be null!"); - if (array.size() == 0) - throw new IllegalArgumentException("Array size cannot be 0!"); - - double m = array.get(0); - for (double e : array) - m = Math.max(m, e); - return m; + public static int sum(final List list ) { + int sum = 0; + for ( Integer i : list ) { + sum += i; + } + return sum; } - public static double average(List vals, int maxI) { + public static double average(final List vals, final int maxI) { long sum = 0L; int i = 0; @@ -814,201 +723,11 @@ public class MathUtils { return (1.0 * sum) / i; } - public static double averageDouble(List vals, int maxI) { - double sum = 0.0; - - int i = 0; - for (double x : vals) { - if (i > maxI) - break; - sum += x; - i++; - } - return (1.0 * sum) / i; - } - - public static double average(List vals) { + public static double average(final List vals) { return average(vals, vals.size()); } - public static double average(int[] x) { - int sum = 0; - for (int v : x) - sum += v; - return (double) sum / x.length; - } - - public static byte average(byte[] vals) { - int sum = 0; - for (byte v : vals) { - sum += v; - } - return (byte) (sum / vals.length); - } - - public static double averageDouble(List vals) { - return averageDouble(vals, vals.size()); - } - - // 
Java Generics can't do primitive types, so I had to do this the simplistic way - - public static Integer[] sortPermutation(final int[] A) { - class comparator implements Comparator { - public int compare(Integer a, Integer b) { - if (A[a.intValue()] < A[b.intValue()]) { - return -1; - } - if (A[a.intValue()] == A[b.intValue()]) { - return 0; - } - if (A[a.intValue()] > A[b.intValue()]) { - return 1; - } - return 0; - } - } - Integer[] permutation = new Integer[A.length]; - for (int i = 0; i < A.length; i++) { - permutation[i] = i; - } - Arrays.sort(permutation, new comparator()); - return permutation; - } - - public static Integer[] sortPermutation(final double[] A) { - class comparator implements Comparator { - public int compare(Integer a, Integer b) { - if (A[a.intValue()] < A[b.intValue()]) { - return -1; - } - if (A[a.intValue()] == A[b.intValue()]) { - return 0; - } - if (A[a.intValue()] > A[b.intValue()]) { - return 1; - } - return 0; - } - } - Integer[] permutation = new Integer[A.length]; - for (int i = 0; i < A.length; i++) { - permutation[i] = i; - } - Arrays.sort(permutation, new comparator()); - return permutation; - } - - public static Integer[] sortPermutation(List A) { - final Object[] data = A.toArray(); - - class comparator implements Comparator { - public int compare(Integer a, Integer b) { - return ((T) data[a]).compareTo(data[b]); - } - } - Integer[] permutation = new Integer[A.size()]; - for (int i = 0; i < A.size(); i++) { - permutation[i] = i; - } - Arrays.sort(permutation, new comparator()); - return permutation; - } - - public static int[] permuteArray(int[] array, Integer[] permutation) { - int[] output = new int[array.length]; - for (int i = 0; i < output.length; i++) { - output[i] = array[permutation[i]]; - } - return output; - } - - public static double[] permuteArray(double[] array, Integer[] permutation) { - double[] output = new double[array.length]; - for (int i = 0; i < output.length; i++) { - output[i] = array[permutation[i]]; - 
} - return output; - } - - public static Object[] permuteArray(Object[] array, Integer[] permutation) { - Object[] output = new Object[array.length]; - for (int i = 0; i < output.length; i++) { - output[i] = array[permutation[i]]; - } - return output; - } - - public static String[] permuteArray(String[] array, Integer[] permutation) { - String[] output = new String[array.length]; - for (int i = 0; i < output.length; i++) { - output[i] = array[permutation[i]]; - } - return output; - } - - public static List permuteList(List list, Integer[] permutation) { - List output = new ArrayList(); - for (int i = 0; i < permutation.length; i++) { - output.add(list.get(permutation[i])); - } - return output; - } - - /** - * Draw N random elements from list. - */ - public static List randomSubset(List list, int N) { - if (list.size() <= N) { - return list; - } - - int idx[] = new int[list.size()]; - for (int i = 0; i < list.size(); i++) { - idx[i] = GenomeAnalysisEngine.getRandomGenerator().nextInt(); - } - - Integer[] perm = sortPermutation(idx); - - List ans = new ArrayList(); - for (int i = 0; i < N; i++) { - ans.add(list.get(perm[i])); - } - - return ans; - } - - /** - * Draw N random elements from an array. - * - * @param array your objects - * @param n number of elements to select at random from the list - * @return a new list with the N randomly chosen elements from list - */ - @Requires({"array != null", "n>=0"}) - @Ensures({"result != null", "result.length == Math.min(n, array.length)"}) - public static Object[] randomSubset(final Object[] array, final int n) { - if (array.length <= n) - return array.clone(); - - Object[] shuffledArray = arrayShuffle(array); - Object[] result = new Object[n]; - System.arraycopy(shuffledArray, 0, result, 0, n); - return result; - } - - public static double percentage(double x, double base) { - return (base > 0 ? (x / base) * 100.0 : 0); - } - - public static double percentage(int x, int base) { - return (base > 0 ? 
((double) x / (double) base) * 100.0 : 0); - } - - public static double percentage(long x, long base) { - return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); - } - - public static int countOccurrences(char c, String s) { + public static int countOccurrences(final char c, final String s) { int count = 0; for (int i = 0; i < s.length(); i++) { count += s.charAt(i) == c ? 1 : 0; @@ -1036,27 +755,6 @@ public class MathUtils { return count; } - /** - * Returns the top (larger) N elements of the array. Naive n^2 implementation (Selection Sort). - * Better than sorting if N (number of elements to return) is small - * - * @param array the array - * @param n number of top elements to return - * @return the n larger elements of the array - */ - public static Collection getNMaxElements(double[] array, int n) { - ArrayList maxN = new ArrayList(n); - double lastMax = Double.MAX_VALUE; - for (int i = 0; i < n; i++) { - double max = Double.MIN_VALUE; - for (double x : array) { - max = Math.min(lastMax, Math.max(x, max)); - } - maxN.add(max); - lastMax = max; - } - return maxN; - } /** * Returns n random indices drawn with replacement from the range 0..(k-1) @@ -1065,7 +763,7 @@ public class MathUtils { * @param k the number of random indices to draw (with replacement) * @return a list of k random indices ranging from 0 to (n-1) with possible duplicates */ - static public ArrayList sampleIndicesWithReplacement(int n, int k) { + static public ArrayList sampleIndicesWithReplacement(final int n, final int k) { ArrayList chosen_balls = new ArrayList(k); for (int i = 0; i < k; i++) { @@ -1084,7 +782,7 @@ public class MathUtils { * @param k the number of random indices to draw (without replacement) * @return a list of k random indices ranging from 0 to (n-1) without duplicates */ - static public ArrayList sampleIndicesWithoutReplacement(int n, int k) { + static public ArrayList sampleIndicesWithoutReplacement(final int n, final int k) { ArrayList chosen_balls = new 
ArrayList(k); for (int i = 0; i < n; i++) { @@ -1105,7 +803,7 @@ public class MathUtils { * @param the template type of the ArrayList * @return a new ArrayList consisting of the elements at the specified indices */ - static public ArrayList sliceListByIndices(List indices, List list) { + static public ArrayList sliceListByIndices(final List indices, final List list) { ArrayList subset = new ArrayList(); for (int i : indices) { @@ -1115,35 +813,6 @@ public class MathUtils { return subset; } - public static Comparable orderStatisticSearch(int orderStat, List list) { - // this finds the order statistic of the list (kth largest element) - // the list is assumed *not* to be sorted - - final Comparable x = list.get(orderStat); - ArrayList lessThanX = new ArrayList(); - ArrayList equalToX = new ArrayList(); - ArrayList greaterThanX = new ArrayList(); - - for (Comparable y : list) { - if (x.compareTo(y) > 0) { - lessThanX.add(y); - } - else if (x.compareTo(y) < 0) { - greaterThanX.add(y); - } - else - equalToX.add(y); - } - - if (lessThanX.size() > orderStat) - return orderStatisticSearch(orderStat, lessThanX); - else if (lessThanX.size() + equalToX.size() >= orderStat) - return orderStat; - else - return orderStatisticSearch(orderStat - lessThanX.size() - equalToX.size(), greaterThanX); - - } - /** * Given two log-probability vectors, compute log of vector product of them: * in Matlab notation, return log10(10.*x'*10.^y) @@ -1151,7 +820,7 @@ public class MathUtils { * @param y vector 2 * @return a double representing log (dotProd(10.^x,10.^y) */ - public static double logDotProduct(double [] x, double[] y) { + public static double logDotProduct(final double [] x, final double[] y) { if (x.length != y.length) throw new ReviewedStingException("BUG: Vectors of different lengths"); @@ -1165,57 +834,6 @@ public class MathUtils { - } - public static Object getMedian(List list) { - return orderStatisticSearch((int) Math.ceil(list.size() / 2), list); - } - - public static byte 
getQScoreOrderStatistic(List reads, List offsets, int k) { - // version of the order statistic calculator for SAMRecord/Integer lists, where the - // list index maps to a q-score only through the offset index - // returns the kth-largest q-score. - - if (reads.size() == 0) { - return 0; - } - - ArrayList lessThanQReads = new ArrayList(); - ArrayList equalToQReads = new ArrayList(); - ArrayList greaterThanQReads = new ArrayList(); - ArrayList lessThanQOffsets = new ArrayList(); - ArrayList greaterThanQOffsets = new ArrayList(); - - final byte qk = reads.get(k).getBaseQualities()[offsets.get(k)]; - - for (int iter = 0; iter < reads.size(); iter++) { - SAMRecord read = reads.get(iter); - int offset = offsets.get(iter); - byte quality = read.getBaseQualities()[offset]; - - if (quality < qk) { - lessThanQReads.add(read); - lessThanQOffsets.add(offset); - } - else if (quality > qk) { - greaterThanQReads.add(read); - greaterThanQOffsets.add(offset); - } - else { - equalToQReads.add(reads.get(iter)); - } - } - - if (lessThanQReads.size() > k) - return getQScoreOrderStatistic(lessThanQReads, lessThanQOffsets, k); - else if (equalToQReads.size() + lessThanQReads.size() >= k) - return qk; - else - return getQScoreOrderStatistic(greaterThanQReads, greaterThanQOffsets, k - lessThanQReads.size() - equalToQReads.size()); - - } - - public static byte getQScoreMedian(List reads, List offsets) { - return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } /** @@ -1336,29 +954,6 @@ public class MathUtils { // // useful common utility routines // - public static double rate(long n, long d) { - return n / (1.0 * Math.max(d, 1)); - } - - public static double rate(int n, int d) { - return n / (1.0 * Math.max(d, 1)); - } - - public static long inverseRate(long n, long d) { - return n == 0 ? 0 : d / Math.max(n, 1); - } - - public static long inverseRate(int n, int d) { - return n == 0 ? 
0 : d / Math.max(n, 1); - } - - public static double ratio(int num, int denom) { - return ((double) num) / (Math.max(denom, 1)); - } - - public static double ratio(long num, long denom) { - return ((double) num) / (Math.max(denom, 1)); - } static public double max(double x0, double x1, double x2) { double a = Math.max(x0, x1); @@ -1371,8 +966,8 @@ public class MathUtils { * @param ln log(x) * @return log10(x) */ - public static double lnToLog10(double ln) { - return ln * Math.log10(Math.exp(1)); + public static double lnToLog10(final double ln) { + return ln * Math.log10(Math.E); } /** @@ -1384,7 +979,7 @@ public class MathUtils { * Efficient rounding functions to simplify the log gamma function calculation * double to long with 32 bit shift */ - private static final int HI(double x) { + private static final int HI(final double x) { return (int) (Double.doubleToLongBits(x) >> 32); } @@ -1392,7 +987,7 @@ public class MathUtils { * Efficient rounding functions to simplify the log gamma function calculation * double to long without shift */ - private static final int LO(double x) { + private static final int LO(final double x) { return (int) Double.doubleToLongBits(x); } @@ -1400,7 +995,7 @@ public class MathUtils { * Most efficent implementation of the lnGamma (FDLIBM) * Use via the log10Gamma wrapper method. */ - private static double lnGamma(double x) { + private static double lnGamma(final double x) { double t, y, z, p, p1, p2, p3, q, r, w; int i; @@ -1521,7 +1116,7 @@ public class MathUtils { * @param x the x parameter * @return the log10 of the gamma function at x. 
*/ - public static double log10Gamma(double x) { + public static double log10Gamma(final double x) { return lnToLog10(lnGamma(x)); } @@ -1533,11 +1128,11 @@ public class MathUtils { * @param k number of successes * @return the log10 of the binomial coefficient */ - public static double log10BinomialCoefficient(int n, int k) { + public static double log10BinomialCoefficient(final int n, final int k) { return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); } - public static double log10BinomialProbability(int n, int k, double log10p) { + public static double log10BinomialProbability(final int n, final int k, final double log10p) { double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); } @@ -1550,10 +1145,10 @@ public class MathUtils { * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) * @return */ - public static double log10MultinomialCoefficient(int n, int[] k) { + public static double log10MultinomialCoefficient(final int n, final int[] k) { double denominator = 0.0; for (int x : k) { - denominator += log10Factorial(x ); + denominator += log10Factorial(x); } return log10Factorial(n) - denominator; } @@ -1567,7 +1162,7 @@ public class MathUtils { * @param log10p array of log10 probabilities * @return */ - public static double log10MultinomialProbability(int n, int[] k, double[] log10p) { + public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { if (log10p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); double log10Prod = 0.0; @@ -1577,12 +1172,12 @@ public class MathUtils { return log10MultinomialCoefficient(n, k) + log10Prod; } - public static double factorial(int x) { + public static double factorial(final int x) { // avoid 
rounding errors caused by fact that 10^log(x) might be slightly lower than x and flooring may produce 1 less than real value return (double)Math.round(Math.pow(10, log10Factorial(x))); } - public static double log10Factorial(int x) { + public static double log10Factorial(final int x) { if (x >= log10FactorialCache.length || x < 0) return log10Gamma(x + 1); else @@ -1598,57 +1193,20 @@ public class MathUtils { */ @Requires("a.length == b.length") @Ensures("result.length == a.length") - public static int[] addArrays(int[] a, int[] b) { + public static int[] addArrays(final int[] a, final int[] b) { int[] c = new int[a.length]; for (int i = 0; i < a.length; i++) c[i] = a[i] + b[i]; return c; } - /** - * Quick implementation of the Knuth-shuffle algorithm to generate a random - * permutation of the given array. - * - * @param array the original array - * @return a new array with the elements shuffled - */ - public static Object[] arrayShuffle(Object[] array) { - int n = array.length; - Object[] shuffled = array.clone(); - for (int i = 0; i < n; i++) { - int j = i + GenomeAnalysisEngine.getRandomGenerator().nextInt(n - i); - Object tmp = shuffled[i]; - shuffled[i] = shuffled[j]; - shuffled[j] = tmp; - } - return shuffled; - } - - /** - * Vector operations - * - * @param v1 first numerical array - * @param v2 second numerical array - * @return a new array with the elements added - */ - public static Double[] vectorSum(E v1[], E v2[]) { - if (v1.length != v2.length) - throw new UserException("BUG: vectors v1, v2 of different size in vectorSum()"); - - Double[] result = new Double[v1.length]; - for (int k = 0; k < v1.length; k++) - result[k] = v1[k].doubleValue() + v2[k].doubleValue(); - - return result; - } - /** Same routine, unboxed types for efficiency * * @param x First vector * @param y Second vector * @return Vector of same length as x and y so that z[k] = x[k]+y[k] */ - public static double[] vectorSum(double[]x, double[] y) { + public static double[] 
vectorSum(final double[]x, final double[] y) { if (x.length != y.length) throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); @@ -1665,24 +1223,7 @@ public class MathUtils { * @param y Second vector * @return Vector of same length as x and y so that z[k] = x[k]-y[k] */ - public static double[] vectorDiff(double[]x, double[] y) { - if (x.length != y.length) - throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); - - double[] result = new double[x.length]; - for (int k=0; k Double[] scalarTimesVector(E a, E[] v1) { - - Double result[] = new Double[v1.length]; - for (int k = 0; k < v1.length; k++) - result[k] = a.doubleValue() * v1[k].doubleValue(); - - return result; - } - - public static Double dotProduct(E[] v1, E[] v2) { - if (v1.length != v2.length) - throw new UserException("BUG: vectors v1, v2 of different size in vectorSum()"); - - Double result = 0.0; - for (int k = 0; k < v1.length; k++) - result += v1[k].doubleValue() * v2[k].doubleValue(); - - return result; - } - - public static double dotProduct(double[] v1, double[] v2) { - if (v1.length != v2.length) - throw new UserException("BUG: vectors v1, v2 of different size in vectorSum()"); - - double result = 0.0; - for (int k = 0; k < v1.length; k++) - result += v1[k] * v2[k]; - - return result; - } - - public static double[] vectorLog10(double v1[]) { - double result[] = new double[v1.length]; - for (int k = 0; k < v1.length; k++) - result[k] = Math.log10(v1[k]); - - return result; - - } - - // todo - silly overloading, just because Java can't unbox/box arrays of primitive types, and we can't do generics with primitive types! - public static Double[] vectorLog10(Double v1[]) { - Double result[] = new Double[v1.length]; - for (int k = 0; k < v1.length; k++) - result[k] = Math.log10(v1[k]); - - return result; - - } - /** * Returns a series of integer values between start and stop, inclusive, * expontentially distributed between the two. 
That is, if there are @@ -1796,4 +1287,18 @@ public class MathUtils { return Double.isInfinite(d) || d > 0.0 ? 0.0 : d; } } + + /** + * Draw N random elements from list + * @param list - the list from which to draw randomly + * @param N - the number of elements to draw + */ + public static List randomSubset(final List list, final int N) { + if (list.size() <= N) { + return list; + } + + return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list); + } + } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 2c57e8b33..2560bcd11 100644 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -150,6 +150,21 @@ public class MathUtilsUnitTest extends BaseTest { @Test public void testLog10BinomialCoefficient() { logger.warn("Executing testLog10BinomialCoefficient"); + // note that we can test the binomial coefficient calculation indirectly via Newton's identity + // (1+z)^m = sum (m choose k)z^k + double[] z_vals = new double[]{0.999,0.9,0.8,0.5,0.2,0.01,0.0001}; + int[] exponent = new int[]{5,15,25,50,100}; + for ( double z : z_vals ) { + double logz = Math.log10(z); + for ( int exp : exponent ) { + double expected_log = exp*Math.log10(1+z); + double[] newtonArray_log = new double[1+exp]; + for ( int k = 0 ; k <= exp; k++ ) { + newtonArray_log[k] = MathUtils.log10BinomialCoefficient(exp,k)+k*logz; + } + Assert.assertEquals(MathUtils.log10sumLog10(newtonArray_log),expected_log,1e-6); + } + } Assert.assertEquals(MathUtils.log10BinomialCoefficient(4, 2), 0.7781513, 1e-6); Assert.assertEquals(MathUtils.log10BinomialCoefficient(10, 3), 2.079181, 1e-6); @@ -172,36 +187,19 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.log10Factorial(12), 8.680337, 1e-6); Assert.assertEquals(MathUtils.log10Factorial(200), 374.8969, 
1e-3); Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1); - } - - @Test(enabled = true) - public void testRandomSubset() { - Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - Assert.assertEquals(MathUtils.randomSubset(x, 0).length, 0); - Assert.assertEquals(MathUtils.randomSubset(x, 1).length, 1); - Assert.assertEquals(MathUtils.randomSubset(x, 2).length, 2); - Assert.assertEquals(MathUtils.randomSubset(x, 3).length, 3); - Assert.assertEquals(MathUtils.randomSubset(x, 4).length, 4); - Assert.assertEquals(MathUtils.randomSubset(x, 5).length, 5); - Assert.assertEquals(MathUtils.randomSubset(x, 6).length, 6); - Assert.assertEquals(MathUtils.randomSubset(x, 7).length, 7); - Assert.assertEquals(MathUtils.randomSubset(x, 8).length, 8); - Assert.assertEquals(MathUtils.randomSubset(x, 9).length, 9); - Assert.assertEquals(MathUtils.randomSubset(x, 10).length, 10); - Assert.assertEquals(MathUtils.randomSubset(x, 11).length, 10); - - for (int i = 0; i < 25; i++) - Assert.assertTrue(hasUniqueElements(MathUtils.randomSubset(x, 5))); - - } - - @Test(enabled = true) - public void testArrayShuffle() { - Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - for (int i = 0; i < 25; i++) { - Object[] t = MathUtils.arrayShuffle(x); - Assert.assertTrue(hasUniqueElements(t)); - Assert.assertTrue(hasAllElements(x, t)); + double log10factorial_small = 0; + double log10factorial_middle = 374.8969; + double log10factorial_large = 45138.26; + int small_start = 1; + int med_start = 200; + int large_start = 12342; + for ( int i = 1; i < 1000; i++ ) { + log10factorial_small += Math.log10(i+small_start); + log10factorial_middle += Math.log10(i+med_start); + log10factorial_large += Math.log10(i+large_start); + Assert.assertEquals(MathUtils.log10Factorial(small_start+i),log10factorial_small,1e-6); + Assert.assertEquals(MathUtils.log10Factorial(med_start+i),log10factorial_middle,1e-3); + Assert.assertEquals(MathUtils.log10Factorial(large_start+i),log10factorial_large,1e-1); } } @@ 
-286,17 +284,29 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - } - @Test - public void testNormalizeFromLog10() { - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, -1.0, -1.1, -7.8})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, 0.0, -0.1, -6.8})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[] {-8.9, -6.7, -9.4, 0.0, -8.9})); - - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.0}), new double[] {0.25, 0.25, 0.25, 0.25})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -1.0}), new double[] {0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -2.0}), new double[] {0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211})); + // magnitude of the sum doesn't matter, so we can combinatorially test this via partitions of unity + double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; + int[] n_partitions 
= new int[] {2,4,8,16,32,64,128,256,512,1028}; + for ( double alpha : mult_partitionFactor ) { + double log_alpha = Math.log10(alpha); + double log_oneMinusAlpha = Math.log10(1-alpha); + for ( int npart : n_partitions ) { + double[] multiplicative = new double[npart]; + double[] equal = new double[npart]; + double remaining_log = 0.0; // realspace = 1 + for ( int i = 0 ; i < npart-1; i++ ) { + equal[i] = -Math.log10(npart); + double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining + multiplicative[i] = piece; + remaining_log = remaining_log + log_oneMinusAlpha; + } + equal[npart-1] = -Math.log10(npart); + multiplicative[npart-1] = remaining_log; + Assert.assertEquals(MathUtils.approximateLog10SumLog10(equal),0.0,requiredPrecision,String.format("Did not sum to one: k=%d equal partitions.",npart)); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(multiplicative),0.0,requiredPrecision, String.format("Did not sum to one: k=%d multiplicative partitions with alpha=%f",npart,alpha)); + } + } } @Test @@ -342,12 +352,29 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - } - @Test - public void testDotProduct() { - Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0,-3.0,2.0}, new Double[]{6.0,7.0,8.0}),-35.0,1e-3); - Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0}, new Double[]{6.0}),-30.0,1e-3); + // magnitude 
of the sum doesn't matter, so we can combinatorially test this via partitions of unity + double[] mult_partitionFactor = new double[]{0.999,0.98,0.95,0.90,0.8,0.5,0.3,0.1,0.05,0.001}; + int[] n_partitions = new int[] {2,4,8,16,32,64,128,256,512,1028}; + for ( double alpha : mult_partitionFactor ) { + double log_alpha = Math.log10(alpha); + double log_oneMinusAlpha = Math.log10(1-alpha); + for ( int npart : n_partitions ) { + double[] multiplicative = new double[npart]; + double[] equal = new double[npart]; + double remaining_log = 0.0; // realspace = 1 + for ( int i = 0 ; i < npart-1; i++ ) { + equal[i] = -Math.log10(npart); + double piece = remaining_log + log_alpha; // take a*remaining, leaving remaining-a*remaining = (1-a)*remaining + multiplicative[i] = piece; + remaining_log = remaining_log + log_oneMinusAlpha; + } + equal[npart-1] = -Math.log10(npart); + multiplicative[npart-1] = remaining_log; + Assert.assertEquals(MathUtils.log10sumLog10(equal),0.0,requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(multiplicative),0.0,requiredPrecision,String.format("Did not sum to one: nPartitions=%d, alpha=%f",npart,alpha)); + } + } } @Test @@ -355,19 +382,4 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0,-3.0,2.0}, new double[]{6.0,7.0,8.0}),10.0,1e-3); Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0}, new double[]{6.0}),1.0,1e-3); } - - /** - * Private function used by testNormalizeFromLog10() - */ - private boolean compareDoubleArrays(double[] b1, double[] b2) { - if (b1.length != b2.length) { - return false; // sanity check - } - - for (int i = 0; i < b1.length; i++) { - if (MathUtils.compareDoubles(b1[i], b2[i]) != 0) - return false; - } - return true; - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java index 
d5231c30b..2470364c4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -120,12 +120,21 @@ public class BandPassActivityProfileUnitTest extends BaseTest { for( int iii = 0; iii < activeProbArray.length; iii++ ) { final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(profile.getFilteredSize() - iii, 0), Math.min(GaussianKernel.length, profile.getFilteredSize() + activeProbArray.length - iii)); final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - profile.getFilteredSize()), Math.min(activeProbArray.length,iii + profile.getFilteredSize() + 1)); - bandPassProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); + bandPassProbArray[iii] = dotProduct(activeProbSubArray, kernel); } return bandPassProbArray; } + public static double dotProduct(double[] v1, double[] v2) { + Assert.assertEquals(v1.length,v2.length,"Array lengths do not mach in dotProduct"); + double result = 0.0; + for (int k = 0; k < v1.length; k++) + result += v1[k] * v2[k]; + + return result; + } + @DataProvider(name = "BandPassComposition") public Object[][] makeBandPassComposition() { final List tests = new LinkedList(); From 8fbf9c947f60c2c8b6a95a4d107d70d492c1ec76 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 22 Mar 2013 18:25:47 -0400 Subject: [PATCH 115/226] Upgrades and changes to LeftAlignVariants, motivated by 1000G consensus indel production: -- Added ability to trim common bases in front of indels before left-aligning. Otherwise, records may not be left-aligned if they have common bases, as they will be mistaken by complext records. -- Added ability to split multiallelic records and then left align them, otherwise we miss a lot of good left-aligneable indels. -- Motivated by this, renamed walker to LeftAlignAndTrimVariants. 
-- Code refactoring, cleanup and bring up to latest coding standards. -- Added unit testing to make sure left alignment is performed correctly for all offsets. -- Changed phase 3 HC script to new syntax. Add command line options, more memory and reduce alt alleles because jobs keep crashing. --- ...tAlignAndTrimVariantsIntegrationTest.java} | 6 +- .../LeftAlignAndTrimVariantsUnitTest.java | 177 ++++++++++++++++++ ...nts.java => LeftAlignAndTrimVariants.java} | 114 ++++++++--- .../variant/GATKVariantContextUtils.java | 11 +- 4 files changed, 274 insertions(+), 34 deletions(-) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/{LeftAlignVariantsIntegrationTest.java => LeftAlignAndTrimVariantsIntegrationTest.java} (96%) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java rename public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/{LeftAlignVariants.java => LeftAlignAndTrimVariants.java} (65%) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java similarity index 96% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java index 721eb2874..0b3d9c930 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java @@ -52,14 +52,14 @@ import org.testng.annotations.Test; import java.util.Arrays; /** - * Tests LeftAlignVariants + * Tests LeftAlignAndTrimVariants */ -public class LeftAlignVariantsIntegrationTest 
extends WalkerTest { +public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { @Test public void testLeftAlignment() { WalkerTestSpec spec = new WalkerTestSpec( - "-T LeftAlignVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header", + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header", 1, Arrays.asList("bcf05f56adbb32a47b6d6b27b327d5c2")); executeTest("test left alignment", spec); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java new file mode 100644 index 000000000..a8739dac2 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java @@ -0,0 +1,177 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: delangel + * Date: 3/22/13 + * Time: 6:09 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class LeftAlignAndTrimVariantsUnitTest extends BaseTest { + final String refBases1 = "ACAGAGCTGACCCTCCCTCCCCTCTCCCAGTGCAACAGCACGGGCGGCGACTGCTTTTACCGAGGCTACACGTCAGGCGTGGCGGCTGTCCAGGACTGGTACCACTTCCACTATGTGGATCTCTGCTGAGGACCAGGAAAGCCAGCACCCGCAGAGACTCTTCCCCAGTGCTCCATACGATCACCATTCTCTGCAGAAGG"; + final String longPiece = "AAAAAAAAAAAAAAAAAAAAAAAAAAAA"; // where we'll perform tests + final String refBases = refBases1 + longPiece + refBases1; + + final int contigStop = refBases.length(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigStop ); + final String artificialContig = "chr1"; + final int locStart = refBases1.length(); // start position where we desire artificial variant + final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + final GenomeLoc window = genomeLocParser.createGenomeLoc(artificialContig,1,refBases.length()); + final String windowBases = refBases; + + + + @DataProvider(name = "LeftAlignDataProvider") + public Object[][] makeLeftAlignDataProvider() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + for ( int offset = 1; offset < longPiece.length(); offset++ ) { + for ( int indelSize = -longPiece.length()+offset; indelSize < longPiece.length()-offset; indelSize++ ) { + tests.add(new Object[]{offset, indelSize}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "LeftAlignDataProvider") + public void testLeftAlignNoTrimming(final int offset, final int indelSize) { + if (indelSize == 0) + return; + + final List alleles = new ArrayList(); + + if (indelSize < 0) { // deletion + alleles.add(Allele.create(Utils.dupString("A",Math.abs(indelSize)+1),true)); + alleles.add(Allele.create("A", false)); + } + else { + alleles.add(Allele.create("A", true)); + alleles.add(Allele.create(Utils.dupString("A",Math.abs(indelSize)+1),false)); + + } + final GenomeLoc 
loc = genomeLocParser.createGenomeLoc(artificialContig,locStart+offset,locStart+offset); + final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,windowBases.getBytes()); + + final VariantContext vc = new VariantContextBuilder("test", artificialContig, locStart+offset, locStart+offset+alleles.get(0).length()-1, alleles).make(); + final Pair result = LeftAlignAndTrimVariants.alignAndWrite(vc,referenceContext); + Assert.assertTrue(result.second == (offset>0?1:0)); + Assert.assertEquals(result.first.getStart(), locStart); + + + } + + @DataProvider(name = "TrimDataProvider") + public Object[][] makeTrimDataProvider() { + List tests = new ArrayList(); + + for ( int offset = 1; offset < longPiece.length(); offset++ ) { + for ( int indelSize = -longPiece.length()+offset; indelSize < longPiece.length()-offset; indelSize++ ) + tests.add(new Object[]{indelSize, offset}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimDataProvider") + public void testTrimming(final int indelSize, final int offset) { + if (indelSize == 0) + return; + + final List alleles = new ArrayList(); + + final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,locStart+offset,locStart+offset); + final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,windowBases.getBytes()); + + final int prefixLen = 10; + final String prefix = refBases.substring(locStart+offset-prefixLen,locStart+offset); + if (indelSize < 0) { // deletion + alleles.add(Allele.create(prefix+Utils.dupString("A",Math.abs(indelSize)+1),true)); + alleles.add(Allele.create(prefix+"A", false)); + } + else { + alleles.add(Allele.create(prefix+"A", true)); + alleles.add(Allele.create(prefix+Utils.dupString("A",Math.abs(indelSize)+1),false)); + + } + + final VariantContext vc = GATKVariantContextUtils.trimAlleles( new VariantContextBuilder("test", artificialContig, locStart + offset, locStart + offset + alleles.get(0).length() - 
1, alleles).make(),true,true); + if (indelSize>0) + Assert.assertEquals(vc.getReference().length(),1); + else + Assert.assertEquals(vc.getReference().length(),Math.abs(indelSize)+1); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java similarity index 65% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java index 700b34b38..25e3e9857 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -25,9 +25,12 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -38,8 +41,11 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeader; import 
org.broadinstitute.variant.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; @@ -55,14 +61,15 @@ import java.util.*; * Left-aligns indels from a variants file. * *

        - * LeftAlignVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be + * LeftAlignAndTrimVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. + * Optionally, the tool will also trim common bases from indels, leaving them with a minimum representation. * *

        Input

        *

        - * A variant set to left-align. + * A variant set to left-align and trim. *

        * *

        Output

        @@ -74,24 +81,39 @@ import java.util.*; *
          * java -Xmx2g -jar GenomeAnalysisTK.jar \
          *   -R ref.fasta \
        - *   -T LeftAlignVariants \
        + *   -T LeftAlignAndTrimVariants \
          *   --variant input.vcf \
          *   -o output.vcf
          * 
        * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) -@Reference(window=@Window(start=-200,stop=200)) -public class LeftAlignVariants extends RodWalker { +@Reference(window=@Window(start=-200,stop=200)) // WARNING: if this changes,MAX_INDEL_LENGTH needs to change as well! +public class LeftAlignAndTrimVariants extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + /** + * If this argument is set, bases common to all alleles will be removed, leaving only their minimal representation. + */ + @Argument(fullName="trimAlleles", shortName="trim", doc="Trim alleles to remove bases common to all of them", required=false) + protected boolean trimAlleles = false; + + /** + * If this argument is set, split multiallelic records and left-align individual alleles. + * If this argument is not set, multiallelic records are not attempted to left-align and will be copied as is. + */ + @Argument(fullName="splitMultiallelics", shortName="split", doc="Split multiallelic records and left-align individual alleles", required=false) + protected boolean splitMultiallelics = false; + + @Output(doc="File to which variants should be written") protected VariantContextWriter baseWriter = null; private VariantContextWriter writer; + private static final int MAX_INDEL_LENGTH = 200; // needs to match reference window size! 
public void initialize() { String trackName = variantCollection.variants.getName(); Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); @@ -110,8 +132,25 @@ public class LeftAlignVariants extends RodWalker { Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); int changedSites = 0; - for ( VariantContext vc : VCs ) - changedSites += alignAndWrite(vc, ref); + for ( final VariantContext vc : VCs ) { + // split first into biallelics, and optionally trim alleles to minimal representation + Pair result = new Pair(vc,0); // default value + if (splitMultiallelics) { + final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics( vc); + for (final VariantContext biallelicVC: vcList) { + final VariantContext v = (trimAlleles ? GATKVariantContextUtils.trimAlleles(vc,true,true):biallelicVC); + result = alignAndWrite(v, ref); + + } + } + else if (trimAlleles) + result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref); + else + result = alignAndWrite(vc,ref); + + writer.add(result.first); + changedSites += result.second; + } return changedSites; } @@ -127,18 +166,21 @@ public class LeftAlignVariants extends RodWalker { System.out.println(result + " variants were aligned"); } + /** + * Main routine workhorse. By definitio, it will only take biallelic vc's. Splitting into multiple alleles has to be + * handled by calling routine. + * @param vc Input VC with variants to left align + * @param ref Reference context + * @return # of records left-aligned (0 or 1) and new VC. 
+ */ + @Requires({"vc != null","ref != null", "vc.isBiallelic() == true","ref.getBases().length>=2*MAX_INDEL_LENGTH+1"}) + @Ensures({"result != null","result.first != null", "result.second >=0"}) + protected static Pair alignAndWrite(final VariantContext vc, final ReferenceContext ref) { - private int alignAndWrite(VariantContext vc, final ReferenceContext ref) { - if ( vc.isBiallelic() && vc.isIndel() && !vc.isComplexIndel() ) - return writeLeftAlignedIndel(vc, ref); - else { - writer.add(vc); - return 0; + final Pair retValue = new Pair(vc,0); + if (!vc.isIndel() || vc.isComplexIndel() ) { + return retValue; } - } - - private int writeLeftAlignedIndel(final VariantContext vc, final ReferenceContext ref) { - final byte[] refSeq = ref.getBases(); // get the indel length final int indelLength; @@ -147,13 +189,20 @@ public class LeftAlignVariants extends RodWalker { else indelLength = vc.getAlternateAllele(0).length() - 1; - if ( indelLength > 200 ) { - writer.add(vc); - return 0; - } + if ( indelLength > MAX_INDEL_LENGTH ) + return retValue; + + if (vc.getReference().getBases()[0] != vc.getAlternateAllele(0).getBases()[0]) + return retValue; + + final byte[] refSeq = ref.getBases(); + + // create an indel haplotype. + // + final int originalIndex = vc.getStart() - ref.getWindow().getStart() + 1; + if (originalIndex < 0 || originalIndex >= ref.getBases().length) + return retValue; - // create an indel haplotype - final int originalIndex = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; final byte[] originalIndel = makeHaplotype(vc, refSeq, originalIndex, indelLength); // create a CIGAR string to represent the event @@ -178,15 +227,24 @@ public class LeftAlignVariants extends RodWalker { System.arraycopy((vc.isSimpleDeletion() ? 
refSeq : originalIndel), indelIndex, newBases, 1, indelLength); final Allele newAllele = Allele.create(newBases, vc.isSimpleDeletion()); newVC = updateAllele(newVC, newAllele); + // overwrite default return value with new left-aligned VC + retValue.first = newVC; + retValue.second = 1; - writer.add(newVC); - return 1; - } else { - writer.add(vc); - return 0; } + return retValue; } + /** + * Make a haplotype from a given alt allele, using bases in input reference, index of an input reference + * @param vc Input VC - will use only alt allele from it + * @param ref Ref bases + * @param indexOfRef Index in ref where to create indel + * @param indelLength Indel length + * @return + */ + @Requires({"vc != null","ref != null", "indexOfRef +indelLength < ref.length", "vc.getNAlleles() == 2"}) + @Ensures("result != null") private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) { byte[] hap = new byte[ref.length + (indelLength * (vc.isSimpleDeletion() ? 
-1 : 1))]; diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index dee282056..0bd30c3a4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -436,11 +436,15 @@ public class GATKVariantContextUtils { // the genotypes with PLs final GenotypesContext oldGTs = vc.getGenotypes(); + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + // optimization: if no input genotypes, just exit + if (oldGTs.isEmpty()) + return newGTs; + // samples final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); @@ -1007,7 +1011,8 @@ public class GATKVariantContextUtils { final int revTrim = trimReverse ? computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes()) : 0; final VariantContext revTrimVC = trimAlleles(inputVC, -1, revTrim); final int fwdTrim = trimForward ? computeForwardClipping(revTrimVC.getAlleles()) : -1; - return trimAlleles(revTrimVC, fwdTrim, 0); + final VariantContext vc= trimAlleles(revTrimVC, fwdTrim, 0); + return vc; } /** From 74a17359a8fcefddb5bb2e264f0dd2f5777b9871 Mon Sep 17 00:00:00 2001 From: Chris Hartl Date: Fri, 29 Mar 2013 14:52:10 -0400 Subject: [PATCH 116/226] MathUtils.randomSubset() now uses Collections.shuffle() (indirectly, through the other methods that are tested), resulting in slightly different numbers of calls to the RNG, and ultimately different sets of selected variants. 
This commits updates the md5 values for the validation site selector integration test to reflect these new random subsets of variants that are selected. --- .../ValidationSiteSelectorIntegrationTest.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java index ff9896307..a3d9121d0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java @@ -79,7 +79,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(sampleNone + freqUnif + "--variant " + testfile), 1, - Arrays.asList("b8a988757ac1f206d123140da5a3e778") + Arrays.asList("658c70cbb93faed8ca18e51cd6dd593f") ); executeTest("testNoSampleSelectionFreqUniform--" + testfile, spec); @@ -91,7 +91,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(sampleNone + freqAF + "--variant " + testfile), 1, - Arrays.asList("542d5d5ff8c64da7b077bab4b950a9a3") + Arrays.asList("90411433ea42846352b767da735af53b") ); executeTest("testNoSampleSelectionFreqAF--" + testfile, spec); @@ -103,7 +103,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(sampleGT + freqUnif + "--variant " + testfile), 1, - Arrays.asList("7385b17eed7f4ff0f6e82e60c3334ce7") + Arrays.asList("2afabd447185cf017f60c85380902117") ); executeTest("testPolyGTFreqUniform--" + testfile, spec); @@ -115,7 +115,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = 
new WalkerTestSpec( baseTestString(sampleGT + freqAF + "--variant " + testfile), 1, - Arrays.asList("0ee4a565a0d4f6b6942abd72a373becd") + Arrays.asList("381e1a2f0e1908b4d7cba5d6361cf5aa") ); executeTest("testPolyGTFreqAF--" + testfile, spec); @@ -127,7 +127,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(sampleGL + freqAF + "--variant " + testfile), 1, - Arrays.asList("0ee4a565a0d4f6b6942abd72a373becd") + Arrays.asList("381e1a2f0e1908b4d7cba5d6361cf5aa") ); executeTest("testPolyGLFreqAF--" + testfile, spec); From 0de6f55660a0c0dd462308e877dcb68c50f164b3 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 26 Mar 2013 15:42:41 -0400 Subject: [PATCH 117/226] PairHMM rework The current implementation of the PairHMM had issues with the probabilities and the state machines. Probabilities were not adding up to one because: # Initial conditions were not being set properly # Emission probabilities in the last row were not adding up to 1 The following commit fixes both by # averaging all potential start locations (giving an equal prior to the state machine in it's first iteration -- allowing the read to start it's alignment anywhere in the haplotype with equal probability) # discounting all paths that end in deletions by not adding the last row of the deletion matrix and summing over all paths ending in matches and insertions (this saves us from a fourth matrix to represent the end state) Summarized changes: * Fix LoglessCachingPairHMM and Log10PairHMM according to the new algorithm * Refactor probabilities check to throw exception if we ever encounter probabilities greater than 1. 
* Rename LoglessCachingPairHMM to LoglessPairHMM (this is the default implementation in the HC now) * Rename matrices to matchMatrix, insertionMatrix and deletionMatrix for clarity * Rename metric lengths to read and haplotype lengths for clarity * Rename private methods to initializePriors (distance) and initializeProbabilities (constants) for clarity * Eliminate first row constants (because they're not used anyway!) and directly assign initial conditions in the deletionMatrix * Remove unnecessary parameters from updateCell() * Fix the expected probabilities coming from the exact model in PairHMMUnitTest * Neatify PairHMM class (removed unused methods) and PairHMMUnitTest (removed unused variables) * Update MD5s: Probabilities have changed according to the new PairHMM model and as expected HC and UG integration tests have new MD5s. [fix 47164949] --- .../LikelihoodCalculationEngine.java | 2 +- ...achingPairHMM.java => LoglessPairHMM.java} | 103 +++++++----------- ...perGeneralPloidySuite1IntegrationTest.java | 4 +- ...perGeneralPloidySuite2IntegrationTest.java | 5 +- ...dGenotyperIndelCallingIntegrationTest.java | 16 +-- ...GenotyperNormalCallingIntegrationTest.java | 8 +- ...dGenotyperReducedReadsIntegrationTest.java | 2 +- ...lexAndSymbolicVariantsIntegrationTest.java | 11 +- .../HaplotypeCallerIntegrationTest.java | 16 +-- .../NanoSchedulerIntegrationTest.java | 2 +- .../sting/utils/pairhmm/PairHMMUnitTest.java | 37 ++++--- .../sting/utils/pairhmm/Log10PairHMM.java | 32 +++--- .../sting/utils/pairhmm/PairHMM.java | 64 +++-------- 13 files changed, 130 insertions(+), 172 deletions(-) rename protected/java/src/org/broadinstitute/sting/utils/pairhmm/{LoglessCachingPairHMM.java => LoglessPairHMM.java} (74%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 5eaaba0dd..dc5fed340 100644 
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -80,7 +80,7 @@ public class LikelihoodCalculationEngine { pairHMM = new Log10PairHMM(false); break; case LOGLESS_CACHING: - pairHMM = new LoglessCachingPairHMM(); + pairHMM = new LoglessPairHMM(); break; default: throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING."); diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java similarity index 74% rename from protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java rename to protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index 24d6e1220..93a7f63d0 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -55,25 +55,14 @@ import org.broadinstitute.sting.utils.QualityUtils; * User: rpoplin, carneiro * Date: 10/16/12 */ -public class LoglessCachingPairHMM extends PairHMM { +public class LoglessPairHMM extends PairHMM { protected static final double SCALE_FACTOR_LOG10 = 300.0; + protected static final double INITIAL_CONDITION = Math.pow(10, SCALE_FACTOR_LOG10); - double[][] constantMatrix = null; // The cache - double[][] distanceMatrix = null; // The cache + double[][] transition = null; // The cache + double[][] prior = null; // The cache boolean constantsAreInitialized = false; - /** - * Cached data structure that describes the first row's edge condition in the HMM - */ - protected static final double [] firstRowConstantMatrix = { - 
QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)), - QualityUtils.qualToProb(DEFAULT_GCP), - QualityUtils.qualToErrorProb(DEFAULT_GOP), - QualityUtils.qualToErrorProb(DEFAULT_GCP), - 1.0, - 1.0 - }; - /** * {@inheritDoc} */ @@ -81,8 +70,8 @@ public class LoglessCachingPairHMM extends PairHMM { public void initialize( final int haplotypeMaxLength, final int readMaxLength) { super.initialize(haplotypeMaxLength, readMaxLength); - constantMatrix = new double[X_METRIC_MAX_LENGTH][6]; - distanceMatrix = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; + transition = new double[paddedMaxReadLength][6]; + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** @@ -98,8 +87,8 @@ public class LoglessCachingPairHMM extends PairHMM { final int hapStartIndex, final boolean recacheReadValues ) { if ( ! constantsAreInitialized || recacheReadValues ) - initializeConstants( haplotypeBases.length, readBases.length, insertionGOP, deletionGOP, overallGCP ); - initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + initializeProbabilities(haplotypeBases.length, insertionGOP, deletionGOP, overallGCP); + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); // NOTE NOTE NOTE -- because of caching we need to only operate over X and Y according to this // read and haplotype lengths, not the max lengths @@ -109,14 +98,19 @@ public class LoglessCachingPairHMM extends PairHMM { for (int i = 2; i < readXMetricLength; i++) { // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based for (int j = hapStartIndex+1; j < hapYMetricLength; j++) { - updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + updateCell(i, j, prior[i][j], transition[i]); } } - // final probability is the log10 sum of the last element in all three state arrays + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore 
all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. final int endI = readXMetricLength - 1; - final int endJ = hapYMetricLength - 1; - return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10; + double finalSumProbabilities = 0.0; + for (int j = 0; j < hapYMetricLength; j++) { + finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j]; + } + return Math.log10(finalSumProbabilities) - SCALE_FACTOR_LOG10; } /** @@ -128,10 +122,7 @@ public class LoglessCachingPairHMM extends PairHMM { * @param readQuals the base quality scores of the read * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) */ - public void initializeDistanceMatrix( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final int startIndex ) { + public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. @@ -141,7 +132,7 @@ public class LoglessCachingPairHMM extends PairHMM { final byte qual = readQuals[i]; for (int j = startIndex; j < haplotypeBases.length; j++) { final byte y = haplotypeBases[j]; - distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + prior[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); } } @@ -150,46 +141,36 @@ public class LoglessCachingPairHMM extends PairHMM { /** * Initializes the matrix that holds all the constants related to quality scores. 
* - * @param haplotypeSize the number of bases in the haplotype we are testing - * @param readSize the number of bases in the read we are testing * @param insertionGOP insertion quality scores of the read * @param deletionGOP deletion quality scores of the read * @param overallGCP overall gap continuation penalty */ @Requires({ - "haplotypeSize > 0", - "readSize > 0", - "insertionGOP != null && insertionGOP.length == readSize", - "deletionGOP != null && deletionGOP.length == readSize", - "overallGCP != null && overallGCP.length == readSize" + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" }) @Ensures("constantsAreInitialized") - private void initializeConstants( final int haplotypeSize, - final int readSize, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP ) { + private void initializeProbabilities(final int haplotypeLength, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { // the initial condition -- must be here because it needs that actual read and haplotypes, not the maximum in init - matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10) / getNPotentialXStarts(haplotypeSize, readSize); + final double initialValue = INITIAL_CONDITION / haplotypeLength; + matchMatrix[1][1] = initialValue; // fill in the first row - for( int jjj = 2; jjj < Y_METRIC_MAX_LENGTH; jjj++ ) { - updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + for( int jjj = 2; jjj < paddedMaxHaplotypeLength; jjj++ ) { + deletionMatrix[1][jjj] = initialValue; } final int l = insertionGOP.length; - constantMatrix[1] = firstRowConstantMatrix; for (int i = 0; i < l; i++) { final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); - constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); - constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); - constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); 
- constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); - constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); - constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); + transition[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); + transition[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); + transition[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); + transition[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); + transition[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); + transition[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); } - constantMatrix[l+1][4] = 1.0; - constantMatrix[l+1][5] = 1.0; // note that we initialized the constants constantsAreInitialized = true; @@ -204,18 +185,14 @@ public class LoglessCachingPairHMM extends PairHMM { * @param indI row index in the matrices to update * @param indJ column index in the matrices to update * @param prior the likelihood editing distance matrix for the read x haplotype - * @param constants an array with the six constants relevant to this location - * @param matchMetricArray the matches likelihood matrix - * @param XMetricArray the insertions likelihood matrix - * @param YMetricArray the deletions likelihood matrix + * @param transitition an array with the six transitition relevant to this location */ - private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + private void updateCell( final int indI, final int indJ, final double prior, final double[] transitition) { - matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] + - XMetricArray[indI - 1][indJ - 1] * constants[1] + - YMetricArray[indI - 1][indJ - 1] * constants[1] ); - XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * 
constants[3]; - YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5]; + matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transitition[0] + + insertionMatrix[indI - 1][indJ - 1] * transitition[1] + + deletionMatrix[indI - 1][indJ - 1] * transitition[1] ); + insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transitition[2] + insertionMatrix[indI - 1][indJ] * transitition[3]; + deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transitition[4] + deletionMatrix[indI][indJ - 1] * transitition[5]; } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index ef9f483ff..5cdc2c65f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; -import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.*; +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.LSV_ALLELES; /** * Created by IntelliJ IDEA. 
@@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "5812da66811887d834d0379a33e655c0"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "faadc0b77a91a716dbb1191fd579d025"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index dc9220b7e..4299b024b 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -49,7 +49,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; -import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.*; +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.CEUTRIO_BAM; +import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.NA12891_CALLS; public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest { @@ -57,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","3a321896c4b8b6457973c76c486da4d4"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 
3","LSV_INDEL_DISC_NOREF_p3","INDEL","fe715b715526a7c1ebd575ff66bba716"); } @Test(enabled = true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 8d0c1f04f..d0d77c8e0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -72,7 +72,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("1cb469b9cc8e6c70430021540bf1af8b")); + Arrays.asList("51e022d07ead45a4e154f949b6642e84")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -87,7 +87,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("c7e59f9ab718df4c604626a0f51af606")); + Arrays.asList("1d9c6fda344eeee76cbe4221251dc341")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("b6ad80cef63cab4f75fa4b1fb2517d1d")); + Arrays.asList("2ec7262f0a3d04534ce1fe15cc79f52e")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -110,7 +110,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 
20:10,000,000-10,100,000", 1, - Arrays.asList("86880ec78755ae91cb5bb34a0631a32c")); + Arrays.asList("3131cd7c49b623983a106db5228754b3")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("2584d5e3ade1b548f1fe9cdcafbe1b28")); + Arrays.asList("273f5daa936e93da98efd6ceb37d7533")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("939da0bb73b706badd8a0def7446b384")); + Arrays.asList("00a003a0908281384e981294434a9f3e")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -175,7 +175,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("556c214366e82e4682e753ce93307a4e")); + Arrays.asList("87521a1bde124c7c5908ed067060fe45")); executeTest("test minIndelFraction 0.0", spec); } @@ -183,7 +183,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " 
-minIndelFrac 0.25", 1, - Arrays.asList("1df02b805d9dfbd532fa3632875a989d")); + Arrays.asList("8a880b8b1662e31e0b5c65733eac6b74")); executeTest("test minIndelFraction 0.25", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 2512dd5c2..c10b3d6df 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("2f15ef1ead56d875a3f1d53772f52b3a")); + Arrays.asList("3f8ee598c9b85aa1d2b85746ad46c1af")); executeTest("test MultiSample Pilot1", spec); } @@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("39ec0b48cd51d797af7ed09cb9ba607e")); + Arrays.asList("31c0f0074b3306b54170056e93b69e11")); executeTest("test Multiple SNP alleles", spec); } @@ -112,7 +112,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + 
b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("eb9604b77a7d6baab60c81ac3db5e47b")); + Arrays.asList("753d6358b1634107de76900200116805")); executeTest("test reverse trim", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("6b77b8f1002ec577bf0482fbe03222a4")); + Arrays.asList("274eadae8a630a3fda9281d6d6253dea")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index 0620f15df..b63c591ce 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -74,7 +74,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "9a702e7a85465f6c42d6c1828aee6c38"); + testReducedCalling("INDEL", "c5939a7f5f85ea2fe994ce912732e180"); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 3aaffdeaa..9400b3dd2 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -49,10 +49,11 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; -import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.*; - import java.util.Arrays; +import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.NA12878_CHR20_BAM; +import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.REF; + public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { private void HCTestComplexVariants(String bam, String args, String md5) { @@ -63,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9fa4d3c88fd9c0f23c7a3ddd3d24a8c"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a898b551f78c71befee4d12070d3a788"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -87,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "e8ffbfae3c1af5be02631a31f386a431"); + "8a110549543412fa682419e9a8f0dd1d"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "c3a98b19efa7cb36fe5f5f2ab893ef56"); + "5429c234d471434adc09d9e60b87de24"); } } diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index c5614d405..c416938cd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "45856ad67bfe8d8bea45808d8258bcf1"); + HCTest(CEUTRIO_BAM, "", "008958c211a8a439a7213a96f3dd7f6c"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "b6c93325f851ac358ea49260fb11b75c"); + HCTest(NA12878_BAM, "", "3b60c6133eeadfea028dffea93b88478"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "4ca6b560d0569cdca400d3e50915e211"); + "70bd5d0805bf6f51e5f61b377526c979"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5d06ec5502d3f157964bd7b275d6a0cb"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4141b4c24a136a3fe4c0b0a4c231cdfa"); } @Test @@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String 
base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("53a50dae68f0175ca3088dea1d3bb881")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("35a8edeca7518835d67a10de21493eca")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("d3bc6adde8cd9514ae5c49cd366d5de4")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c81d7e69dd4116890f06a71b19870300")); executeTest("HCTestStructuralIndels: ", spec); } @@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("4adb833ed8af20224b76bba61e2b0d93")); + Arrays.asList("f0a215faed194dc160f19e26293e85f8")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -148,7 +148,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("1704b0901c86f8f597d931222d5c8dd8")); + Arrays.asList("bea274584344fa6b4b0f98eee327bad8")); 
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index 555c02cde..96eaa109e 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct }); + tests.add(new Object[]{ "BOTH", "9a1202d849653f0480932f450ec507b4", nt, nct }); } return tests.toArray(new Object[][]{}); diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index c94674c98..6dbcd0220 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -70,7 +70,7 @@ public class PairHMMUnitTest extends BaseTest { final static boolean EXTENSIVE_TESTING = true; final PairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation final PairHMM originalHMM = new Log10PairHMM(false); // the reference implementation - final PairHMM loglessHMM = new LoglessCachingPairHMM(); + final PairHMM loglessHMM = new LoglessPairHMM(); private List getHMMs() { return Arrays.asList(exactHMM, originalHMM, loglessHMM); @@ -116,13 +116,12 @@ public class PairHMMUnitTest extends BaseTest { return 
String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual); } - public double expectedLogL(final PairHMM hmm) { - return (expectedQual / -10.0) + 0.03 + - hmm.getNPotentialXStartsLikelihoodPenaltyLog10(refBasesWithContext.length, readBasesWithContext.length); + public double expectedLogL() { + return (expectedQual / -10.0) + 0.03 + Math.log10(1.0/refBasesWithContext.length); } public double getTolerance(final PairHMM hmm) { - if ( hmm instanceof LoglessCachingPairHMM ) + if ( hmm instanceof LoglessPairHMM) return toleranceFromExact(); if ( hmm instanceof Log10PairHMM ) { return ((Log10PairHMM)hmm).isDoingExactLog10Calculations() ? toleranceFromExact() : toleranceFromReference(); @@ -150,7 +149,7 @@ public class PairHMMUnitTest extends BaseTest { qualAsBytes(gcp, false, anchorIndel), 0, true); } - private final byte[] asBytes(final String bases, final boolean left, final boolean right) { + private byte[] asBytes(final String bases, final boolean left, final boolean right) { return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? 
RIGHT_FLANK : "")).getBytes(); } @@ -163,7 +162,7 @@ public class PairHMMUnitTest extends BaseTest { // update just the bases corresponding to the provided micro read with the quality scores if( doGOP ) { - phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + phredQuals[CONTEXT.length()] = (byte)phredQual; } else { for ( int i = 0; i < read.length(); i++) phredQuals[i + CONTEXT.length()] = (byte)phredQual; @@ -270,7 +269,7 @@ public class PairHMMUnitTest extends BaseTest { final double exactLogL = cfg.calcLogL( exactHMM, true ); for ( final PairHMM hmm : getHMMs() ) { double actualLogL = cfg.calcLogL( hmm, true ); - double expectedLogL = cfg.expectedLogL(hmm); + double expectedLogL = cfg.expectedLogL(); // compare to our theoretical expectation with appropriate tolerance Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm); @@ -322,8 +321,8 @@ public class PairHMMUnitTest extends BaseTest { System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); - // - log10 is because of number of start positions - Assert.assertEquals(res1, -2.0 - Math.log10(originalHMM.getNPotentialXStarts(haplotype1.length, mread.length)), 1e-2); + final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20)); + Assert.assertEquals(res1, expected, 1e-2); } } @@ -354,8 +353,8 @@ public class PairHMMUnitTest extends BaseTest { System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); - // - log10 is because of number of start positions - Assert.assertEquals(res1, -2.0 - Math.log10(originalHMM.getNPotentialXStarts(haplotype1.length, mread.length)), 1e-2); + final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20)); + Assert.assertEquals(res1, expected, 1e-2); } } @@ -406,8 
+405,14 @@ public class PairHMMUnitTest extends BaseTest { Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), Utils.dupBytes(gcp, readBases.length), 0, true); - final double expected = Math.log10(Math.pow(1.0 - QualityUtils.qualToErrorProb(baseQual), readBases.length)); - Assert.assertEquals(d, expected, 1e-3, "Likelihoods should sum to just the error prob of the read"); + double expected = 0; + final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length; + if (readBases.length < refBases.length) { + expected = Math.log10(initialCondition * Math.pow(QualityUtils.qualToProb(baseQual), readBases.length)); + } else if (readBases.length > refBases.length) { + expected = Math.log10(initialCondition * Math.pow(QualityUtils.qualToProb(baseQual), refBases.length) * Math.pow(QualityUtils.qualToErrorProb(insQual), readBases.length - refBases.length)); + } + Assert.assertEquals(d, expected, 1e-3, "Likelihoods should sum to just the error prob of the read " + String.format("readSize=%d refSize=%d", readSize, refSize)); } @DataProvider(name = "HMMProviderWithBigReads") @@ -472,7 +477,7 @@ public class PairHMMUnitTest extends BaseTest { Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), Utils.dupBytes(gcp, readBases.length), 0, true); - loglessHMM.dumpMatrices(); +// loglessHMM.dumpMatrices(); } @DataProvider(name = "JustHMMProvider") @@ -610,7 +615,7 @@ public class PairHMMUnitTest extends BaseTest { public Object[][] makeUninitializedHMMs() { List tests = new ArrayList(); - tests.add(new Object[]{new LoglessCachingPairHMM()}); + tests.add(new Object[]{new LoglessPairHMM()}); tests.add(new Object[]{new Log10PairHMM(true)}); return tests.toArray(new Object[][]{}); diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java index 62793bc54..d7c55e37c 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -67,10 +67,10 @@ public class Log10PairHMM extends PairHMM { public void initialize( final int haplotypeMaxLength, final int readMaxLength) { super.initialize(haplotypeMaxLength, readMaxLength); - for( int iii=0; iii < X_METRIC_MAX_LENGTH; iii++ ) { - Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + for( int iii=0; iii < paddedMaxReadLength; iii++ ) { + Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); } } @@ -88,7 +88,8 @@ public class Log10PairHMM extends PairHMM { final boolean recacheReadValues ) { // the initial condition -- must be in subComputeReadLikelihoodGivenHaplotypeLog10 because it needs that actual // read and haplotypes, not the maximum - matchMetricArray[1][1] = getNPotentialXStartsLikelihoodPenaltyLog10(haplotypeBases.length, readBases.length); + final double initialValue = Math.log10((double) 1/haplotypeBases.length); + matchMatrix[1][1] = initialValue; // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment final int X_METRIC_LENGTH = readBases.length + 2; @@ -104,14 +105,17 @@ public class Log10PairHMM extends PairHMM { for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { if( (iii == 1 && jjj == 1) ) { continue; } updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, - matchMetricArray, XMetricArray, YMetricArray); + matchMatrix, insertionMatrix, deletionMatrix); } } // final probability is the log10 sum of the last element in all three state arrays final int 
endI = X_METRIC_LENGTH - 1; - final int endJ = Y_METRIC_LENGTH - 1; - return myLog10SumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); + double result = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); + for (int j = 2; j < Y_METRIC_LENGTH; j++) + result = myLog10SumLog10(new double[]{result, matchMatrix[endI][j], insertionMatrix[endI][j]}); + + return result; } /** @@ -134,7 +138,7 @@ public class Log10PairHMM extends PairHMM { private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + final double[][] matchMatrix, final double[][] insertionMatrix, final double[][] deletionMatrix ) { // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions final int im1 = indI - 1; @@ -151,18 +155,18 @@ public class Log10PairHMM extends PairHMM { final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); - matchMetricArray[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI - 1][indJ - 1] + d0, XMetricArray[indI - 1][indJ - 1] + e0, YMetricArray[indI - 1][indJ - 1] + e0}); + matchMatrix[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + d0, insertionMatrix[indI - 1][indJ - 1] + e0, deletionMatrix[indI - 1][indJ - 1] + e0}); // update the X (insertion) array final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - XMetricArray[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI - 1][indJ] + d1, XMetricArray[indI - 1][indJ] + e1}); + insertionMatrix[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ] + d1, insertionMatrix[indI - 1][indJ] + e1}); // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype - final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); - final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double d2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]); + final double e2 = ( im1 == 0 ) ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]); final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - YMetricArray[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI][indJ - 1] + d2, YMetricArray[indI][indJ - 1] + e2}); + deletionMatrix[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMatrix[indI][indJ - 1] + d2, deletionMatrix[indI][indJ - 1] + e2}); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index 0d2eb0c1c..bd3360370 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -25,10 +25,12 @@ package org.broadinstitute.sting.utils.pairhmm; -import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Arrays; /** * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
@@ -52,11 +54,11 @@ public abstract class PairHMM { LOGLESS_CACHING } - protected double[][] matchMetricArray = null; - protected double[][] XMetricArray = null; - protected double[][] YMetricArray = null; + protected double[][] matchMatrix = null; + protected double[][] insertionMatrix = null; + protected double[][] deletionMatrix = null; protected int maxHaplotypeLength, maxReadLength; - protected int X_METRIC_MAX_LENGTH, Y_METRIC_MAX_LENGTH; + protected int paddedMaxReadLength, paddedMaxHaplotypeLength; private boolean initialized = false; /** @@ -72,12 +74,12 @@ public abstract class PairHMM { maxReadLength = readMaxLength; // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - X_METRIC_MAX_LENGTH = readMaxLength + 2; - Y_METRIC_MAX_LENGTH = haplotypeMaxLength + 2; + paddedMaxReadLength = readMaxLength + 2; + paddedMaxHaplotypeLength = haplotypeMaxLength + 2; - matchMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; - XMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; - YMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; + matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; initialized = true; } @@ -124,21 +126,17 @@ public abstract class PairHMM { double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues); - // TODO -- remove max when PairHMM no longer returns likelihoods >= 0 - result = Math.min(result, 0.0); - if ( MathUtils.goodLog10Probability(result) ) return result; else - throw new IllegalStateException("Bad likelihoods detected: " + result); -// return result; + throw new ReviewedStingException("PairHMM 
Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); } /** * To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10 */ @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", - "readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"}) + "readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"}) protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, @@ -148,41 +146,13 @@ public abstract class PairHMM { final int hapStartIndex, final boolean recacheReadValues ); - /** - * How many potential starting locations are a read with readSize bases against a haplotype with haplotypeSize bases? 
- * - * for example, a 3 bp read against a 5 bp haplotype could potentially start at 1, 2, 3 = 5 - 3 + 1 = 3 - * the max value is necessary in the case where the read is longer than the haplotype, in which case - * there's a single unique start site by assumption - * - * @param haplotypeSize the number of bases in the haplotype we are testing - * @param readSize the number of bases in the read we are testing - * @return a positive integer >= 1 - */ - @Ensures("result >= 1") - protected int getNPotentialXStarts(final int haplotypeSize, final int readSize) { - return Math.max(haplotypeSize - readSize + 1, 1); - } - - /** - * The the log10 probability penalty for the number of potential start sites of the read aginst the haplotype - * - * @param haplotypeSize the number of bases in the haplotype we are testing - * @param readSize the number of bases in the read we are testing - * @return a log10 probability - */ - @Ensures("MathUtils.goodLog10Probability(result)") - protected double getNPotentialXStartsLikelihoodPenaltyLog10(final int haplotypeSize, final int readSize) { - return - Math.log10(getNPotentialXStarts(haplotypeSize, readSize)); - } - /** * Print out the core hmm matrices for debugging */ protected void dumpMatrices() { - dumpMatrix("matchMetricArray", matchMetricArray); - dumpMatrix("XMetricArray", XMetricArray); - dumpMatrix("YMetricArray", YMetricArray); + dumpMatrix("matchMetricArray", matchMatrix); + dumpMatrix("insertionMatrix", insertionMatrix); + dumpMatrix("deletionMatrix", deletionMatrix); } /** From 6b8bed34d01856b4ae4fb45154b1f9fd3c9dba64 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sat, 30 Mar 2013 19:05:04 -0400 Subject: [PATCH 118/226] Big bad bug fix: feature added to LeftAlignAndTrimVariants to left align multiallelic records didn't work. -- Corrected logic to pick biallelic vc to left align. -- Added integration test to make sure this feature is tested and feature to trim bases is also tested. 
--- ...ftAlignAndTrimVariantsIntegrationTest.java | 10 ++++++++++ .../LeftAlignAndTrimVariants.java | 20 +++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java index 0b3d9c930..a7d32d43b 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java @@ -64,4 +64,14 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { Arrays.asList("bcf05f56adbb32a47b6d6b27b327d5c2")); executeTest("test left alignment", spec); } + + @Test + public void testLeftAlignmentWithTrimmingAndMultialleliecs() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -trim -split", + 1, + Arrays.asList("4ae03954f8bd66e73fd005c49ea301db")); + executeTest("test left alignment with trimming and hard multiple alleles", spec); + + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java index 25e3e9857..9168d17f0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -138,18 +138,22 @@ public class LeftAlignAndTrimVariants extends RodWalker { if (splitMultiallelics) { final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics( vc); for (final VariantContext 
biallelicVC: vcList) { - final VariantContext v = (trimAlleles ? GATKVariantContextUtils.trimAlleles(vc,true,true):biallelicVC); + final VariantContext v = (trimAlleles ? GATKVariantContextUtils.trimAlleles(biallelicVC,true,true):biallelicVC); result = alignAndWrite(v, ref); - + writer.add(result.first); + changedSites += result.second; } } - else if (trimAlleles) - result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref); - else - result = alignAndWrite(vc,ref); + else { + if (trimAlleles) + result = alignAndWrite(GATKVariantContextUtils.trimAlleles(vc,true,true), ref); + else + result = alignAndWrite(vc,ref); + writer.add(result.first); + changedSites += result.second; + + } - writer.add(result.first); - changedSites += result.second; } return changedSites; From 68bf4705243b32a12cc0e2cb03bddb5fd29330f1 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 30 Mar 2013 20:00:45 -0400 Subject: [PATCH 119/226] making LoglessPairHMM final --- .../org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index 93a7f63d0..d94893e3e 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -55,7 +55,7 @@ import org.broadinstitute.sting.utils.QualityUtils; * User: rpoplin, carneiro * Date: 10/16/12 */ -public class LoglessPairHMM extends PairHMM { +public final class LoglessPairHMM extends PairHMM { protected static final double SCALE_FACTOR_LOG10 = 300.0; protected static final double INITIAL_CONDITION = Math.pow(10, SCALE_FACTOR_LOG10); From 52e67a69730f43f38075f6c4b7fb3398fa1b1774 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 30 Mar 2013 20:11:55 -0400 Subject: [PATCH 120/226] 
ReviewedStingException -> IllegalStateException --- .../org/broadinstitute/sting/utils/pairhmm/PairHMM.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index bd3360370..f71819a69 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Arrays; @@ -129,7 +128,7 @@ public abstract class PairHMM { if ( MathUtils.goodLog10Probability(result) ) return result; else - throw new ReviewedStingException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); + throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); } /** @@ -185,8 +184,8 @@ public abstract class PairHMM { * @return the index of the first position in haplotype1 and haplotype2 where the byte isn't the same */ public static int findFirstPositionWhereHaplotypesDiffer(final byte[] haplotype1, final byte[] haplotype2) { - if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + haplotype1); - if ( haplotype2 == null || haplotype2.length == 0 ) throw new IllegalArgumentException("Haplotype2 is bad " + haplotype2); + if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + Arrays.toString(haplotype1)); + 
if ( haplotype2 == null || haplotype2.length == 0 ) throw new IllegalArgumentException("Haplotype2 is bad " + Arrays.toString(haplotype2)); for( int iii = 0; iii < haplotype1.length && iii < haplotype2.length; iii++ ) { if( haplotype1[iii] != haplotype2[iii] ) { From ec475a46b16f55092040bc99eca4badcbc69b02c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 29 Mar 2013 10:10:13 -0400 Subject: [PATCH 121/226] Fixing @PG tag uniqueness issue The Problem: ------------ the SAM spec does not allow multiple @PG tags with the same id. Our @PG tag writing routines were allowing that to happen with the boolean parameter "keep_all_pg_records". How this fixes it: ------------------ This commit removes that option from all the utility functions and cleans up the code around the classes that used these methods off-spec. Summarized changes: ------------------- * Remove keep_all_pg_records option from setupWriter utility methods in Util * Update all walkers to now replace the last @PG tag of the same walker (if it already exists) * Cleanup NWaySamFileWriter now that it doesn't need to keep track of the keep_all_pg_records variable * Simplify the multiple implementations to setupWriter Bamboo: ------- http://gsabamboo.broadinstitute.org/browse/GSAUNSTABLE-PARALLEL31 Issue Tracker: -------------- [fixes 47100885] --- .../compression/reducereads/ReduceReads.java | 39 ++++--- .../gatk/walkers/readutils/PrintReads.java | 6 +- .../org/broadinstitute/sting/utils/Utils.java | 100 ++++-------------- .../sting/utils/sam/NWaySAMFileWriter.java | 2 +- 4 files changed, 42 insertions(+), 105 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 5e9429284..c9730e95a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -123,7 +123,7 @@ public class ReduceReads extends ReadWalker, Redu * The number of bases to keep around mismatches (potential variation) */ @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false) - private int contextSize = 10; + public int contextSize = 10; /** * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have @@ -131,7 +131,7 @@ public class ReduceReads extends ReadWalker, Redu * towards variable regions. */ @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false) - private int minMappingQuality = 20; + public int minMappingQuality = 20; /** * The minimum base quality to be considered for the consensus synthetic read. Reads that have @@ -139,14 +139,14 @@ public class ReduceReads extends ReadWalker, Redu * towards variable regions. */ @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false) - private byte minBaseQual = 20; + public byte minBaseQual = 20; /** * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality * lower than this threshold will be hard clipped off before entering the reduce reads algorithm. */ @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false) - private byte minTailQuality = 2; + public byte minTailQuality = 2; /** * Any number of VCF files representing known SNPs to be used for the experimental polyploid-based reduction. @@ -161,21 +161,21 @@ public class ReduceReads extends ReadWalker, Redu * and read group). */ @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false) - private boolean DONT_SIMPLIFY_READS = false; + public boolean DONT_SIMPLIFY_READS = false; /** * Do not hard clip adaptor sequences. 
Note: You don't have to turn this on for reads that are not mate paired. * The program will behave correctly in those cases. */ @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false) - private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; + public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; /** * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail * quality. */ @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false) - private boolean DONT_CLIP_LOW_QUAL_TAILS = false; + public boolean DONT_CLIP_LOW_QUAL_TAILS = false; /** * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped @@ -183,7 +183,7 @@ public class ReduceReads extends ReadWalker, Redu * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual) */ @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false) - private boolean DONT_USE_SOFTCLIPPED_BASES = false; + public boolean DONT_USE_SOFTCLIPPED_BASES = false; /** * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee @@ -191,55 +191,55 @@ public class ReduceReads extends ReadWalker, Redu * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. */ @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false) - private boolean DONT_COMPRESS_READ_NAMES = false; + public boolean DONT_COMPRESS_READ_NAMES = false; /** * Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval * border. 
*/ @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false) - private boolean HARD_CLIP_TO_INTERVAL = false; + public boolean HARD_CLIP_TO_INTERVAL = false; /** * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be * considered consensus. */ @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) - private double minAltProportionToTriggerVariant = 0.05; + public double minAltProportionToTriggerVariant = 0.05; /** * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be * considered consensus. */ @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false) - private double minIndelProportionToTriggerVariant = 0.05; + public double minIndelProportionToTriggerVariant = 0.05; /** * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this). * A value of 0 turns downsampling off. 
*/ @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false) - private int downsampleCoverage = 250; + public int downsampleCoverage = 250; @Hidden @Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false) - private boolean nwayout = false; + public boolean nwayout = false; @Hidden @Argument(fullName = "", shortName = "dl", doc = "", required = false) - private int debugLevel = 0; + public int debugLevel = 0; @Hidden @Argument(fullName = "", shortName = "dr", doc = "", required = false) - private String debugRead = ""; + public String debugRead = ""; @Hidden @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false) - private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; + public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; @Hidden @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) - private boolean NO_PG_TAG = false; + public boolean NO_PG_TAG = false; public enum DownsampleStrategy { Normal, @@ -282,7 +282,6 @@ public class ReduceReads extends ReadWalker, Redu final boolean preSorted = true; final boolean indexOnTheFly = true; - final boolean keep_records = true; final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; if (nwayout) { SAMProgramRecord programRecord = NO_PG_TAG ? 
null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME); @@ -292,7 +291,7 @@ public class ReduceReads extends ReadWalker, Redu writerToUse = out; out.setPresorted(false); if (!NO_PG_TAG) { - Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java index 16afc18fa..add567b36 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.baq.BAQ; @@ -151,7 +150,7 @@ public class PrintReads extends ReadWalker impleme @Hidden @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) - private boolean NO_PG_TAG = false; + public boolean NO_PG_TAG = false; List readTransformers = Collections.emptyList(); private TreeSet samplesToChoose = new TreeSet(); @@ -166,7 +165,6 @@ public class PrintReads extends ReadWalker impleme * The initialize function. 
*/ public void initialize() { - final boolean keep_records = true; final GenomeAnalysisEngine toolkit = getToolkit(); if ( platform != null ) @@ -192,7 +190,7 @@ public class PrintReads extends ReadWalker impleme final boolean preSorted = true; if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { - Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index e50025ea1..ff0ea958c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -29,7 +29,6 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.util.StringUtil; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; @@ -87,9 +86,7 @@ public class Utils { * @return True if the two objects are equal, false otherwise. 
*/ public static boolean equals(Object lhs, Object rhs) { - if (lhs == null && rhs == null) return true; - else if (lhs == null) return false; - else return lhs.equals(rhs); + return lhs == null && rhs == null || lhs != null && lhs.equals(rhs); } public static List cons(final T elt, final List l) { @@ -128,35 +125,6 @@ public class Utils { logger.warn(String.format("* %s", builder)); } - public static ArrayList subseq(char[] fullArray) { - byte[] fullByteArray = new byte[fullArray.length]; - StringUtil.charsToBytes(fullArray, 0, fullArray.length, fullByteArray, 0); - return subseq(fullByteArray); - } - - public static ArrayList subseq(byte[] fullArray) { - return subseq(fullArray, 0, fullArray.length - 1); - } - - public static ArrayList subseq(byte[] fullArray, int start, int end) { - assert end < fullArray.length; - ArrayList dest = new ArrayList(end - start + 1); - for (int i = start; i <= end; i++) { - dest.add(fullArray[i]); - } - return dest; - } - - public static String baseList2string(List bases) { - byte[] basesAsbytes = new byte[bases.size()]; - int i = 0; - for (Byte b : bases) { - basesAsbytes[i] = b; - i++; - } - return new String(basesAsbytes); - } - /** * join the key value pairs of a map into one string, i.e. myMap = [A->1,B->2,C->3] with a call of: * joinMap("-","*",myMap) -> returns A-1*B-2*C-3 @@ -255,7 +223,6 @@ public class Utils { * Create a new list that contains the elements of left along with elements elts * @param left a non-null list of elements * @param elts a varargs vector for elts to append in order to left - * @param * @return A newly allocated linked list containing left followed by elts */ public static List append(final List left, T ... 
elts) { @@ -267,9 +234,9 @@ public class Utils { /** * Returns a string of the values in joined by separator, such as A,B,C * - * @param separator - * @param doubles - * @return + * @param separator separator character + * @param doubles the array with values + * @return a string with the values separated by the separator */ public static String join(String separator, double[] doubles) { if ( doubles == null || doubles.length == 0) @@ -486,7 +453,7 @@ public class Utils { return rcbases; } - static public final List reverse(final List l) { + static public List reverse(final List l) { final List newL = new ArrayList(l); Collections.reverse(newL); return newL; @@ -525,10 +492,8 @@ public class Utils { /** * Helper utility that calls into the InetAddress system to resolve the hostname. If this fails, * unresolvable gets returned instead. - * - * @return */ - public static final String resolveHostname() { + public static String resolveHostname() { try { return InetAddress.getLocalHost().getCanonicalHostName(); } @@ -555,17 +520,15 @@ public class Utils { * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets * up the writer with the header and presorted status. 
* - * @param toolkit the engine * @param originalHeader original header - * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file * @param programRecord the program record for this program */ - public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, SAMProgramRecord programRecord) { - SAMFileHeader header = originalHeader.clone(); - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); + public static SAMFileHeader setupWriter(final SAMFileHeader originalHeader, final SAMProgramRecord programRecord) { + final SAMFileHeader header = originalHeader.clone(); + final List oldRecords = header.getProgramRecords(); + final List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId()))) newRecords.add(record); if (programRecord != null) { @@ -580,14 +543,13 @@ public class Utils { * the new header to be added to the BAM writer. 
* * @param toolkit the engine - * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file * @param walker the walker object (so we can extract the command line) * @param PROGRAM_RECORD_NAME the name for the PG tag * @return a pre-filled header for the bam writer */ - public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + public static SAMFileHeader setupWriter(final GenomeAnalysisEngine toolkit, final SAMFileHeader originalHeader, final Object walker, final String PROGRAM_RECORD_NAME) { final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); - return setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, programRecord); + return setupWriter(originalHeader, programRecord); } /** @@ -597,12 +559,11 @@ public class Utils { * @param writer BAM file writer * @param toolkit the engine * @param preSorted whether or not the writer can assume reads are going to be added are already sorted - * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file * @param walker the walker object (so we can extract the command line) * @param PROGRAM_RECORD_NAME the name for the PG tag */ - public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { - SAMFileHeader header = setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, walker, PROGRAM_RECORD_NAME); + public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, Object walker, String PROGRAM_RECORD_NAME) { + SAMFileHeader header = setupWriter(toolkit, originalHeader, walker, PROGRAM_RECORD_NAME); writer.writeHeader(header); 
writer.setPresorted(preSorted); } @@ -629,23 +590,11 @@ public class Utils { return programRecord; } - public static Collection makeCollection(Iterable iter) { - Collection list = new ArrayList(); - for (E item : iter) { - list.add(item); - } - return list; - } - /** * Returns the number of combinations represented by this collection * of collection of options. * * For example, if this is [[A, B], [C, D], [E, F, G]] returns 2 * 2 * 3 = 12 - * - * @param options - * @param - * @return */ @Requires("options != null") public static int nCombinations(final Collection[] options) { @@ -676,21 +625,18 @@ public class Utils { * if N = 1 => [[A], [B], [C]] * if N = 2 => [[A, A], [B, A], [C, A], [A, B], [B, B], [C, B], [A, C], [B, C], [C, C]] * - * @param objects - * @param n - * @param + * @param objects list of objects + * @param n size of each combination * @param withReplacement if false, the resulting permutations will only contain unique objects from objects - * @return + * @return a list with all combinations with size n of objects. 
*/ public static List> makePermutations(final List objects, final int n, final boolean withReplacement) { final List> combinations = new ArrayList>(); - if ( n <= 0 ) - ; - else if ( n == 1 ) { + if ( n == 1 ) { for ( final T o : objects ) combinations.add(Collections.singletonList(o)); - } else { + } else if (n > 1) { final List> sub = makePermutations(objects, n - 1, withReplacement); for ( List subI : sub ) { for ( final T a : objects ) { @@ -738,9 +684,6 @@ public class Utils { /** * Create a constant map that maps each value in values to itself - * @param values - * @param - * @return */ public static Map makeIdentityFunctionMap(Collection values) { Map map = new HashMap(values.size()); @@ -756,9 +699,6 @@ public class Utils { * groupSize = 2 * result = [[A, B], [C, D], [E]] * - * @param list - * @param groupSize - * @return */ public static List> groupList(final List list, final int groupSize) { if ( groupSize < 1 ) throw new IllegalArgumentException("groupSize >= 1"); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index d26a1f807..4cd361ba1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -141,7 +141,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { File f = new File(outName); - SAMFileHeader header = Utils.setupWriter(toolkit, toolkit.getSAMFileHeader(id), KEEP_ALL_PG_RECORDS, programRecord); + SAMFileHeader header = Utils.setupWriter(toolkit.getSAMFileHeader(id), programRecord); SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(indexOnTheFly); factory.setCreateMd5File(generateMD5); From 
8e2094d2aff6d056d2402ac520bdd0d005c43e43 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 25 Mar 2013 12:02:12 -0400 Subject: [PATCH 122/226] Updated AssessReducedQuals and applied it systematically to all ReduceReads integration tests. * Moved to protected for packaging purposes. * Cleaned up and removed debugging output. * Fixed logic for epsilons so that we really only test significant differences between BAMs. * Other small fixes (e.g. don't include low quality reduced reads in overall qual). * Most RR integration tests now automatically run the quals test on output. * A few are disabled because we expect them to fail in various locations (e.g. due to downsampling). --- .../gatk/walkers/qc/AssessReducedQuals.java | 199 ++++++++++++++++++ .../ReduceReadsIntegrationTest.java | 104 ++++++--- 2 files changed, 271 insertions(+), 32 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java new file mode 100644 index 000000000..597077742 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java @@ -0,0 +1,199 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.List; + +/** + * Emits intervals in which the differences between the original and reduced bam quals are bigger epsilon (unless the quals of + * the reduced bam are above sufficient threshold) + * + *

        Input

        + *

        + * The original and reduced BAM files. + *

        + * + *

        Output

        + *

        + * A list of intervals in which the differences between the original and reduced bam quals are bigger epsilon. + *

        + * + *

        Examples

        + *
        + * java -Xmx2g -jar GenomeAnalysisTK.jar \
        + *   -I:original original.bam \
        + *   -I:reduced reduced.bam \
        + *   -R ref.fasta \
        + *   -T AssessReducedQuals \
        + *   -o output.intervals
        + * 
        + * + * @author ami + */ + +public class AssessReducedQuals extends LocusWalker implements TreeReducible { + + private static final String reduced = "reduced"; + private static final int originalQualsIndex = 0; + private static final int reducedQualsIndex = 1; + + @Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false) + public int sufficientQualSum = 600; + + @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > (epsilon * Quals_original_bam) we output this interval", required = false) + public double qual_epsilon = 0.25; + + @Output + protected PrintStream out; + + public void initialize() { + if ( qual_epsilon < 0.0 || qual_epsilon > 1.0 ) + throw new UserException.BadArgumentValue("qual_epsilon", "must be a number between 0 and 1"); + } + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + @Override + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return null; + + boolean reportLocus; + final int[] quals = getPileupQuals(context.getBasePileup()); + final int epsilon = MathUtils.fastRound(quals[originalQualsIndex] * qual_epsilon); + final int calcOriginalQuals = Math.min(quals[originalQualsIndex], sufficientQualSum); + final int calcReducedQuals = Math.min(quals[reducedQualsIndex], sufficientQualSum); + final int originalReducedQualDiff = calcOriginalQuals - calcReducedQuals; + reportLocus = originalReducedQualDiff > epsilon || originalReducedQualDiff < -1 * epsilon; + + return reportLocus ? 
ref.getLocus() : null; + } + + private int[] getPileupQuals(final ReadBackedPileup readPileup) { + + final int[] quals = new int[2]; + + for ( final PileupElement p : readPileup ) { + final List tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags(); + if ( isGoodRead(p) ) { + final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount(); + final int tagIndex = getTagIndex(tags); + quals[tagIndex] += tempQual; + } + } + + return quals; + } + + private boolean isGoodRead(final PileupElement p) { + return !p.isDeletion() && (int)p.getQual() >= 20 && p.getMappingQual() >= 20; + } + + private int getTagIndex(final List tags) { + return tags.contains(reduced) ? 1 : 0; + } + + @Override + public void onTraversalDone(GenomeLoc sum) { + if ( sum != null ) + out.println(sum); + } + + @Override + public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { + if ( lhs == null ) + return rhs; + + if ( rhs == null ) + return lhs; + + // if contiguous, just merge them + if ( lhs.contiguousP(rhs) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); + + // otherwise, print the lhs and start over with the rhs + out.println(lhs); + return rhs; + } + + @Override + public GenomeLoc reduceInit() { + return null; + } + + @Override + public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { + if ( value == null ) + return sum; + + if ( sum == null ) + return value; + + // if contiguous, just merge them + if ( sum.contiguousP(value) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); + + // otherwise, print the sum and start over with the value + out.println(sum); + return value; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 
de95b5e9a..65e930b89 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -74,6 +74,10 @@ public class ReduceReadsIntegrationTest extends WalkerTest { final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e"; protected Pair, List> executeTest(final String name, final WalkerTestSpec spec) { + return executeTest(name, spec, false); + } + + protected Pair, List> executeTest(final String name, final WalkerTestSpec spec, final boolean disableQualsTest) { final Pair, List> result = super.executeTest(name, spec); // perform some Reduce Reads specific testing now @@ -83,46 +87,76 @@ public class ReduceReadsIntegrationTest extends WalkerTest { spec.disableImplicitArgs(); final String[] originalArgs = spec.getArgsWithImplicitArgs().split(" "); - final StringBuilder newArgs = new StringBuilder(); - for ( int i = 0; i < originalArgs.length; i++ ) { - final String arg = originalArgs[i]; - if ( arg.equals("-T") ) { - newArgs.append("-T AssessReducedCoverage "); - } else if ( arg.startsWith("-I") ) { - newArgs.append("-I:original "); - newArgs.append(originalArgs[++i]); - newArgs.append(" "); - } else if ( arg.equals("-R") || arg.equals("-L") ) { - newArgs.append(arg); - newArgs.append(" "); - newArgs.append(originalArgs[++i]); - newArgs.append(" "); - } - } + final StringBuilder reducedInputs = new StringBuilder(); for ( final File file : result.getFirst() ) { - newArgs.append("-I:reduced "); - newArgs.append(file.getAbsolutePath()); - newArgs.append(" "); + reducedInputs.append(" -I:reduced "); + reducedInputs.append(file.getAbsolutePath()); } - newArgs.append("-o %s"); - super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(newArgs.toString(), Arrays.asList(emptyFileMd5))); + // run the coverage test + final String coverageCommand = 
createCommandLine("AssessReducedCoverage", originalArgs); + super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); + + // run the quals test + if ( !disableQualsTest ) { + final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs); + super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); + } } return result; } + /* + * Generate a new command-line based on the old one + * + * @param walkerName the new walker name to use + * @param originalArgs the original arguments used for the test + * @return the new command line + */ + private String createCommandLine(final String walkerName, final String[] originalArgs) { + + final StringBuilder newArgs = new StringBuilder(); + + for ( int i = 0; i < originalArgs.length; i++ ) { + final String arg = originalArgs[i]; + + if ( arg.equals("-T") ) { + newArgs.append("-T "); + newArgs.append(walkerName); + } else if ( arg.startsWith("-I") ) { + newArgs.append("-I:original "); + newArgs.append(originalArgs[++i]); + } else if ( arg.equals("-R") || arg.equals("-L") ) { + newArgs.append(arg); + newArgs.append(" "); + newArgs.append(originalArgs[++i]); + } + + // always add a trailing space + newArgs.append(" "); + } + + newArgs.append("-o %s"); + + return newArgs.toString(); + } + protected Pair, List> executeTestWithoutAdditionalRRTests(final String name, final WalkerTestSpec spec) { return super.executeTest(name, spec); } private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? 
" -known " + DBSNP : "") + " "; - WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5)); - executeTest(testName, spec); + this.RRTest(testName, args, md5, useKnowns, false); } - @Test(enabled = true) + private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final boolean disableQualsTest) { + String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? " -known " + DBSNP : "") + " "; + WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5)); + executeTest(testName, spec, disableQualsTest); + } + + @Test(enabled = true) public void testDefaultCompression() { RRTest("testDefaultCompression ", L, "538362abd504200800145720b23c98ce", false); } @@ -156,12 +190,14 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", false); + // too much downsampling for quals test + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", false, true); } @Test(enabled = true) public void testLowCompressionWithKnowns() { - RRTest("testLowCompressionWithKnowns ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", true); + // too much downsampling for quals test + RRTest("testLowCompressionWithKnowns ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", true, true); } @Test(enabled = true) @@ -174,19 +210,22 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - 
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bfe0693aea74634f1035a9bd11302517"))); + // don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB) + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bfe0693aea74634f1035a9bd11302517")), true); } @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("930ec2e2c3b62bec7a2425a82c64f022"))); + // don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB) + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("930ec2e2c3b62bec7a2425a82c64f022")), true); } @Test(enabled = true) public void testCoReductionWithKnowns() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; - executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fe7c9fd35e50a828e0f38a7ae25b60a7"))); + // don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB) + executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fe7c9fd35e50a828e0f38a7ae25b60a7")), true); } @Test(enabled = true) @@ -216,7 +255,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; - // we expect to lose coverage due to the downsampling so don't run the systematic coverage test + // we 
expect to lose coverage due to the downsampling so don't run the systematic tests executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bd5198a3e21034887b741faaaa3964bf"))); } @@ -237,7 +276,8 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testPairedReadsInVariantRegion() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + " -o %s --downsample_coverage 250 -dcov 50 "; - executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9bed260b6245f5ff47db8541405504aa"))); + // don't use quals test here (there's one location with low quals that won't pass; signed off by EB) + executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9bed260b6245f5ff47db8541405504aa")), true); } } From 9686e91a51819960a3c59a29560b3b8ad3b9c19c Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 25 Mar 2013 17:13:40 -0400 Subject: [PATCH 123/226] Added small feature to VariantFiltration to filter sites outside of a given mask: -- Sometimes it's desirable to specify a set of "good" regions and filter out other stuff (like say an alignability mask or a "good regions" mask). But by default, the -mask argument in VF will only filter sites inside a particular mask. New argument -filterNotInMask will reverse default logic and filter outside of a given mask. -- Added integration test, and made sure we also test with a BED rod. 
--- .../VariantFiltrationIntegrationTest.java | 9 ++++++++ .../walkers/filters/VariantFiltration.java | 21 +++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 84729647a..9de190f5f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -98,6 +98,15 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { executeTest("test mask extend", spec3); } + @Test + public void testMaskReversed() { + WalkerTestSpec spec3 = new WalkerTestSpec( + baseTestString() + " -maskName outsideGoodSites -filterNotInMask --mask:BED " + privateTestDir + "goodMask.bed --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + Arrays.asList("e65d27c13953fc3a77dcad27a4357786")); + executeTest("test filter sites not in mask", spec3); + } + + @Test public void testFilter1() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java index 8feb9101c..362b49f68 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java @@ -87,9 +87,10 @@ public class VariantFiltration extends RodWalker { protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** - * Any variant which overlaps entries from the provided mask rod will be filtered. 
+ * Any variant which overlaps entries from the provided mask rod will be filtered. If the user wants logic to be reversed, + * i.e. filter variants that do not overlap with provided mask, then argument -filterNotInMask can be used. */ - @Input(fullName="mask", doc="Input ROD mask", required=false) + @Input(fullName="mask", shortName="mask", doc="Input ROD mask", required=false) public RodBinding mask; @Output(doc="File to which variants should be written") @@ -140,6 +141,14 @@ public class VariantFiltration extends RodWalker { @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false) protected String MASK_NAME = "Mask"; + /** + * By default, if the -mask argument is used, any variant falling in a mask will be filtered. + * If this argument is used, logic is reversed, and variants falling outside a given mask will be filtered. + * Use case is, for example, if we have an interval list or BED file with "good" sites. + */ + @Argument(fullName="filterNotInMask", shortName="filterNotInMask", doc="Filter records NOT in given input mask.", required=false) + protected boolean filterRecordsNotInMask = false; + /** * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing. * Use this argument to have it evaluate as failing filters instead for these cases. 
@@ -186,7 +195,9 @@ public class VariantFiltration extends RodWalker { hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); if ( mask.isBound() ) { - hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask")); + if (filterRecordsNotInMask) + hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Doesn't overlap a user-input mask")); + else hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask")); } writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); @@ -199,6 +210,8 @@ public class VariantFiltration extends RodWalker { if ( MASK_EXTEND < 0 ) throw new UserException.BadArgumentValue("maskExtension", "negative values are not allowed"); + if (filterRecordsNotInMask && !mask.isBound()) + throw new UserException.BadArgumentValue("filterNotInMask","argument not allowed if mask argument is not provided"); filterExps = VariantContextUtils.initializeMatchExps(FILTER_NAMES, FILTER_EXPS); genotypeFilterExps = VariantContextUtils.initializeMatchExps(GENOTYPE_FILTER_NAMES, GENOTYPE_FILTER_EXPS); @@ -223,7 +236,7 @@ public class VariantFiltration extends RodWalker { Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); // is there a SNP mask present? - boolean hasMask = tracker.hasValues(mask); + boolean hasMask = (tracker.hasValues(mask) && !filterRecordsNotInMask) || (filterRecordsNotInMask && !tracker.hasValues(mask)); if ( hasMask ) previousMaskPosition = ref.getLocus(); // multi-base masks will get triggered over all bases of the mask From f65206e75861d8295ddcf18374ab785a2c0e6aca Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 1 Apr 2013 10:20:04 -0400 Subject: [PATCH 124/226] Two changes to HC GGA mode to make it more like the UG. -- Only try to genotype PASSing records in the alleles file -- Don't attempt to genotype multiple records with the same start location. Instead take the first record and throw a warning message. 
--- .../gatk/walkers/haplotypecaller/HaplotypeCaller.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index c379b34dc..da077ff02 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -451,12 +451,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) { - if( !allelesToGenotype.contains(vc) ) { - allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object - } - } - if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { + final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles); + if( vcFromAllelesRod != null ) { + allelesToGenotype.add(vcFromAllelesRod); // save for later for processing during the ActiveRegion's map call. 
Should be folded into a RefMetaDataTracker object return new ActivityProfileState(ref.getLocus(), 1.0); } } From c191d7de8c15fac2a23cbc7e86bd2cf369906a34 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 2 Apr 2013 09:20:34 -0400 Subject: [PATCH 125/226] Critical bugfix for CommonSuffixSplitter -- Graphs with cycles from the bottom node to one of the middle nodes would introduce an infinite cycle in the algorithm. Created unit test that reproduced the issue, and then fixed the underlying issue. --- .../graphs/CommonSuffixSplitter.java | 7 +++- .../graphs/CommonSuffixMergerUnitTest.java | 9 +++++ .../graphs/CommonSuffixSplitterUnitTest.java | 35 +++++++++++++++++-- 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java index f3a41ee8b..dabfbb322 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Requires; +import java.io.File; import java.util.*; /** @@ -150,10 +151,14 @@ public class CommonSuffixSplitter { * @return true if we can safely split up toMerge */ private boolean safeToSplit(final SeqGraph graph, final SeqVertex bot, final Collection toMerge) { + final Set outgoingOfBot = new HashSet(graph.outgoingVerticesOf(bot)); for ( final SeqVertex m : toMerge ) { final Set outs = graph.outgoingEdgesOf(m); if ( m == bot || outs.size() != 1 || ! 
graph.outgoingVerticesOf(m).contains(bot) ) - // m == bot => don't allow cycles in the graph + // m == bot => don't allow self cycles in the graph + return false; + if ( outgoingOfBot.contains(m) ) + // forbid cycles from bottom -> mid return false; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java index 012add769..8682ae5e4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -72,6 +72,15 @@ public class CommonSuffixMergerUnitTest extends BaseTest { this.v = v; this.commonSuffix = commonSuffix; } + + @Override + public String toString() { + return "SplitMergeData{" + + "graph=" + graph + + ", v=" + v + + ", commonSuffix='" + commonSuffix + '\'' + + '}'; + } } public static Object[][] makeSplitMergeData(final int maxTests) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java index f03dc8762..8006cb18d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java @@ -51,13 +51,18 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.util.Arrays; + public class CommonSuffixSplitterUnitTest extends BaseTest { + private final static boolean DEBUG = false; + @DataProvider(name = "SplitData") public Object[][] makeSplitData() { 
return CommonSuffixMergerUnitTest.makeSplitMergeData(-1); } - @Test(dataProvider = "SplitData") + @Test(dataProvider = "SplitData", enabled = !DEBUG) public void testSplit(final CommonSuffixMergerUnitTest.SplitMergeData data) { final boolean expectedMerge = ! data.commonSuffix.isEmpty() && data.graph.inDegreeOf(data.v) > 1; @@ -74,7 +79,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { CommonSuffixMergerUnitTest.assertSameHaplotypes(String.format("suffixSplit.%s.%d", data.commonSuffix, data.graph.vertexSet().size()), data.graph, original); } - @Test + @Test(enabled = !DEBUG) public void testSplitPrevHaveMultipleEdges() { final SeqGraph original = new SeqGraph(); final SeqVertex v1 = new SeqVertex("A"); @@ -93,7 +98,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { Assert.assertFalse(new CommonSuffixSplitter().split(original, v3), "Cannot split graph with multiple outgoing edges from middle nodes"); } - @Test + @Test(enabled = !DEBUG) public void testSplitNoCycles() { final SeqGraph original = new SeqGraph(); final SeqVertex v1 = new SeqVertex("A"); @@ -110,4 +115,28 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { original.addEdges(v4, v4); Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list"); } + + @Test(timeOut = 10000) + public void testSplitComplexCycle() { + final SeqGraph original = new SeqGraph(); + final SeqVertex r1 = new SeqVertex("ACTG"); + final SeqVertex r2 = new SeqVertex("ATGC"); + final SeqVertex cat1 = new SeqVertex("CAT"); + final SeqVertex cat2 = new SeqVertex("CAT"); + final SeqVertex c1 = new SeqVertex("C"); + final SeqVertex c2 = new SeqVertex("C"); + + original.addVertices(r1, r2, cat1, cat2, c1, c2); + original.addEdges(r1, cat1, c1, cat2, c1); + original.addEdges(r2, c2, cat2); + + original.printGraph(new File("testSplitComplexCycle.dot"), 0); + + for ( final SeqVertex v : Arrays.asList(cat2) ) { // original.vertexSet() ) { + 
final SeqGraph graph = (SeqGraph)original.clone(); + final boolean success = new CommonSuffixSplitter().split(graph, v); + if ( success ) graph.printGraph(new File("testSplitComplexCycle.fail.dot"), 0); + Assert.assertFalse(success, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v); + } + } } From 5baf906c28d3653dac8d775e89f5c7da71eade89 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 1 Apr 2013 15:32:40 -0400 Subject: [PATCH 126/226] Intervals: fix bug where we could fail to find the intersection of unsorted/missorted interval lists -The algorithm for finding the intersection of two sets of intervals relies on the sortedness of the intervals within each set, but the engine was not sorting the intervals before attempting to find the intersection. -The result was that if one or both interval lists was unsorted / lexicographically sorted, we would often fail to find the intersection correctly. -Now the IntervalBinding sorts all sets of intervals before returning them, solving the problem. -Added an integration test for this case. 
GSA-909 #resolve --- .../sting/commandline/IntervalBinding.java | 1 + .../utils/interval/IntervalIntegrationTest.java | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java index 7f419abb2..9253e1ee5 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java @@ -98,6 +98,7 @@ public final class IntervalBinding { intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals); } + Collections.sort(intervals); return intervals; } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 98ecd0f43..69466d163 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -285,4 +285,20 @@ public class IntervalIntegrationTest extends WalkerTest { Arrays.asList(md5)); executeTest("testSymbolicAlleles", spec); } + + @Test + public void testIntersectionOfLexicographicallySortedIntervals() { + final String md5 = "18be9375e5a753f766616a51eb6131f0"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + " -T CountLoci" + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -R " + b37KGReference + + " -L " + privateTestDir + "lexicographicallySortedIntervals.bed" + + " -L 4" + + " -isr INTERSECTION" + + " -o %s", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntersectionOfLexicographicallySortedIntervals", spec); + } } From b4b58a3968f5b838b5c91b1630fe2e35d1432823 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 2 Apr 2013 14:24:23 -0400 Subject: [PATCH 127/226] Fix unprintable 
character in a comment from the BaseEdge class Compiler warnings about this were starting to get to me... --- .../sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java index 6076626f6..be5a431c4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseEdge.java @@ -118,7 +118,7 @@ public class BaseEdge { } /** - * Does this and edge have the same source and target vertices in graph? + * Does this and edge have the same source and target vertices in graph? * * @param graph the graph containing both this and edge * @param edge our comparator edge From bb42c90f2b8b5de41ecb941597146ccf33c29c7c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 2 Apr 2013 15:31:09 -0400 Subject: [PATCH 128/226] Use LinkedHashSets in incoming and outgoing vertex functions in BaseGraph -- Using a LinkedHashSet changed the md5 for HCTestComplexVariants. 
--- .../sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java | 4 ++-- ...lotypeCallerComplexAndSymbolicVariantsIntegrationTest.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 1d294e591..5d591fd5c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -348,7 +348,7 @@ public class BaseGraph extends DefaultDirectedGraph outgoingVerticesOf(final T v) { - final Set s = new HashSet(); + final Set s = new LinkedHashSet(); for ( final BaseEdge e : outgoingEdgesOf(v) ) { s.add(getEdgeTarget(e)); } @@ -361,7 +361,7 @@ public class BaseGraph extends DefaultDirectedGraph v */ public Set incomingVerticesOf(final T v) { - final Set s = new HashSet(); + final Set s = new LinkedHashSet(); for ( final BaseEdge e : incomingEdgesOf(v) ) { s.add(getEdgeSource(e)); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 9400b3dd2..4204a0634 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", 
"a898b551f78c71befee4d12070d3a788"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "73817d9173b8d9d05dac1f3092871f33"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { From 2eac97a76c74d1aa7e8a17ea590082ba23621377 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 2 Apr 2013 17:04:44 -0400 Subject: [PATCH 129/226] Remove auto-creation of fai/dict files for fasta references -A UserException is now thrown if either the fai or dict file for the reference does not exist, with pointers to instructions for creating these files. -Gets rid of problematic file locking that was causing intermittent errors on our farm. -Integration tests to verify that correct exceptions are thrown in the case of a missing fai / dict file. GSA-866 #resolve --- .../reference/ReferenceDataSource.java | 123 +----------------- .../sting/utils/exceptions/UserException.java | 36 +++-- .../ReferenceDataSourceIntegrationTest.java | 75 +++++++++++ 3 files changed, 98 insertions(+), 136 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index 79100e89a..01edd44ba 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -25,10 +25,7 @@ package org.broadinstitute.sting.gatk.datasources.reference; -import net.sf.picard.reference.FastaSequenceIndex; -import net.sf.picard.reference.FastaSequenceIndexBuilder; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.sam.CreateSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import 
org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; @@ -36,11 +33,8 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.file.FSLockWithShared; -import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import java.io.File; import java.util.ArrayList; @@ -77,128 +71,25 @@ public class ReferenceDataSource { final String fastaExt = fastaFile.getAbsolutePath().endsWith("fa") ? ".fa" : ".fasta"; final File dictFile = new File(fastaFile.getAbsolutePath().replace(fastaExt, ".dict")); - /* - * if index file does not exist, create it manually - */ + // It's an error if either the fai or dict file does not exist. The user is now responsible + // for creating these files. if (!indexFile.exists()) { - - logger.info(String.format("Index file %s does not exist. 
Trying to create it now.", indexFile.getAbsolutePath())); - FSLockWithShared indexLock = new FSLockWithShared(indexFile,true); - try { - // get exclusive lock - if (!indexLock.exclusiveLock()) - throw new UserException.CouldNotCreateReferenceIndexFileBecauseOfLock(dictFile); - FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, true); - FastaSequenceIndex sequenceIndex = faiBuilder.createIndex(); - FastaSequenceIndexBuilder.saveAsFaiFile(sequenceIndex, indexFile); - } - catch(FileSystemInabilityToLockException ex) { - logger.info("Unable to create write lock: " + ex.getMessage()); - logger.info("Skipping index creation."); - } - catch(UserException e) { - // Rethrow all user exceptions as-is; there should be more details in the UserException itself. - throw e; - } - catch (Exception e) { - // If lock creation succeeded, the failure must have been generating the index. - // If lock creation failed, just skip over index creation entirely. - throw new UserException.CouldNotCreateReferenceIndexFile(indexFile, e); - } - finally { - indexLock.unlock(); - } + throw new UserException.MissingReferenceFaiFile(indexFile, fastaFile); } - - /* - * If dict file doesn't exist, try to create it using Picard's CreateSequenceDictionary - * Currently, dictionary cannot be created without running CreateSequenceDictionary's main routine, hence the - * argument string - * This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary - */ if (!dictFile.exists()) { - - logger.info(String.format("Dict file %s does not exist. Trying to create it now.", dictFile.getAbsolutePath())); - - /* - * Please note another hack here: we have to create a temporary file b/c CreateSequenceDictionary cannot - * create a dictionary file if that file is locked. 
- */ - - // get read lock on dict file so nobody else can read it - FSLockWithShared dictLock = new FSLockWithShared(dictFile,true); - try { - // get shared lock on dict file so nobody else can start creating it - if (!dictLock.exclusiveLock()) - throw new UserException.CouldNotCreateReferenceIndexFileBecauseOfLock(dictFile); - // dict will be written to random temporary file in same directory (see note above) - File tempFile = File.createTempFile("dict", null, dictFile.getParentFile()); - tempFile.deleteOnExit(); - - // create dictionary by calling main routine. Temporary fix - see comment above. - String args[] = {String.format("r=%s", fastaFile.getAbsolutePath()), - String.format("o=%s", tempFile.getAbsolutePath())}; - new CreateSequenceDictionary().instanceMain(args); - - if (!tempFile.renameTo(dictFile)) - throw new UserException("Error transferring temp file " + tempFile + " to dict file " + dictFile); - } - catch(FileSystemInabilityToLockException ex) { - logger.info("Unable to create write lock: " + ex.getMessage()); - logger.info("Skipping dictionary creation."); - } - catch (Exception e) { - // If lock creation succeeded, the failure must have been generating the index. - // If lock creation failed, just skip over index creation entirely. - throw new UserException.CouldNotCreateReferenceIndexFile(dictFile, e); - } - finally { - dictLock.unlock(); - } + throw new UserException.MissingReferenceDictFile(dictFile, fastaFile); } - /* - * Read reference data by creating an IndexedFastaSequenceFile. - * A note about thread safety: IndexFastaSequenceFile reads the fasta using dictionary and index files. It will - * fail if either does not exist, but not if either is currently being written (in which case it exists - * but is incomplete). To avoid this, obtain shared locks on both files before creating IndexedFastaSequenceFile. 
- */ - - FSLockWithShared dictLock = new FSLockWithShared(dictFile,true); - FSLockWithShared indexLock = new FSLockWithShared(indexFile,true); + // Read reference data by creating an IndexedFastaSequenceFile. try { - try { - if (!dictLock.sharedLock()) { - throw new ReviewedStingException("Could not open dictionary file because a lock could not be obtained."); - } - } - catch(FileSystemInabilityToLockException ex) { - logger.info(String.format("Unable to create a lock on dictionary file: %s",ex.getMessage())); - logger.info("Treating existing dictionary file as complete."); - } - - try { - if (!indexLock.sharedLock()) { - throw new ReviewedStingException("Could not open index file because a lock could not be obtained."); - } - } - catch(FileSystemInabilityToLockException ex) { - logger.info(String.format("Unable to create a lock on index file: %s",ex.getMessage())); - logger.info("Treating existing index file as complete."); - } - reference = new CachingIndexedFastaSequenceFile(fastaFile); - - } catch (IllegalArgumentException e) { + } + catch (IllegalArgumentException e) { throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. 
The FASTA must have either a .fasta or .fa extension", e); } catch (Exception e) { throw new UserException.CouldNotReadInputFile(fastaFile, e); } - finally { - dictLock.unlock(); - indexLock.unlock(); - } } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index fcc132ffe..83400cc73 100644 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -392,29 +392,25 @@ public class UserException extends ReviewedStingException { } } - public static class CouldNotCreateReferenceIndexFile extends UserException { - public CouldNotCreateReferenceIndexFile(File f, Exception e) { - this(f, "", e); - } - - public CouldNotCreateReferenceIndexFile(File f, String message, Exception e) { - super(String.format("Index file %s does not exist but could not be created because: %s. ", f, message) - + (e == null ? "" : getMessage(e))); - } - } - public static class CannotHandleGzippedRef extends UserException { - public CannotHandleGzippedRef() { - super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience."); - } + public CannotHandleGzippedRef() { + super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience."); + } } - public static class CouldNotCreateReferenceIndexFileBecauseOfLock extends UserException.CouldNotCreateReferenceIndexFile { - public CouldNotCreateReferenceIndexFileBecauseOfLock(File f) { - super(f, "could not be written because an exclusive file lock could not be obtained. " + - "If you are running multiple instances of GATK, another GATK process is " + - "probably creating this file now, and has locked it. 
Please wait until this process finishes " + - "and try again.", null); + public static class MissingReferenceFaiFile extends UserException { + public MissingReferenceFaiFile( final File indexFile, final File fastaFile ) { + super(String.format("Fasta index file %s for reference %s does not exist. Please see %s for help creating it.", + indexFile.getAbsolutePath(), fastaFile.getAbsolutePath(), + HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); + } + } + + public static class MissingReferenceDictFile extends UserException { + public MissingReferenceDictFile( final File dictFile, final File fastaFile ) { + super(String.format("Fasta dict file %s for reference %s does not exist. Please see %s for help creating it.", + dictFile.getAbsolutePath(), fastaFile.getAbsolutePath(), + HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference"))); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java new file mode 100644 index 000000000..00d0dd051 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceIntegrationTest.java @@ -0,0 +1,75 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the 
Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.datasources.reference; + +import junit.framework.Assert; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; + +public class ReferenceDataSourceIntegrationTest extends WalkerTest { + + @Test + public void testReferenceWithMissingFaiFile() throws IOException { + final File dummyReference = createTempFile("dummy", ".fasta"); + final File dictFile = new File(dummyReference.getAbsolutePath().replace(".fasta", ".dict")); + dictFile.deleteOnExit(); + Assert.assertTrue(dictFile.createNewFile()); + + final WalkerTestSpec spec = new WalkerTestSpec( + " -T PrintReads" + + " -R " + dummyReference.getAbsolutePath() + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -o %s", + 1, + UserException.MissingReferenceFaiFile.class + ); + + executeTest("testReferenceWithMissingFaiFile", spec); + } + + @Test + public void testReferenceWithMissingDictFile() throws IOException { + final File dummyReference = createTempFile("dummy", ".fasta"); + final File faiFile = new File(dummyReference.getAbsolutePath() + ".fai"); + faiFile.deleteOnExit(); + Assert.assertTrue(faiFile.createNewFile()); + + final WalkerTestSpec spec = new WalkerTestSpec( + " -T PrintReads" + + " -R " + dummyReference.getAbsolutePath() + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -o %s", + 1, + 
UserException.MissingReferenceDictFile.class + ); + + executeTest("testReferenceWithMissingDictFile", spec); + } +} From 8a93bb687b1f0188e6406da2118cdaf1ef7a7779 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 1 Apr 2013 14:28:20 -0400 Subject: [PATCH 130/226] Critical bug fix for the case of duplicate map calls in ActiveRegionWalkers with exome interval lists. -- When consecutive intervals were within the bandpass filter size the ActiveRegion traversal engine would create duplicate active regions. -- Now when flushing the activity profile after we jump to a new interval we remove the extra states which are outside of the current interval. -- Added integration test which ensures that the output VCF contains no duplicate records. Was failing test before this commit. --- .../haplotypecaller/GenotypingEngine.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 56 +++++++++++++++++-- .../traversals/TraverseActiveRegions.java | 6 +- .../utils/activeregion/ActivityProfile.java | 14 +++-- 4 files changed, 67 insertions(+), 11 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 0d6e29fe9..ee9993b4f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -794,7 +794,7 @@ public class GenotypingEngine { return vcs; } - private static class Event { + protected static class Event { public VariantContext vc; public Event( final VariantContext vc ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index c416938cd..292bea50d 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -46,11 +46,22 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broad.tribble.TribbleIndexedFeatureReader; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; import org.testng.annotations.Test; -import java.util.Arrays; -import java.util.Collections; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String REF = b37KGReference; @@ -88,6 +99,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { "70bd5d0805bf6f51e5f61b377526c979"); } + @Test + public void testHaplotypeCallerInsertionOnEdgeOfContig() { + HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae"); + } + private void HCTestIndelQualityScores(String bam, String args, String md5) { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); @@ -99,9 +115,41 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4141b4c24a136a3fe4c0b0a4c231cdfa"); } + private void 
HCTestNearbySmallIntervals(String bam, String args, String md5) { + try { + final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); + final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); + + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { + if( containsDuplicateRecord(vcf, parser) ) { + throw new IllegalStateException("Duplicate records detected but there should be none."); + } + } + } catch( FileNotFoundException e ) { + throw new IllegalStateException("Could not find the b37 reference file."); + } + } + + private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) { + final List> VCs = new ArrayList>(); + try { + for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) { + VCs.add(new Pair(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); + } + } catch( IOException e ) { + throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read."); + } + + final Set> VCsAsSet = new HashSet>(VCs); + return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events. 
+ } + + @Test - public void testHaplotypeCallerInsertionOnEdgeOfContig() { - HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae"); + public void testHaplotypeCallerNearbySmallIntervals() { + HCTestNearbySmallIntervals(NA12878_BAM, "", "b9d614efdaf38b87b459df421aab93a7"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 64c6d5094..7b831db32 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -70,7 +70,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine extends TraversalEngine stateList; @@ -98,7 +98,7 @@ public class ActivityProfile { */ @Ensures("result >= 0") public int getMaxProbPropagationDistance() { - return MAX_PROB_PROPOGATION_DISTANCE; + return MAX_PROB_PROPAGATION_DISTANCE; } /** @@ -210,7 +210,7 @@ public class ActivityProfile { contigLength = parser.getContigInfo(regionStartLoc.getContig()).getSequenceLength(); } else { if ( regionStopLoc.getStart() != loc.getStart() - 1 ) - throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc ); + throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediately after last loc " + regionStopLoc ); regionStopLoc = loc; } @@ -239,7 +239,7 @@ public class ActivityProfile { throw new IllegalArgumentException("Must add state contiguous to existing states: adding " + stateToAdd); if ( position >= 0 ) { - // ignore states starting before this regions start + // ignore states starting before this region's start if ( position < size() ) { 
stateList.get(position).isActiveProb += stateToAdd.isActiveProb; } else { @@ -352,6 +352,12 @@ public class ActivityProfile { if ( stateList.isEmpty() ) return null; + // If we are flushing the activity profile we need to trim off the excess states so that we don't create regions outside of our current processing interval + if( forceConversion ) { + final List statesToTrimAway = new ArrayList(stateList.subList(getSpan().size(), stateList.size())); + stateList.removeAll(statesToTrimAway); + } + final ActivityProfileState first = stateList.get(0); final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD; final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, minRegionSize, maxRegionSize, forceConversion); From 6197078c5defad50e6e884df949501b395577236 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 4 Apr 2013 11:56:17 -0400 Subject: [PATCH 131/226] Disable Contracts for Java for tests -cofoja is not compatible with Java 7, so we're forced to disable it for now until a replacement can be found --- build.xml | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/build.xml b/build.xml index fb5362b3e..c70f85a28 100644 --- a/build.xml +++ b/build.xml @@ -91,9 +91,8 @@ - - - + + @@ -1203,8 +1202,8 @@ - - + + @@ -1340,9 +1339,9 @@ - + - + @@ -1472,7 +1471,6 @@ - From 14bbba0980e3021dcff902bd326d4ba721e8843a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 3 Apr 2013 17:37:19 -0500 Subject: [PATCH 133/226] Optimization to method for getting values in ArgumentMatch * Very trivial, but I happened to see this code and it drove me nuts so I felt compelled to refactor it. * Instead of iterating over keys in map to get the values, just iterate over the values... 
--- .../broadinstitute/sting/commandline/ArgumentMatch.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java index 828f10fcb..e354601da 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java @@ -276,10 +276,10 @@ public class ArgumentMatch implements Iterable { * @return A collection of the string representation of these value. */ public List values() { - List values = new ArrayList(); - for( ArgumentMatchSite site: sites.keySet() ) { - if( sites.get(site) != null ) - values.addAll(sites.get(site)); + final List values = new ArrayList(); + for ( final List siteValue : sites.values() ) { + if ( siteValue != null ) + values.addAll(siteValue); } return values; } From 7897d52f3208f00c6012c63e8ea21936f26f600d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 4 Apr 2013 10:44:40 -0500 Subject: [PATCH 134/226] Don't allow users to specify keys and IDs that contain angle brackets or equals signs (not allowed in VCF spec). * As reported here: http://gatkforums.broadinstitute.org/discussion/comment/4270#Comment_4270 * This was a commit into the variant.jar; the changes here are a rev of that jar and handling of errors in VF * Added integration test to confirm failure with User Error * Removed illegal header line in KB test VCF that was causing related tests to fail. 
--- .../VariantFiltrationIntegrationTest.java | 8 +++++++ .../walkers/filters/VariantFiltration.java | 22 +++++++++++------- ...nt-1.85.1357.jar => variant-1.88.1401.jar} | Bin 555516 -> 556173 bytes ...nt-1.85.1357.xml => variant-1.88.1401.xml} | 2 +- 4 files changed, 22 insertions(+), 10 deletions(-) rename settings/repository/org.broadinstitute/{variant-1.85.1357.jar => variant-1.88.1401.jar} (94%) rename settings/repository/org.broadinstitute/{variant-1.85.1357.xml => variant-1.88.1401.xml} (71%) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 9de190f5f..6a29ff255 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.filters; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; @@ -106,6 +107,13 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { executeTest("test filter sites not in mask", spec3); } + @Test + public void testIllegalFilterName() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName 'foo < foo' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + UserException.class); + executeTest("test illegal filter name", spec); + } @Test public void testFilter1() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java index 362b49f68..83d4d81d0 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java @@ -186,18 +186,22 @@ public class VariantFiltration extends RodWalker { if ( clusterWindow > 0 ) hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters")); - for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) - hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString())); - for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) - hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString())); - if ( genotypeFilterExps.size() > 0 ) hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); - if ( mask.isBound() ) { - if (filterRecordsNotInMask) - hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Doesn't overlap a user-input mask")); - else hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask")); + try { + for ( VariantContextUtils.JexlVCMatchExp exp : filterExps ) + hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString())); + for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) + hInfo.add(new VCFFilterHeaderLine(exp.name, exp.exp.toString())); + + if ( mask.isBound() ) { + if (filterRecordsNotInMask) + hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Doesn't overlap a user-input mask")); + else hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask")); + } + } catch (IllegalArgumentException e) { + throw new UserException.BadInput(e.getMessage()); } writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); diff --git a/settings/repository/org.broadinstitute/variant-1.85.1357.jar b/settings/repository/org.broadinstitute/variant-1.88.1401.jar similarity index 94% rename from settings/repository/org.broadinstitute/variant-1.85.1357.jar rename to settings/repository/org.broadinstitute/variant-1.88.1401.jar index 
d341e1cf5cdc9f202ea7e51ef59c6334c83040b9..688c6ae17c73fdb788c2c83f32a347bbcfd3870b 100644 GIT binary patch delta 9784 zcmcgSX?Rpcw&!-=+g07U**Xv)kfZ})%O3V62}xK&Ac2G}gf$_AHDrOX3D^X5a0EsW zqzH?EqPU=Jl7_(%To4%?k#Rspae~VD#)W0nL6Il#oT{6J`TXAZe!YC$y62ucTb(*p z=TtR!zmHn~bCiGHaJ%jT-~dC%KeW-m3(p$=P^Ol(BWz>ptfU^@DkL=RYGzrw( zR@ws)lUcTTez*P%pg$8u=WJ1%^7OzXPdcVDqBeozUz zy6`)v#9Z)RQ^4`x+ZFN2^;`U8*Z5kiEB)!#Z6s#mtA^?-Ib*Spz7VP~WuMlCNScl} zOO%|oKb%p(*q=iU*;W4QYQvwb58V7!k8m9zd{g4hBwvEAB}hu&RIbfb&{ET|(ERv~^u1v|^Z6E{7_v=W8hv6+cQ_Ck+s)^ zdD39t+B)rCABkIkLpu^9p@CsGkJ2?G+?Lo?;*lX==~qX z!81m(D9YuXe0xXP?n#?`&rYx{e$OsDPDUJ=un#--sg9Ze{g_gb;?@T!DG!F~nM#VjYUev&An_N9IJm9; zgCgdU`gIwPzMnn%m;Pj}s87&yq}hpy`Wz)`M|V9eO5%=>(bp@6>~Z?rPU)!3JzrgV z0#_nEIigrMQbo~3?)Fs_>APH#D|oIx#ZS`SC`-^jLOOazE83NtbS@@O;=#U4i}cSG z!-*Pwgo+Y#z1~O7QC6)UsA8z_xc-t{Gd|~=dVCA1qVI>gUGWaMsvirIxI4G>ac&8X z){W6h-t&P*xEj>i5TjJhV{#kgupGPZADxWXRPVl6BzF;^M$HkY8@0C~29X2j2$Jc=n9JxRiYGyx}#ZgQ8Pb z{JIBMCOx_2pdMo7B_mWkHk&(q8!j3zDrISxjkPLZ->o;!DpR(8ZcJ1L{QH_wpakb# zH%g@ojvjOD8XCz(W2A#)4=)-2H5!3jCAgPNVxJW0(iC+vnjE`*+9@HB_LE@1Es^*rl@JlN;L z?&nN$+I_pK$op1%31384g!Ce8%AKWLrJy zyUO8ndVj&;OAgmK{DZ?+4F1anS#W?1`zI;+n&`h_@GVLFj=^;XHyGR$7jD`@Y;en0 z(S~(%1bxfk7=wQ?xW(XmG(#MUWXrX`3vC&PWts3cS^6(i3k0vJ1%Vf_38ofIUvB!c#V0c@#MD}7 zEln*HDnHM78<)!^R*@HYK#pUG{)t-`y^6KK*<(}g5nOJ>H zn_pZqx3qcza;5&?xICp*3yaGZcoxi_S&p7u5Y58;BQ+0M8)d>et-UxDZ|{xH-2y@+ zcjAdYXGb>25Z!j;Bvr+*)!rNo3Jper18jgZ3xIE&Gx^4;m2aGE`NsLix0|K`-@(XN z1jX1+!>>eqYxp)|UIFbO0yZeY)*+GoVLHO5B7jnufnOTTL}oj3Z#p?Mv~v@2%5M_S zg4xJ2O`M80niHq~Ch;7YD~WH$vgX8Tze$WSD~aRdSX8$rGX^vx4zg@6N4pKM1*Fqh zE1**Ht&C%x`w(9M*s;yPy(bf$Gq4$dXfu90V8{`7mE`_=9IMt!#F%*2!Anl^ppX>k z4D)dgZ1`Pbu@bZZs*#6`U|@97HAet`E`&uS2$2Jjfu_Je<}idF1wF=g6kLZ8p{7Ea zSB$*Va4Jfgndh%`xIo86xP+u_Nnk_Ks_X=om`hfr;Y0%TZnUMhj1^MWTSg06+MD(R z>4M(0J)khcQLGqJ@KO>iPA9Mol*|%YY%|G8jgpfZB`2Ze3DO5ijgpfZB_}nL`~XSz zc@x=24Mhhhv&GVtLxJHk?*z+WIhn65X;6aD3N#SSN9aNDk`jc-R68VVS+Ejn(Bv^% 
zG^|2816JcuD0)@jL)b@?5Ci`AU=}zSJj~#C4AwAsgu(9-Mx`PCPJgG&h(7J||tR@gUQuR3T+`-Lv|iv;jP_!64-A+&^# zAssHm5cmW$)~7HEKEr^h$H831#q&81@(ZsV8Lb(#*BTiI0qb!BHC%#|;1BQ^`o3WT zRo@2eqwxoWaTn5Y$_xVm#$Y3MV9vmnrs-j1B*8JUuS3o@FzowdPeD#@Ok5r0)S|J^^yJBAvul8UAQc4h^HrgMgj!;F?s3ToUR%6EP zD^1GWs$`1ZJz0{BB_-i*GYOH65+d)H5T1^inQ|X0w!t>pZpZln2*cs$;cP(=#|kP3 z#-z8wqK3f-;Joihof&_)SH2EYwNB>J;1n>=oZIJL_W;A=C9-BT5G+i7e@Pf2VZ{JNkz0H8WR1i#0+Jl@A8 zMpko&IGM|wUaauqq0EWdY#ue`3@SKO(iT|7VZIBx!U7J}92Rm|#9^@uX_hcp>cWXC zqvio>mN8gPJu8U*L4v&0EF;<2bcL1F)Noiu@6~?r5IpRM+rt{aG+ds)_rvB<25UL2 z^8*j8XYd$@jU@PS4x3!i0jdbUnZXkrwqWdX*vg=m!8WmGD=X2qi*L5Fv-)!E7jbdC zz3*}@(`-RY8SFr*Xf1=CNH4zG&Q|#MfhSGq1wBmMlzXU}WycN{9nuGOVTh%eSgP(p z0Lzsmv2zCt3;*A06GhUtiKKZUV!}`uEUP+~2~WY(CYGUtsrpDtgF!IZglrgQ;;Kd{ z7xtR)3_Od9JD9K!ora*C)oq4B+B2f=lMGBVt(2fh{BnS$7Pp*G0_g|nl0iQw2_PxFYRXShQzvA9q?^n#=4yd$jx3abh~`uI2r5qYlboj z6%%_DY%y_1!A`l{4*jw12BxUr&Du;$Km~mem4QXWa7e|qmxc*+0Db{Rj#P@+H2gyK zh$o{7N=L|sT!6c*ehUaMqSK9JI8|i7tjf&Q}iVq&aZlcHc1_qY@VcmE+`_bjuDirVc`5W!h{_ z>BS}kKuyNXW|GYY$zII#Bc*6M{N`KY#$Xhaw<2TShIS~j&6Cj59TsE9Oy2f42GKCx z70tXAi90+6H(88n1d{;9`B2>EZDg?a0cyal8N5GP6M}^U26|9|o*Hu;@Jw_Z!ZjEX ze;n{wR10Sjh}7T|c>3w>YJuBR_hCkJM-fLG{W!GOu!iY!0y-30;@r{L)dubD=ZBp0lRi-&8rg&_*mtjoT*Ro%3N;;M{Nn;eS1iFgW+?>116XP-!_8wkI%);_&x=Qg z8f;f#E?A94!W#G)d%V7;=euXt_ z61ty`V-#PK(+fq&su+s#P1DsM_^%L-mA}TI?5 zupyek%%8Dm77DQ8@yiBtBs3TI4>|_gz=H{Gtiy?k>JPA?A;U8?5$JAs+cV6qa`J%} z*gSP{6nU6sD%^A*%TV`f&%MQTb?0>8G#jI?rX26F1Dd)^KNPtmJs2Q8Sk&aQE@JnG zEJ*xv9c zjse8=2vK5!p9_!rVdanceLehvn#UMy@Pk;`NX_F6HZj=jd;NVj)iCk=@Pr1&%+l)Y zL1N=r79s{*Vqx}Putiiwv$pA&V}yxYe|O5`Tkp?knC49^#I~Eb@N+T$o3K;m`)s_| z$R1>3b-Y`=zTeiui2;*r!X9`^)d0U-Vj142A>V|(@C*s2T#VHK=3)f?e=aXL-d@P8 zFN=mi6JCVeZ7k<_ww|vf+2+w1I(I##;|d=%3!$_7UqpVd+|5@!lsa(o4HFty{`x{pBhqC>3RB5=YV~ zqa#|M6@3h0X+ zE-AJo1LG}I<_u*;V51d*h*wqwVqPC4$pfWjw4(%KDjm|y3Wdo+j2iJ!X-zg+BhG)y zQgFQH71nk(jW-6V03*0zys?erjm1pbU22Mz^6hezD*I6%6-&f97^_0NrD-@y!?-0) z*!NHLG}%`sHe6xx-dH!D7+cG7sSN_MTna99hxs5B%Oy-+ji`2nqH>990RvB^;Wi8r 
zI$Dyq)LD5rF6|tw>vC}lMqt_>iAgOFH`Gy>u1Di?9U~p7N~ba!UKX;VEixu(KFeqY zqNV8wt1^f%G^!DZ!Hb#Twp2eU7mh^TDhKxw8K1E(UbN^0L}0lRiTTG<2T?SDC_D+b z$5$P6plZb~%~c1;0^Cf;V~th_k+}bp=anZ@Ezfh*OtGXj+M_U|5Tnsb!4`e0wMD5I zOI}!kWleO$VKzD!9TkXY*g$!PrNYKI4xKbWlmd3qu~(rxQwGN@OBuW?W$^9@#8B3a z?Wa>tj3YRJSm`UC$o*}&COb-X`|kZ&-;cG?eX}tI9X1uKh$0N(Vhq)3IFu4hFVpd^ zq!d%;tj4e&2ko)29gp0Z=pMS6q!3xr7=SA$ zoa6peuCKVBRjAbV!Btize*yJn{m7Pu%hWb9?$uwvL-y}L}i5k;*y6ZhQwzvVA0 z_d@vB>iqj{Yksu79e>Ccd;9Y~jX!0JFZ*-(!;FYZ;fcf}MyK#J32aW`LnLq|g%6iN zY$~55fk#t$i3Dz@@(Kw|OXCwHa59Zgk$`7_MeH5G%OvLd0A4MD(sT>>G@Z|vnEVV2 z^G*XWAk)IsW%8wxrrSUZ^ZYTteJcD*~CmrEcu$0FXB!{x~AW z$5;#VYy;4GoP~L`0l3=$lowcC^#zvFqVd*9-yd)JJF(EB*;8nB1y8Vye`Eq*u56yj z7fYaKqUD6U6ZwM@vtp9vv0ob|sb>qQr`8$8^h{HPfw;vQqv#$+e->QqgJ#e^g@j z&ak@TXIemA12ABgHJm@qvL>$UY)jYP*_Pz!Io2p%n8VjffqmxkZ4$UR*BV!0nH3Ue z%J?GLl|Ij6KQ)gJlNkSU%d2b3d6C3?U2d(0Ar;mlc-jKCRPynXWMZWiv!^Th3c39> z{mv@#sw}pbs(4%5>%Bx&HIEkO)0k7-uHy2iT4Bz|uCrOHD?OSAiOKW1+k@Lq=EX%J ztyf|}Ko4c`hQGZvQ%5NJhzA@&;>AT4{hjc2bCU4pnx6aVdyZE0-cc-6Brf2lv^{MB zsy;D^x?<#gpV+j(61C^xwC}=u;(u1bC!s+vzW3JXC5rw=18LKJAYGs|{q3!px0OiK zFIT! 
zfVv}+eoJ4wE{YB_huE6KO{uj0-jd2&cw(dn2h#Yrx8|2)Y7~{p4klU+wdUf^LR3HC zZR&~^2_2a!eJd@Sw1_M_qq;m->;dBDX;jtd%Tv_9^StW6@`J-A{1#i5mo*->{&QMJ zUJx})ct=@*H%(*N*D2YpuqL7>wb5qs{k6}DzPGr%n9KhYA#N>3tp#5y?7Z9PuaQeE zt!qCr7k0$NOb^}uH2m$YId_}rv*obuR;I- delta 9180 zcmZuW30PIt*6ZAR&fWW74$~zN1q4(CK_+n)5pX~S6$P9pL~%kCbI1X&G#6T$))sTl zqSDGVxsqm9PtE6h`K+gzf2B5A_A-6dE6YFMT6>=h^uG8w`>eC(z4oxy-goV-*xNtE z`WNQgbr%4~|LWrP{t28_7w6W#RxxIM-&tuHJ(8!Tbx-b^(KR_eeON}Xv|cd@ISUq4 z%!nCMS(eht-{tRWme*1KT(q5DKc~=ZLxR>_zDu<}^ggg%%hDup*|ypqfSBBhjSIW< zVE}!Yn0ne4yP;4IJh06%g%Q{OfKL4-cju8kbP1G~{M=RoQx~mLimojF&M7fxJzpxI z@e#XXesk?pev&k{&LU+$6W&^4CLA#osqCaBUUW80VJbdu2$4K-H_DWr+8<9UpyXDV zAxV|L-)H!f_kkO)c8$^jqBf-7Nb{!XT8iZKPT|^21)VarDZwPw0QvN@&x#rAI82z8bzLQ3}m^#xIO8b0l=}Y1_(<5*qim?R!hhyEdk5l$^G`Z%S(71>1awgtmNY>+cAr7$xs> z#AF!&^b8Tfui5*Epoi>!-tJ%8hAT}EU9**Ll2YH^YY$DAko`4#j?$jzwJ+)=aYt|1 z>lAMMkM>*@-|fHH2P)|5ZM!a^gV*&;g-SN`1G*fjyDtZYUAF^V7dLO)2YAz6`W?F@ zyxI1^_f`W&TcONQPiR^l3q^|eZ1$Y}k9QwAz`=hgF1;!32aK$Pl6U$J_F z&b>3H>MyvYRPbDVvY+Jrz9L1tfPC~6HS>z=lXeSWcVq@UE1veCGztcAZ|ysE!$ z>HWq&_W_l(DRfMy;B#f4gj)Qm$w-hj=z`bb zupb@Dhj1 zM83k{Gg9z5gD)6-$>1w-_J%FQ23JMtX%^|-(wc=if<9w#fWg-ct}^%rO^dgq*-GuA z$hpDVi=bz1x+srf2cipMqzR*`83SWYD25US*G%{pN=%puv!2pfyqFTp`ou4Uf0-}} zicI(puAA^Z%w=%Hgqtwegnz>i41P4>C-{$eE0#st7ebY|9LvJn!O!rEiOnq&euar9 z{6=~g!fg}A!FUrMM82HXdZsX1UdVpB|w(HEyLTXH(noTlc|{7lV+xnkr;j)CH4JC^M9Cv1QQ zA);!PBSiFW&&KdT>ib5RCDxDA;>EG{>~$W5?rOoBTLfQV5pFF+yNl#P(NJF#9@4_Z ztI75rh6(H836Xxy5%)w-m$x>K%`wDx*Bm)wN)pTVzR;P)c4>B_ zUkm0S%jwzZxB<3+Z0hVBm@DOyl39FDk_!M#XSg01_ru+(~BbGbu_js3a5OVIB^G4Zq7QQGzOAKFW|242%R_a|Gb$0;rOTCa18rsm*Sb zBF{98d07iL{4HHN&hNXW(4G`F7_I-9!cvEk^(-7QfF4aYd&o&a1KUGR3-YK3twY(> z9<(e0lLF#sQWyv?ghf*Gf>f4+n)j!&q!yafnlz_1X--4UC9 zG9wgydC0fgv=sE;jw-evhMaL;$Sp{6N72G-g#P0D3^u6!fMH4Qww>|Q4r7ws(MYrV zK4S30MvP;-vPjRM1_)_@P+GCNT(K$Ohw%G&g!uqM;T&Ycc^CvALIGUBak_}1`xsZ^ zC)meNF5v`MXN;`^(+{A8Mh%DV`&%&Fa~Ro zg4-yz6u$>hk~AkKy#;xj!LaX5ItF>e5<54*V9~i7iyTYQbff10Ut_Gl0ViC;W%z9i zuM#bV2pgC}MC6b!?68w^A~ 
zW>g5`SR&-Xi0sywwHdqz&Oe+PbCdUXpWhe+Ku@~&NdRVJO!V;fX)RxzA?pv&{( zuTg8}q&Wf7oZ?CzOAxDzS&*nb!2-P-i&(mbiT&2kPa=)ICdwI5C?j~}dt8b7!f?xO}5o0uN3$%DZr3?L{Uwwbtkb|7ne6L!Kb1TxW%3D3Z82G5%C9Mq!&COi)> znD8R(5z%&An2BtiNyQ!pdxf^c2o=YEz;dD0c2*R;1zys?5mUZ!{=676xJSIT*LIf< zFT*S1vF$9}re%qmr|qG_^R3|!Z*FJH#krkEFH!g(N2qt-4(6wc+ShEIiXj%qAqnSr zXB@ytNQM-IQ{}`=!|!zH()68yQx=0EX&4WDJCW8MbAtvw@v9D}Fs&B`6Ca8}vi08?-y< zke$(fl7s(<(cZ4U#-o!m*Aa|5{C9zSep+s0P|nE$JGTnn{rn>v6~%2K-w z)zV&-jytEn+&M`x?wkrves znBFuN=~zAvMVmAN4%|uOvT;)4gf6iB$hQ126B{5nsR7)WZpeg94!0VNvKE?6pJor{ z;bBs>6Zg9jmbzk`gcPml#1jySD(~6`q3+N`yWRj{f6@uxaL;WKt!T`+G&jj(+_Q1V z#m)`F>~EvN)ILB9l$<@kSObEowH1!R7`<;|BZOxK4jCJ}!J(iMjU z52kIJ0%If^)nM#MZO6?JZz(2Byu^eyi@FnJ96I>Sb&}?!BV?04(H=V4q$GAs{hh8! zZuA6=SyN31Isd!hAOO~L92P?$mRG@8V5MV8)fvIv!D}uG1k3&R{h+My~x|U@VqjbU}IpD~Sm>n89!rtArE`KV^KHUsBS8laA6l z2~5E^4ObuFze5y*sTzaQ+YpHbt|R4VC}2=F6`wme0}Te#Ee0xoZHQ(t;}%lNQGpF+ z;-ix5Y&=NPu3^JXV-*xRoR}gW#`?fYrwm;Lx@pX+XKq7g%l2D3oDakMy)Y(H{aL~j zJ^h~Q#GK-lrFaRlV=s$glI!fqkgS{N`AuB}mTT?2Hx95+b;rURS&q8VDLBD&O>!IG z;N2WrvUA0~`Xq~2mn1KqVXr8j%(|1;iZGiO1*t6k_*oY1!ApWEF6ac<@lqFtb{hF# zM$mK)GyJf-Gb!oKa$%8!sWuwsa;PBmJO-7-i|MdjCS%OPjIwYi%CrHLhl61W5tlN` z!r?HD#E}*b%kbbzj4NC{@oOc6YLZw*%$SSIVYMHY#P|B)Ot_yKOqZC9JHkWMJd7Ec z!J{H4NNele_&&=q^lApR;-e2)H+wp)p+jtBzjRn@Qj#%o7Cfeb-+&o+mo1#VVA1UP zm6(xDtd`cx%*o&h6E?s`;v0Zx=^+D5ESH{=NzRG+3iG0XP3o+??*hwl*Fk{^n<+V` z!xj^^!Zw+qgH_ta44n@SgLhcPaZ~9^Qiq7gI1^ z3cUn-P230e`m(#g19paY7%&ZsPuh^tF*yzYm`3T4pb}|@`1li+ zt#uVqpR(wZZpceZiBb%uQTj{8U8*zk6GYl*lTXFXv@?!;B!uAdOTl&12eFjh6ebNZ zWFZr|GsOB&S&XMEGPtC04vt_h)-OI&fla0YG0!SfftY9eN$x;tp0ao#R=+tdOest* zVpMfN2@c1GOd|JobdQ#EQeu)@^MXjb#1gTa6E3lc*XW;Rul?|x)&@s%uQ&~Pv z{{ozX!!b*Yz>pQn*u*FeP5$+j!KY#9D{CTKQ^*yDyc!}k+kzu3uZE(Xd@v|AW{CGM zvjh*CKMdiR_FLob+ok~`+>zKM9Rl3wBO0JBWqrH!O`UOya8oMAQ&|Zv$kEa-6^Fi- zUmS5X0kkG>6lNG=Flos&ild=LpjMn#9;n9hAli4ZiXMhh3G9qd1H`BT!Dxh7Ed4N8 z64*{hFbZ8Gw4+SeUS&c|T6K_tnJ|vBSI6Eu<(7DY14xv<5{TX3hFL5|nybI_SADNX 
z28cr65pytLLSUE@oR|&*Fd3v{2FSp@e>6ZakTmI;TBdh;=V^uWRJtq3JK4kd?BjdK?`@bnkYV7zu40?>T7Uy)fu7vwOZQ_a;K z21b=(^D+t4Wb^qF_#vB*mq1|- zpDck_b1dedTwWnDo?O0&(=Swq3txIiOdi0)L~wsTPSWP}=L;q9S%1Dr0^{exz&Ac1Z;0i_Geaz=Z9}d8 zOc`oPy)l%_pO=K0Z}o6qJ};5Bj_31I33Mvp@}~!}zJSk^nBNMlNRv6W%TBFtEZ(CtbiYzU`chEXpweJw484}iLX>XPvT1@;GS#+VeMqT zN@4=0SP@$@h2JkRAycgx_3%`FpW-RCB34^!VS=ae8cBL`nk5xkW{u9CGRtV8rRSv?mKzD>))W`zmOtS$t)6b0$sd$bv9l}_&(GqIN=&!e7U`%D$eUwf z{xQeu&hWX`z`Z`#vem7^(tNPO>P4S0q#b=XuaEZJKarOS;{*I4}M75t%Utxkiy2S#0#bCi?}<6wuW;{Lc&*L6+jOioPB@0Yi18q z{M(1KFmY@#mp`tG*^5zCzfp=L@1|Z~%;oQQ;tY|FPasmFIJVspAOa^bhe%vviS}B8 z^nhuKzIut}KyJ%>E1z|I@2*)Yh&6?-v6QdgHb+k%LqF+Jm6zh*U9` zzZ7lldXccHz7v5`(tH*GOb*R|yKCJ0iG6?!%jG-wEVzjWEqbUH^8M|u8RI3cM6n>< z!Nl`Jt-jaZgZ7^O3z0gAeR0ebO=0>i1IGW3%bqieACDYrTqHlHuI9|;^A_K`Yl<&h{Ful@#d01Z`HS9o?7iR+yza$k z;2-!;TxsHu5|@`-v9G;gE{=-=poj9d?{9Za@1L6ZLqywOaKuNfutt3H3Y>t4++4(# z;jf1`!Z{P`Lh(*=uaDs{G5kBkVz?a5+la80R&bJ5BJEPRMcWs_{lwChJjn56A3U4E Fe*wc|#SH)e diff --git a/settings/repository/org.broadinstitute/variant-1.85.1357.xml b/settings/repository/org.broadinstitute/variant-1.88.1401.xml similarity index 71% rename from settings/repository/org.broadinstitute/variant-1.85.1357.xml rename to settings/repository/org.broadinstitute/variant-1.88.1401.xml index f6d7a2caa..5db78b1e4 100644 --- a/settings/repository/org.broadinstitute/variant-1.85.1357.xml +++ b/settings/repository/org.broadinstitute/variant-1.88.1401.xml @@ -1,3 +1,3 @@ - + From 6253ba164e9277846e4a2c323d1decc388163010 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 4 Apr 2013 12:02:22 -0500 Subject: [PATCH 135/226] Using --keepOriginalAC in SelectVariants was causing it to emit bad VCFs * This occurred when one or more alleles were lost from the record after selection * Discussed here: http://gatkforums.broadinstitute.org/discussion/comment/4718#Comment_4718 * Added some integration tests for --keepOriginalAC (there were none before) --- .../SelectVariantsIntegrationTest.java | 26 +++++++++++++++++++ 
.../walkers/variantutils/SelectVariants.java | 20 +++++++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index c97f0bf02..303a2871a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -242,6 +242,32 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testRemoveMLE--" + testFile, spec); } + @Test + public void testKeepOriginalAC() { + String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --keepOriginalAC -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("ad7e8b25e431a3229a78cec063876559") + ); + + executeTest("testKeepOriginalAC--" + testFile, spec); + } + + @Test + public void testKeepOriginalACAndENV() { + String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("e9b8292212545684cdb163423329ee7e") + ); + + executeTest("testKeepOriginalACAndENV--" + testFile, spec); + } + @Test public void testMultipleRecordsAtOnePosition() { String testFile = privateTestDir + "selectVariants.onePosition.vcf"; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 52c1acd2d..02c8ed8d8 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -406,8 +406,8 @@ public class SelectVariants extends RodWalker implements TreeR headerLines.add(new VCFHeaderLine("source", "SelectVariants")); if (KEEP_ORIGINAL_CHR_COUNTS) { - headerLines.add(new VCFInfoHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC")); - headerLines.add(new VCFInfoHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF")); + headerLines.add(new VCFInfoHeaderLine("AC_Orig", VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Original AC")); + headerLines.add(new VCFInfoHeaderLine("AF_Orig", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Original AF")); headerLines.add(new VCFInfoHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN")); } headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); @@ -670,7 +670,8 @@ public class SelectVariants extends RodWalker implements TreeR GenotypesContext newGC = sub.getGenotypes(); // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate) - if ( vc.getAlleles().size() != sub.getAlleles().size() ) + final boolean lostAllelesInSelection = vc.getAlleles().size() != sub.getAlleles().size(); + if ( lostAllelesInSelection ) newGC = GATKVariantContextUtils.stripPLsAndAD(sub.getGenotypes()); // if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags @@ -697,15 +698,22 @@ public class SelectVariants extends RodWalker implements TreeR builder.genotypes(newGC); - addAnnotations(builder, sub); + addAnnotations(builder, sub, lostAllelesInSelection); return builder.make(); } - private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC) { + /* + * Add annotations to the new VC + * + * @param builder the new 
VC to annotate + * @param originalVC the original -- but post-selection -- VC + * @param lostAllelesInSelection true if the original (pre-selection) VC has more alleles than the new one + */ + private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC, final boolean lostAllelesInSelection) { if ( fullyDecode ) return; // TODO -- annotations are broken with fully decoded data - if (KEEP_ORIGINAL_CHR_COUNTS) { + if ( KEEP_ORIGINAL_CHR_COUNTS && !lostAllelesInSelection ) { if ( originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) builder.attribute("AC_Orig", originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); if ( originalVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) From ebe2edbef3ea028820d8b756efbce9319587b8db Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 5 Apr 2013 09:21:59 -0400 Subject: [PATCH 136/226] Fix caching indices in the PairHMM Problem: -------- PairHMM was generating positive likelihoods (even after the re-work of the model) Solution: --------- The caching idices were never re-initializing the initial conditions in the first position of the deletion matrix. Also the match matrix was being wrongly initialized (there is not necessarily a match in the first position). This commit fixes both issues on both the Logless and the Log10 versions of the PairHMM. Summarized Changes: ------------------ * Redesign the matrices to have only 1 col/row of padding instead of 2. 
* PairHMM class now owns the caching of the haplotype (keeps track of last haplotypes, and decides where the caching should start) * Initial condition (in the deletionMatrix) is now updated every time the haplotypes differ in length (this was wrong in the previous version) * Adjust the prior and probability matrices to be one based (logless) * Update Log10PairHMM to work with prior and probability matrices as well * Move prior and probability matrices to parent class * Move and rename padded lengths to parent class to simplify interface and prevent off by one errors in new implementations * Simple cleanup of PairHMMUnitTest class for a little speedup * Updated HC and UG integration test MD5's because of the new initialization (without enforcing match on first base). * Create static indices for the transition probabilities (for better readability) [fixes #47399227] --- .../LikelihoodCalculationEngine.java | 14 +- .../indels/PairHMMIndelErrorModel.java | 18 +- .../sting/utils/pairhmm/LoglessPairHMM.java | 80 ++++--- ...perGeneralPloidySuite1IntegrationTest.java | 2 +- ...perGeneralPloidySuite2IntegrationTest.java | 2 +- ...dGenotyperIndelCallingIntegrationTest.java | 16 +- .../UnifiedGenotyperIntegrationTest.java | 4 +- ...GenotyperNormalCallingIntegrationTest.java | 8 +- ...dGenotyperReducedReadsIntegrationTest.java | 2 +- ...lexAndSymbolicVariantsIntegrationTest.java | 6 +- .../HaplotypeCallerIntegrationTest.java | 21 +- .../NanoSchedulerIntegrationTest.java | 2 +- .../sting/utils/pairhmm/PairHMMUnitTest.java | 205 ++++++++---------- .../sting/utils/pairhmm/Log10PairHMM.java | 169 ++++++++++----- .../sting/utils/pairhmm/PairHMM.java | 43 ++-- 15 files changed, 306 insertions(+), 286 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index dc5fed340..4ea2498c4 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -106,12 +106,8 @@ public class LikelihoodCalculationEngine { if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } } - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - X_METRIC_LENGTH += 2; - Y_METRIC_LENGTH += 2; - // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases - pairHMM.initialize(Y_METRIC_LENGTH, X_METRIC_LENGTH); + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); // for each sample's reads for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { @@ -134,7 +130,6 @@ public class LikelihoodCalculationEngine { for( final GATKSAMRecord read : reads ) { final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? - Haplotype previousHaplotypeSeen = null; // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read final byte[] readQuals = read.getBaseQualities().clone(); final byte[] readInsQuals = read.getBaseInsertionQualities(); @@ -149,14 +144,9 @@ public class LikelihoodCalculationEngine { for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { final Haplotype haplotype = haplotypes.get(jjj); - - final int haplotypeStart = ( previousHaplotypeSeen == null ? 
0 : PairHMM.findFirstPositionWhereHaplotypesDiffer(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); - previousHaplotypeSeen = haplotype; - final boolean isFirstHaplotype = jjj == 0; final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), - read.getReadBases(), readQuals, readInsQuals, readDelQuals, - overallGCP, haplotypeStart, isFirstHaplotype); + read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype); perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index e3d3c6640..4c5490395 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -349,7 +349,6 @@ public class PairHMMIndelErrorModel { int j=0; - byte[] previousHaplotypeSeen = null; final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; @@ -392,34 +391,25 @@ public class PairHMMIndelErrorModel { final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - final int X_METRIC_LENGTH = readBases.length+2; - final int Y_METRIC_LENGTH = haplotypeBases.length+2; - - if (previousHaplotypeSeen == null) { + if (firstHap) { //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(Y_METRIC_LENGTH, X_METRIC_LENGTH); + pairHMM.initialize(readBases.length, haplotypeBases.length); + firstHap = false; } - int startIndexInHaplotype = 0; - if (previousHaplotypeSeen != null) - startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); - previousHaplotypeSeen = 
haplotypeBases.clone(); readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, - contextLogGapContinuationProbabilities, startIndexInHaplotype, firstHap); + baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap); if (DEBUG) { System.out.println("H:"+new String(haplotypeBases)); System.out.println("R:"+new String(readBases)); System.out.format("L:%4.2f\n",readLikelihood); - System.out.format("StPos:%d\n", startIndexInHaplotype); } perReadAlleleLikelihoodMap.add(p, a, readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; - firstHap = false; } } } diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index d94893e3e..b62d7a334 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -59,16 +59,20 @@ public final class LoglessPairHMM extends PairHMM { protected static final double SCALE_FACTOR_LOG10 = 300.0; protected static final double INITIAL_CONDITION = Math.pow(10, SCALE_FACTOR_LOG10); - double[][] transition = null; // The cache - double[][] prior = null; // The cache - boolean constantsAreInitialized = false; + private static final int matchToMatch = 0; + private static final int indelToMatch = 1; + private static final int matchToInsertion = 2; + private static final int insertionToInsertion = 3; + private static final int matchToDeletion = 4; + private static final int deletionToDeletion = 5; + /** * {@inheritDoc} */ @Override - public void initialize( final int haplotypeMaxLength, final int readMaxLength) { - super.initialize(haplotypeMaxLength, readMaxLength); + public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, 
haplotypeMaxLength); transition = new double[paddedMaxReadLength][6]; prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; @@ -86,18 +90,22 @@ public final class LoglessPairHMM extends PairHMM { final byte[] overallGCP, final int hapStartIndex, final boolean recacheReadValues ) { + + if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { + final double initialValue = INITIAL_CONDITION / haplotypeBases.length; + // set the initial value (free deletions in the beginning) for the first row in the deletion matrix + for( int j = 0; j < paddedHaplotypeLength; j++ ) { + deletionMatrix[0][j] = initialValue; + } + } + if ( ! constantsAreInitialized || recacheReadValues ) - initializeProbabilities(haplotypeBases.length, insertionGOP, deletionGOP, overallGCP); + initializeProbabilities(insertionGOP, deletionGOP, overallGCP); initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); - // NOTE NOTE NOTE -- because of caching we need to only operate over X and Y according to this - // read and haplotype lengths, not the max lengths - final int readXMetricLength = readBases.length + 2; - final int hapYMetricLength = haplotypeBases.length + 2; - - for (int i = 2; i < readXMetricLength; i++) { + for (int i = 1; i < paddedReadLength; i++) { // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based - for (int j = hapStartIndex+1; j < hapYMetricLength; j++) { + for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { updateCell(i, j, prior[i][j], transition[i]); } } @@ -105,9 +113,9 @@ public final class LoglessPairHMM extends PairHMM { // final probability is the log10 sum of the last element in the Match and Insertion state arrays // this way we ignore all paths that ended in deletions! (huge) // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. 
- final int endI = readXMetricLength - 1; + final int endI = paddedReadLength - 1; double finalSumProbabilities = 0.0; - for (int j = 0; j < hapYMetricLength; j++) { + for (int j = 1; j < paddedHaplotypeLength; j++) { finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j]; } return Math.log10(finalSumProbabilities) - SCALE_FACTOR_LOG10; @@ -132,7 +140,7 @@ public final class LoglessPairHMM extends PairHMM { final byte qual = readQuals[i]; for (int j = startIndex; j < haplotypeBases.length; j++) { final byte y = haplotypeBases[j]; - prior[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); } } @@ -151,25 +159,15 @@ public final class LoglessPairHMM extends PairHMM { "overallGCP != null" }) @Ensures("constantsAreInitialized") - private void initializeProbabilities(final int haplotypeLength, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { - // the initial condition -- must be here because it needs that actual read and haplotypes, not the maximum in init - final double initialValue = INITIAL_CONDITION / haplotypeLength; - matchMatrix[1][1] = initialValue; - - // fill in the first row - for( int jjj = 2; jjj < paddedMaxHaplotypeLength; jjj++ ) { - deletionMatrix[1][jjj] = initialValue; - } - - final int l = insertionGOP.length; - for (int i = 0; i < l; i++) { + private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + for (int i = 0; i < insertionGOP.length; i++) { final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); - transition[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); - transition[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); - transition[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); - transition[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); - 
transition[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); - transition[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); + transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP); + transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]); + transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]); + transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]); + transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]); + transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]); } // note that we initialized the constants @@ -185,14 +183,14 @@ public final class LoglessPairHMM extends PairHMM { * @param indI row index in the matrices to update * @param indJ column index in the matrices to update * @param prior the likelihood editing distance matrix for the read x haplotype - * @param transitition an array with the six transitition relevant to this location + * @param transition an array with the six transition relevant to this location */ - private void updateCell( final int indI, final int indJ, final double prior, final double[] transitition) { + private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { - matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transitition[0] + - insertionMatrix[indI - 1][indJ - 1] * transitition[1] + - deletionMatrix[indI - 1][indJ - 1] * transitition[1] ); - insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transitition[2] + insertionMatrix[indI - 1][indJ] * transitition[3]; - deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transitition[4] + deletionMatrix[indI][indJ - 1] * transitition[5]; + matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] + + insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] + + deletionMatrix[indI - 1][indJ - 1] * 
transition[indelToMatch] ); + insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transition[matchToInsertion] + insertionMatrix[indI - 1][indJ] * transition[insertionToInsertion]; + deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transition[matchToDeletion] + deletionMatrix[indI][indJ - 1] * transition[deletionToDeletion]; } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 5cdc2c65f..34b19ed2d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "faadc0b77a91a716dbb1191fd579d025"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "603416111f34e2a735163fa97e1a8272"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 4299b024b..8a165cbeb 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest 
extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","fe715b715526a7c1ebd575ff66bba716"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","13de8558acaa0b9082f2df477b45de9b"); } @Test(enabled = true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index d0d77c8e0..6b26be0d0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -72,7 +72,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("51e022d07ead45a4e154f949b6642e84")); + Arrays.asList("118ed5b54fc9ce1cde89f06a20afebef")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -87,7 +87,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("1d9c6fda344eeee76cbe4221251dc341")); + Arrays.asList("6ef59013331bc031ea37807b325d7d2c")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("2ec7262f0a3d04534ce1fe15cc79f52e")); + Arrays.asList("dd3ee4675377191e34aaf67335e0219a")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -110,7 +110,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends 
WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("3131cd7c49b623983a106db5228754b3")); + Arrays.asList("bb06ef8262f91664b7d2fe7e1e5df195")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("273f5daa936e93da98efd6ceb37d7533")); + Arrays.asList("0a2a8cc2d1a79e84624836a31de5491c")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("00a003a0908281384e981294434a9f3e")); + Arrays.asList("939f80c6d2dfb592956aed3bdeaf319d")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -175,7 +175,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("87521a1bde124c7c5908ed067060fe45")); + Arrays.asList("fc937f92e59dfe07b894411b5dfc166a")); 
executeTest("test minIndelFraction 0.0", spec); } @@ -183,7 +183,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("8a880b8b1662e31e0b5c65733eac6b74")); + Arrays.asList("41ad9e0edca4b9987390ba5c07f39e4a")); executeTest("test minIndelFraction 0.25", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 15655622e..c89b1dfbf 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -232,7 +232,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("3a805f5b823ccac19aaec01a3016100e")); + Arrays.asList("0a4a78da876bfa3d42170249a94357b4")); executeTest(String.format("test multiple technologies"), spec); } @@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("25aa0259876692dc3c848a37369bac6a")); + Arrays.asList("89182fd4d9532ab4b2a0a84bfb557089")); executeTest(String.format("test calling with BAQ"), spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index c10b3d6df..a58d3f3a8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("3f8ee598c9b85aa1d2b85746ad46c1af")); + Arrays.asList("52b6086f4597da5b35ab902bea4066fc")); executeTest("test MultiSample Pilot1", spec); } @@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("31c0f0074b3306b54170056e93b69e11")); + Arrays.asList("28bfbff3da3af43d6a1eff673e5cb0f8")); executeTest("test Multiple SNP alleles", spec); } @@ -112,7 +112,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("753d6358b1634107de76900200116805")); + Arrays.asList("a9edd04374ee9c42970291f39a50c191")); executeTest("test reverse trim", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + 
privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("274eadae8a630a3fda9281d6d6253dea")); + Arrays.asList("6fc32ca9de769060f3c2a3d94f8f2f91")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index b63c591ce..21b7d0f86 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -74,7 +74,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "c5939a7f5f85ea2fe994ce912732e180"); + testReducedCalling("INDEL", "38c3d14cb9086f7355788d3db9b8ff16"); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 4204a0634..a891220c5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "73817d9173b8d9d05dac1f3092871f33"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", 
"7b67ac6213b7a6f759057fb9d7148fdc"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "8a110549543412fa682419e9a8f0dd1d"); + "eb41ed6f1d692368a0f67311d139a38a"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "5429c234d471434adc09d9e60b87de24"); + "c4c33c962aca12c51def9b8cde35b7d2"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 292bea50d..51c3296ac 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -47,15 +47,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broad.tribble.TribbleIndexedFeatureReader; import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.vcf.VCFCodec; import org.testng.annotations.Test; import java.io.File; @@ -80,12 +77,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - 
HCTest(CEUTRIO_BAM, "", "008958c211a8a439a7213a96f3dd7f6c"); + HCTest(CEUTRIO_BAM, "", "f132843e3c8e065a783cc4fdf9ee5df3"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "3b60c6133eeadfea028dffea93b88478"); + HCTest(NA12878_BAM, "", "15e0201f5c478310d278d2d03483c152"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -96,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "70bd5d0805bf6f51e5f61b377526c979"); + "48d309aed0cdc40cc983eeb5a8d12f53"); } @Test @@ -112,7 +109,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4141b4c24a136a3fe4c0b0a4c231cdfa"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "34c7fcfe17a1d835e2dc403df9eb3591"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -149,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "b9d614efdaf38b87b459df421aab93a7"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "eae65d20836d6c6ebca9e25e33566f74"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -159,14 +156,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new 
WalkerTestSpec(base, Arrays.asList("35a8edeca7518835d67a10de21493eca")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a3d74040a4966bf7a04cbd4924970685")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c81d7e69dd4116890f06a71b19870300")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("40da88ed3722c512264b72db37f18720")); executeTest("HCTestStructuralIndels: ", spec); } @@ -188,7 +185,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("f0a215faed194dc160f19e26293e85f8")); + Arrays.asList("69b83d578c14ed32d08ce4e7ff8a8a18")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -196,7 +193,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("bea274584344fa6b4b0f98eee327bad8")); + Arrays.asList("0cae60d86a3f86854699217a30ece3e3")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index 96eaa109e..f9a4985b0 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "9a1202d849653f0480932f450ec507b4", nt, nct }); + tests.add(new Object[]{ "BOTH", "aad3a398273ec795e363268997247bd8", nt, nct }); } return tests.toArray(new Object[][]{}); diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 6dbcd0220..2499183a6 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -142,11 +142,11 @@ public class PairHMMUnitTest extends BaseTest { } public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) { - pairHMM.initialize(refBasesWithContext.length, readBasesWithContext.length); + pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length); return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( refBasesWithContext, readBasesWithContext, qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), - qualAsBytes(gcp, false, anchorIndel), 0, true); + qualAsBytes(gcp, false, anchorIndel), true); } private byte[] asBytes(final String bases, final boolean left, final boolean right) { @@ -268,8 +268,8 @@ public class PairHMMUnitTest extends 
BaseTest { if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) { final double exactLogL = cfg.calcLogL( exactHMM, true ); for ( final PairHMM hmm : getHMMs() ) { - double actualLogL = cfg.calcLogL( hmm, true ); - double expectedLogL = cfg.expectedLogL(); + final double actualLogL = cfg.calcLogL( hmm, true ); + final double expectedLogL = cfg.expectedLogL(); // compare to our theoretical expectation with appropriate tolerance Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm); @@ -283,10 +283,10 @@ public class PairHMMUnitTest extends BaseTest { @Test(enabled = !DEBUG, dataProvider = "OptimizedLikelihoodTestProvider") public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) { if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) { - double exactLogL = cfg.calcLogL( exactHMM, false ); + final double exactLogL = cfg.calcLogL( exactHMM, false ); for ( final PairHMM hmm : getHMMs() ) { - double calculatedLogL = cfg.calcLogL( hmm, false ); + final double calculatedLogL = cfg.calcLogL( hmm, false ); // compare to the exact reference implementation with appropriate tolerance Assert.assertEquals(calculatedLogL, exactLogL, cfg.getTolerance(hmm), String.format("Test: logL calc=%.2f expected=%.2f for %s with hmm %s", calculatedLogL, exactLogL, cfg.toString(), hmm)); Assert.assertTrue(MathUtils.goodLog10Probability(calculatedLogL), "Bad log10 likelihood " + calculatedLogL); @@ -296,64 +296,55 @@ public class PairHMMUnitTest extends BaseTest { @Test(enabled = !DEBUG) public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() { - byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); - + final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); + final byte matchQual = 90; + final byte mismatchQual = 20; + final byte indelQual = 80; final int offset = 2; - byte[] gop = new byte[haplotype1.length - 2 * offset]; - Arrays.fill(gop, (byte) 
80); - byte[] gcp = new byte[haplotype1.length - 2 * offset]; - Arrays.fill(gcp, (byte) 80); + final byte[] gop = new byte[haplotype1.length - 2 * offset]; + Arrays.fill(gop, indelQual); + final byte[] gcp = new byte[haplotype1.length - 2 * offset]; + Arrays.fill(gcp, indelQual); + loglessHMM.initialize(gop.length, haplotype1.length); for( int k = 0; k < haplotype1.length - 2 * offset; k++ ) { - byte[] quals = new byte[haplotype1.length - 2 * offset]; - Arrays.fill(quals, (byte) 90); - // one read mismatches the haplotype - quals[k] = 20; - - byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); + final byte[] quals = new byte[haplotype1.length - 2 * offset]; + Arrays.fill(quals, matchQual); + // one base mismatches the haplotype + quals[k] = mismatchQual; + final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - originalHMM.initialize(haplotype1.length, mread.length); - double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( - haplotype1, mread, - quals, gop, gop, - gcp, 0, false); - - System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); - - final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20)); + final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false); + final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual)); Assert.assertEquals(res1, expected, 1e-2); } } @Test(enabled = ! 
DEBUG) public void testMismatchInEveryPositionInTheRead() { - byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); + final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); + final byte matchQual = 90; + final byte mismatchQual = 20; + final byte indelQual = 80; final int offset = 2; - byte[] gop = new byte[haplotype1.length - offset]; - Arrays.fill(gop, (byte) 80); - byte[] gcp = new byte[haplotype1.length - offset]; - Arrays.fill(gcp, (byte) 80); + final byte[] gop = new byte[haplotype1.length - offset]; + Arrays.fill(gop, indelQual); + final byte[] gcp = new byte[haplotype1.length - offset]; + Arrays.fill(gcp, indelQual); + loglessHMM.initialize(gop.length, haplotype1.length); for( int k = 0; k < haplotype1.length - offset; k++ ) { - byte[] quals = new byte[haplotype1.length - offset]; - Arrays.fill(quals, (byte) 90); - // one read mismatches the haplotype - quals[k] = 20; - - byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); + final byte[] quals = new byte[haplotype1.length - offset]; + Arrays.fill(quals, matchQual); + // one base mismatches the haplotype with low qual + quals[k] = mismatchQual; + final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? 
(byte)'T' : (byte)'C'); - originalHMM.initialize(haplotype1.length, mread.length); - double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( - haplotype1, mread, - quals, gop, gop, - gcp, 0, false); - - System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); - - final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20)); + final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false); + final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual)); Assert.assertEquals(res1, expected, 1e-2); } } @@ -376,35 +367,35 @@ public class PairHMMUnitTest extends BaseTest { @Test(enabled = !DEBUG, dataProvider = "HMMProvider") void testMultipleReadMatchesInHaplotype(final PairHMM hmm, final int readSize, final int refSize) { - byte[] readBases = Utils.dupBytes((byte)'A', readSize); - byte[] refBases = ("CC" + new String(Utils.dupBytes((byte)'A', refSize)) + "GGA").getBytes(); - byte baseQual = 20; - byte insQual = 37; - byte delQual = 37; - byte gcp = 10; - hmm.initialize(refBases.length, readBases.length); - double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, + final byte[] readBases = Utils.dupBytes((byte)'A', readSize); + final byte[] refBases = ("CC" + new String(Utils.dupBytes((byte)'A', refSize)) + "GGA").getBytes(); + final byte baseQual = 20; + final byte insQual = 37; + final byte delQual = 37; + final byte gcp = 10; + hmm.initialize(readBases.length, refBases.length); + final double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), 0, true); 
+ Utils.dupBytes(gcp, readBases.length), true); Assert.assertTrue(d <= 0.0, "Likelihoods should be <= 0 but got "+ d); } @Test(enabled = !DEBUG, dataProvider = "HMMProvider") void testAllMatchingRead(final PairHMM hmm, final int readSize, final int refSize) { - byte[] readBases = Utils.dupBytes((byte)'A', readSize); - byte[] refBases = Utils.dupBytes((byte)'A', refSize); - byte baseQual = 20; - byte insQual = 100; - byte delQual = 100; - byte gcp = 100; - hmm.initialize(refBases.length, readBases.length); + final byte[] readBases = Utils.dupBytes((byte)'A', readSize); + final byte[] refBases = Utils.dupBytes((byte)'A', refSize); + final byte baseQual = 20; + final byte insQual = 100; + final byte delQual = 100; + final byte gcp = 100; + hmm.initialize(readBases.length, refBases.length); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), 0, true); + Utils.dupBytes(gcp, readBases.length), true); double expected = 0; final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length; if (readBases.length < refBases.length) { @@ -439,45 +430,42 @@ public class PairHMMUnitTest extends BaseTest { @Test(enabled = !DEBUG, dataProvider = "HMMProviderWithBigReads") void testReallyBigReads(final PairHMM hmm, final String read, final String ref) { - byte[] readBases = read.getBytes(); - byte[] refBases = ref.getBytes(); - byte baseQual = 30; - byte insQual = 40; - byte delQual = 40; - byte gcp = 10; - hmm.initialize(refBases.length, readBases.length); - double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, + final byte[] readBases = read.getBytes(); + final byte[] refBases = ref.getBytes(); + final byte baseQual = 30; + final byte insQual = 40; + final byte delQual = 40; + final byte gcp = 10; + 
hmm.initialize(readBases.length, refBases.length); + hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), 0, true); - Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d +" was bad for a read with " + read.length() + " bases and ref with " + ref.length() + " bases"); + Utils.dupBytes(gcp, readBases.length), true); } @Test(enabled = !DEBUG) void testPreviousBadValue() { - byte[] readBases = "A".getBytes(); - byte[] refBases = "AT".getBytes(); - byte baseQual = 30; - byte insQual = 40; - byte delQual = 40; - byte gcp = 10; + final byte[] readBases = "A".getBytes(); + final byte[] refBases = "AT".getBytes(); + final byte baseQual = 30; + final byte insQual = 40; + final byte delQual = 40; + final byte gcp = 10; - exactHMM.initialize(refBases.length, readBases.length); - double d = exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, + exactHMM.initialize(readBases.length, refBases.length); + exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), 0, true); - //exactHMM.dumpMatrices(); + Utils.dupBytes(gcp, readBases.length), true); - loglessHMM.initialize(refBases.length, readBases.length); - double logless = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, + loglessHMM.initialize(readBases.length, refBases.length); + loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), 0, true); -// loglessHMM.dumpMatrices(); + Utils.dupBytes(gcp, readBases.length), 
true); } @DataProvider(name = "JustHMMProvider") @@ -493,25 +481,16 @@ public class PairHMMUnitTest extends BaseTest { @Test(enabled = !DEBUG, dataProvider = "JustHMMProvider") void testMaxLengthsBiggerThanProvidedRead(final PairHMM hmm) { + final byte[] readBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAACA".getBytes(); + final byte[] refBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAAAACTTCTGAGAAAAAAAAAAAAAATTAAATCAAACCCTGATTCCTTAAAGGTAGTAAAAAAACATCATTCTTTCTTAGTGGAATAGAAACTAGGTCAAAAGAACAGTGATTC".getBytes(); + + final byte[] quals = new byte[]{35,34,31,32,35,34,32,31,36,30,31,32,36,34,33,32,32,32,33,32,30,35,33,35,36,36,33,33,33,32,32,32,37,33,36,35,33,32,34,31,36,35,35,35,35,33,34,31,31,30,28,27,26,29,26,25,29,29}; + final byte[] insQual = new byte[]{46,46,46,46,46,47,45,46,45,48,47,44,45,48,46,43,43,42,48,48,45,47,47,48,48,47,48,45,38,47,45,39,47,48,47,47,48,46,49,48,49,48,46,47,48,44,44,43,39,32,34,36,46,48,46,44,45,45}; + final byte[] delQual = new byte[]{44,44,44,43,45,44,43,42,45,46,45,43,44,47,45,40,40,40,45,46,43,45,45,44,46,46,46,43,35,44,43,36,44,45,46,46,44,44,47,43,47,45,45,45,46,45,45,46,44,35,35,35,45,47,45,44,44,43}; + final byte[] gcp = Utils.dupBytes((byte) 10, delQual.length); + hmm.initialize(readBases.length + 100, refBases.length + 100); for ( int nExtraMaxSize = 0; nExtraMaxSize < 100; nExtraMaxSize++ ) { - byte[] readBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAACA".getBytes(); - byte[] refBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAAAACTTCTGAGAAAAAAAAAAAAAATTAAATCAAACCCTGATTCCTTAAAGGTAGTAAAAAAACATCATTCTTTCTTAGTGGAATAGAAACTAGGTCAAAAGAACAGTGATTC".getBytes(); - byte gcp = 10; - - byte[] quals = new byte[]{35,34,31,32,35,34,32,31,36,30,31,32,36,34,33,32,32,32,33,32,30,35,33,35,36,36,33,33,33,32,32,32,37,33,36,35,33,32,34,31,36,35,35,35,35,33,34,31,31,30,28,27,26,29,26,25,29,29}; - byte[] insQual = new 
byte[]{46,46,46,46,46,47,45,46,45,48,47,44,45,48,46,43,43,42,48,48,45,47,47,48,48,47,48,45,38,47,45,39,47,48,47,47,48,46,49,48,49,48,46,47,48,44,44,43,39,32,34,36,46,48,46,44,45,45}; - byte[] delQual = new byte[]{44,44,44,43,45,44,43,42,45,46,45,43,44,47,45,40,40,40,45,46,43,45,45,44,46,46,46,43,35,44,43,36,44,45,46,46,44,44,47,43,47,45,45,45,46,45,45,46,44,35,35,35,45,47,45,44,44,43}; - - final int maxHaplotypeLength = refBases.length + nExtraMaxSize; - final int maxReadLength = readBases.length + nExtraMaxSize; - - hmm.initialize(maxHaplotypeLength, maxReadLength); - double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - quals, - insQual, - delQual, - Utils.dupBytes(gcp, readBases.length), 0, true); - Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d +" was bad for a read with " + readBases.length + " bases and ref with " + refBases.length + " bases"); + hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, quals, insQual, delQual, gcp, true); } } @@ -551,7 +530,7 @@ public class PairHMMUnitTest extends BaseTest { final int maxHaplotypeLength = prefix.length() + root1.length(); // the initialization occurs once, at the start of the evalution of reads - hmm.initialize(maxHaplotypeLength, maxReadLength); + hmm.initialize(maxReadLength, maxHaplotypeLength); for ( int prefixStart = prefix.length(); prefixStart >= 0; prefixStart-- ) { final String myPrefix = prefix.substring(prefixStart, prefix.length()); @@ -574,9 +553,7 @@ public class PairHMMUnitTest extends BaseTest { final byte[] insQuals = Utils.dupBytes((byte)45, readBases.length); final byte[] delQuals = Utils.dupBytes((byte)40, readBases.length); final byte[] gcp = Utils.dupBytes((byte)10, readBases.length); - double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( - hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp, - hapStart, recache); + double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(hap.getBytes(), readBases, baseQuals, 
insQuals, delQuals, gcp, recache); Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d + " was bad for read " + read + " and ref " + hap + " with hapStart " + hapStart); return d; } @@ -629,7 +606,7 @@ public class PairHMMUnitTest extends BaseTest { // didn't call initialize => should exception out double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - baseQuals, baseQuals, baseQuals, baseQuals, 0, true); + baseQuals, baseQuals, baseQuals, baseQuals, true); } @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider") @@ -640,7 +617,7 @@ public class PairHMMUnitTest extends BaseTest { hmm.initialize(3, 3); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - baseQuals, baseQuals, baseQuals, baseQuals, 0, true); + baseQuals, baseQuals, baseQuals, baseQuals, true); } @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider") @@ -649,8 +626,8 @@ public class PairHMMUnitTest extends BaseTest { byte[] refBases = "AAAT".getBytes(); byte[] baseQuals = Utils.dupBytes((byte)30, readBases.length); - hmm.initialize(3, 2); + hmm.initialize(2, 3); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - baseQuals, baseQuals, baseQuals, baseQuals, 0, true); + baseQuals, baseQuals, baseQuals, baseQuals, true); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java index d7c55e37c..ab6c321e8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.pairhmm; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; 
import org.broadinstitute.sting.utils.QualityUtils; @@ -34,15 +35,22 @@ import java.util.Arrays; /** * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * - * User: rpoplin + * User: rpoplin, carneiro * Date: 3/1/12 */ -public class Log10PairHMM extends PairHMM { +public final class Log10PairHMM extends PairHMM { /** * Should we use exact log10 calculation (true), or an approximation (false)? */ private final boolean doExactLog10; + private static final int matchToMatch = 0; + private static final int indelToMatch = 1; + private static final int matchToInsertion = 2; + private static final int insertionToInsertion = 3; + private static final int matchToDeletion = 4; + private static final int deletionToDeletion = 5; + /** * Create an uninitialized PairHMM * @@ -64,14 +72,17 @@ public class Log10PairHMM extends PairHMM { * {@inheritDoc} */ @Override - public void initialize( final int haplotypeMaxLength, final int readMaxLength) { - super.initialize(haplotypeMaxLength, readMaxLength); + public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, haplotypeMaxLength); for( int iii=0; iii < paddedMaxReadLength; iii++ ) { Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); } + + transition = new double[paddedMaxReadLength][6]; + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** @@ -86,38 +97,91 @@ public class Log10PairHMM extends PairHMM { final byte[] overallGCP, final int hapStartIndex, final boolean recacheReadValues ) { - // the initial condition -- must be in subComputeReadLikelihoodGivenHaplotypeLog10 because it needs that actual - // read and haplotypes, not the maximum - final double initialValue = Math.log10((double) 1/haplotypeBases.length); - matchMatrix[1][1] = initialValue; - // M, X, and Y arrays are of size read 
and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // ensure that all the qual scores have valid values - for( int iii = 0; iii < readQuals.length; iii++ ) { - readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); - } - - // simple rectangular version of update loop, slow - for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { - for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { - if( (iii == 1 && jjj == 1) ) { continue; } - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, - matchMatrix, insertionMatrix, deletionMatrix); + if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { + // set the initial value (free deletions in the beginning) for the first row in the deletion matrix + final double initialValue = Math.log10(1.0 / haplotypeBases.length); + for( int j = 0; j < paddedHaplotypeLength; j++ ) { + deletionMatrix[0][j] = initialValue; } } - // final probability is the log10 sum of the last element in all three state arrays - final int endI = X_METRIC_LENGTH - 1; - double result = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); - for (int j = 2; j < Y_METRIC_LENGTH; j++) - result = myLog10SumLog10(new double[]{result, matchMatrix[endI][j], insertionMatrix[endI][j]}); + if ( ! 
constantsAreInitialized || recacheReadValues ) + initializeProbabilities(insertionGOP, deletionGOP, overallGCP); + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); - return result; + for (int i = 1; i < paddedReadLength; i++) { + // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based + for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) { + updateCell(i, j, prior[i][j], transition[i]); + } + } + + // final probability is the log10 sum of the last element in the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. + final int endI = paddedReadLength - 1; + double finalSumProbabilities = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); + for (int j = 2; j < paddedHaplotypeLength; j++) + finalSumProbabilities = myLog10SumLog10(new double[]{finalSumProbabilities, matchMatrix[endI][j], insertionMatrix[endI][j]}); + + return finalSumProbabilities; } + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. 
+ + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + for (int i = 0; i < insertionGOP.length; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + transition[i+1][matchToMatch] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); + transition[i+1][indelToMatch] = QualityUtils.qualToProbLog10(overallGCP[i]); + transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]); + transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]); + transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + } + + // note that we initialized the constants + constantsAreInitialized = true; + } + + /** * Compute the log10SumLog10 of the values * @@ -136,37 +200,24 @@ public class Log10PairHMM extends PairHMM { return doExactLog10 ? 
MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values); } - private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, - final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final double[][] matchMatrix, final double[][] insertionMatrix, final double[][] deletionMatrix ) { + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions - // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions - final int im1 = indI - 1; - final int jm1 = indJ - 1; + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param transition an array with the six transition relevant to this location + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { - // update the match array - double pBaseReadLog10 = 0.0; // Math.log10(1.0); - if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state - final byte x = readBases[im1-1]; - final byte y = haplotypeBases[jm1-1]; - final byte qual = readQuals[im1-1]; - pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); - } - final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); - final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); - final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); - matchMatrix[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + d0, insertionMatrix[indI - 1][indJ - 1] + e0, deletionMatrix[indI - 1][indJ - 1] + e0}); - - // update the X (insertion) array - final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); - final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - insertionMatrix[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ] + d1, insertionMatrix[indI - 1][indJ] + e1}); - - // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype - final double d2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]); - final double e2 = ( im1 == 0 ) ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]); - final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - deletionMatrix[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMatrix[indI][indJ - 1] + d2, deletionMatrix[indI][indJ - 1] + e2}); + matchMatrix[indI][indJ] = prior + + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + transition[matchToMatch], + insertionMatrix[indI - 1][indJ - 1] + transition[indelToMatch], + deletionMatrix[indI - 1][indJ - 1] + transition[indelToMatch]}); + insertionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI - 1][indJ] + transition[matchToInsertion], insertionMatrix[indI - 1][indJ] + transition[insertionToInsertion]}); + deletionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI][indJ - 1] + transition[matchToDeletion], deletionMatrix[indI][indJ - 1] + transition[deletionToDeletion]}); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index f71819a69..33cd191f6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -40,9 +40,11 @@ import java.util.Arrays; public abstract class PairHMM { protected final static Logger logger = Logger.getLogger(PairHMM.class); - protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; - protected static final byte DEFAULT_GOP = (byte) 45; - protected static final byte DEFAULT_GCP = (byte) 10; + protected double[][] transition = null; // The transition probabilities cache + protected double[][] prior = null; // The prior probabilities cache + protected boolean constantsAreInitialized = false; + + protected byte[] previousHaplotypeBases; public enum HMM_IMPLEMENTATION { /* Very slow implementation which uses very accurate log10 sum functions. 
Only meant to be used as a reference test implementation */ @@ -58,14 +60,18 @@ public abstract class PairHMM { protected double[][] deletionMatrix = null; protected int maxHaplotypeLength, maxReadLength; protected int paddedMaxReadLength, paddedMaxHaplotypeLength; + protected int paddedReadLength, paddedHaplotypeLength; private boolean initialized = false; /** * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths + * + * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. + * * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM * @param readMaxLength the max length of reads we want to use with this PairHMM */ - public void initialize( final int haplotypeMaxLength, final int readMaxLength ) { + public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); @@ -73,15 +79,21 @@ public abstract class PairHMM { maxReadLength = readMaxLength; // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - paddedMaxReadLength = readMaxLength + 2; - paddedMaxHaplotypeLength = haplotypeMaxLength + 2; + paddedMaxReadLength = readMaxLength + 1; + paddedMaxHaplotypeLength = haplotypeMaxLength + 1; matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + previousHaplotypeBases = null; + + constantsAreInitialized = false; initialized = true; } + + /** * Compute the total 
probability of read arising from haplotypeBases given base substitution, insertion, and deletion * probabilities. @@ -98,8 +110,6 @@ public abstract class PairHMM { * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases * @param overallGCP the phred-scaled gap continuation penalties scores of read. Must be the same length as readBases - * @param hapStartIndex start the hmm calculation at this offset in haplotype bases. Used in the caching calculation - * where multiple haplotypes are used, and they only diff starting at hapStartIndex * @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated * parameters are the same, and only the haplotype bases are changing underneath us * @return the log10 probability of read coming from the haplotype under the provided error model @@ -110,7 +120,6 @@ public abstract class PairHMM { final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final int hapStartIndex, final boolean recacheReadValues ) { if ( ! 
initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); @@ -121,14 +130,22 @@ public abstract class PairHMM { if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length); if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length); if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length); - if ( hapStartIndex < 0 || hapStartIndex > haplotypeBases.length ) throw new IllegalArgumentException("hapStartIndex is bad, must be between 0 and haplotype length " + haplotypeBases.length + " but got " + hapStartIndex); + + paddedReadLength = readBases.length + 1; + paddedHaplotypeLength = haplotypeBases.length + 1; + + final int hapStartIndex = (previousHaplotypeBases == null || haplotypeBases.length != previousHaplotypeBases.length ) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, previousHaplotypeBases); double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues); - if ( MathUtils.goodLog10Probability(result) ) - return result; - else + if ( ! MathUtils.goodLog10Probability(result) ) throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); + + // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). 
+ // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. + previousHaplotypeBases = haplotypeBases; + + return result; } /** From c9d3c67a9b02e4f261400e9d08d12557dadd07e2 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 8 Apr 2013 11:44:22 -0400 Subject: [PATCH 137/226] Small Queue/scala improvements, and commiting pipeline scripts developed for ancient DNA processing for posterity: -- Picard extension so Queue scripts can use FastqToSam -- Single-sample BAM processing: merge/trim reads + BWA + IR + MD + BQSR. Mostly identical to standard pipeline, except for the adaptor trimming/merging which is critical for short-insert libraries. -- Single-sample calling (experimental, work in progress): standard UG run but outputting at all sites, meant for deep whole genomes. New scripts --- .../queue/extensions/picard/FastqToSam.scala | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala new file mode 100644 index 000000000..7b9e657bf --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/FastqToSam.scala @@ -0,0 +1,104 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above 
copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.queue.extensions.picard + +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +import org.broadinstitute.sting.commandline._ + +import java.io.File + +class FastqToSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction /*with PicardBamFunction*/ { + analysisName = "FastqToSam" + javaMainClass = "net.sf.picard.sam.FastqToSam" + + @Input(shortName = "fq1", fullName = "input_fq_file1", required = true, doc = "Input Fastq file to extract reads from (single-end fastq or, if paired, first end of the pair fastq)") + var fastq: File = _ + + @Input(shortName = "fq2", fullName = "input_fq_file2", required = false, doc = "Input Fastq file to extract reads from (if paired, second end of the pair fastq).") + var secondEndFastQ: File = _ + + @Output(shortName = "bam", fullName = "output_bam_file", required = true, doc = "Output bam file .") + var bam: File = _ + + @Argument(shortName = "SM", fullName = "SM", required = false, doc = "SM") + var SM: String = "SM" + + @Argument(shortName = "LIB", fullName = "LIB", required = false, doc = "LIB") + var LIB: String = "LIB" + + @Argument(shortName = "PU", fullName = "PU", required = false, doc = "PU") + var PU: String = "PU" + + @Argument(shortName = "RG", fullName = "RG", required = false, doc = "RG") + var RG: String = "RG" + + @Argument(shortName = "PL", fullName = "PL", required = false, doc = "PL") + var PL: String = "illumina" + + @Argument(shortName = "CN", fullName = "CN", required = false, doc = "CN") + var CN: String = "CN" + + +// override def inputBams = Seq(fastq) +// override def outputBam = bam +// this.sortOrder = null + val createIndex:Boolean = true + override def commandLine = super.commandLine + + required("FASTQ=" + fastq) + + optional("FASTQ2=", secondEndFastQ, spaceSeparated=false) + + required("OUTPUT=" + bam) + + optional("READ_GROUP_NAME=", RG, spaceSeparated=false) + + required("SAMPLE_NAME=" + SM) + + optional("LIBRARY_NAME=", LIB, spaceSeparated=false) + + optional("PLATFORM_UNIT=", PU, spaceSeparated=false) + + optional("PLATFORM=", PL, 
spaceSeparated=false) + + optional("CREATE_INDEX=", createIndex, spaceSeparated=false) + + optional("SEQUENCING_CENTER=", CN, spaceSeparated=false) +} \ No newline at end of file From b7d59ea13bfc1602a3152269853979d2d1f98d3a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 1 Apr 2013 10:43:43 -0400 Subject: [PATCH 138/226] LIBS unit test debugging should be false --- .../utils/locusiterator/LocusIteratorByStateUnitTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index fd87c1c12..d2f29ee7a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -51,7 +51,7 @@ import java.util.*; * testing of the new (non-legacy) version of LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { - private static final boolean DEBUG = true; + private static final boolean DEBUG = false; protected LocusIteratorByState li; @Test(enabled = true) @@ -361,7 +361,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { // comprehensive LIBS/PileupElement tests // //////////////////////////////////////////// - @DataProvider(name = "LIBSTest") + @DataProvider(name = "MyLIBSTest") public Object[][] makeLIBSTest() { final List tests = new LinkedList(); @@ -377,7 +377,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { // Arrays.asList(3)); } - @Test(enabled = true && ! DEBUG, dataProvider = "LIBSTest") + @Test(enabled = ! 
DEBUG, dataProvider = "MyLIBSTest") public void testLIBS(LIBSTest params) { // create the iterator by state with the fake reads and fake records final GATKSAMRecord read = params.makeRead(); From bff13bb5c56035276646711b318ac479e30074cc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Mar 2013 08:34:28 -0400 Subject: [PATCH 139/226] Move Haplotype class to its own package in utils --- .../gatk/walkers/genotyper/ErrorModel.java | 2 +- ...GeneralPloidyIndelGenotypeLikelihoods.java | 2 +- ...elGenotypeLikelihoodsCalculationModel.java | 1 + ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../haplotypecaller/DeBruijnAssembler.java | 2 +- .../haplotypecaller/GenotypingEngine.java | 20 +++--- .../haplotypecaller/HaplotypeCaller.java | 1 + .../haplotypecaller/HaplotypeResolver.java | 2 +- .../LikelihoodCalculationEngine.java | 2 +- .../haplotypecaller/LocalAssemblyEngine.java | 2 +- .../indels/HaplotypeIndelErrorModel.java | 2 +- .../indels/PairHMMIndelErrorModel.java | 2 +- .../DeBruijnAssemblerUnitTest.java | 2 +- .../GenotypingEngineUnitTest.java | 1 + .../KMerErrorCorrectorUnitTest.java | 67 +++++++------------ .../LikelihoodCalculationEngineUnitTest.java | 4 -- .../utils/{ => haplotype}/Haplotype.java | 3 +- .../AllHaplotypeBAMWriter.java | 5 +- .../CalledHaplotypeBAMWriter.java | 2 +- .../HaplotypeBAMWriter.java | 2 +- .../{ => haplotype}/HaplotypeUnitTest.java | 4 +- .../HaplotypeBAMWriterUnitTest.java | 2 +- 22 files changed, 54 insertions(+), 78 deletions(-) rename public/java/src/org/broadinstitute/sting/utils/{ => haplotype}/Haplotype.java (99%) rename public/java/test/org/broadinstitute/sting/utils/{ => haplotype}/HaplotypeUnitTest.java (98%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index 49494ebb0..7ce736b0c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import com.google.java.contract.Requires; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index c957bb9db..2f2a93fa4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index 
bd25fb6c5..9c4694955 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -53,6 +53,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.variant.variantcontext.*; import java.util.*; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 858a3370b..8a766ba48 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -55,7 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 198abeac8..9bc0713c0 100644 
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -55,7 +55,7 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index ee9993b4f..34d81d405 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -58,9 +58,9 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.variant.variantcontext.*; import java.io.PrintStream; @@ -697,15 +697,6 @@ public class GenotypingEngine { return eventAllelesForSample; } - protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { - for( final VariantContext vc : list ) { - if( 
vc.hasSameAllelesAs(vcToTest) ) { - return true; - } - } - return false; - } - protected static Map generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) { final Map vcs = new LinkedHashMap(); @@ -794,6 +785,15 @@ public class GenotypingEngine { return vcs; } + protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { + for( final VariantContext vc : list ) { + if( vc.hasSameAllelesAs(vcToTest) ) { + return true; + } + } + return false; + } + protected static class Event { public VariantContext vc; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index da077ff02..d77caa2a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -77,6 +77,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index facc929cd..03af9b59b 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -58,7 +58,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.variant.vcf.VCFHeader; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 4ea2498c4..df1c9aabc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -50,7 +50,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index c31405872..23cbc3265 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -47,7 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.variant.variantcontext.VariantContext; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index f7686bdf5..cd4ea778d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -47,7 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 4c5490395..a1ce5afdb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -48,7 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import com.google.java.contract.Ensures; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; 
-import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index 86d331dae..59d13dee4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -57,7 +57,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java index 8b09e91ae..2be42337d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -56,6 +56,7 @@ import net.sf.picard.reference.ReferenceSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import 
org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java index a4edfcacc..f8a540b70 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java @@ -1,48 +1,27 @@ /* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java index 58f9a2e74..48c9d3c1a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java @@ -53,14 +53,10 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; */ import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; - /** * Unit tests for LikelihoodCalculationEngine */ diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java similarity index 99% rename from public/java/src/org/broadinstitute/sting/utils/Haplotype.java rename to public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java index 070ae4f5d..6dc223616 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.haplotype; import com.google.java.contract.Requires; import net.sf.samtools.Cigar; @@ -31,6 +31,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java index f6fa44ac5..9936bd9ab 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java @@ -26,11 +26,8 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java index aae00c3ea..08b4fff7c 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index c0d3b38fa..c80287bca 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -28,7 +28,7 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java similarity index 98% rename from public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java index 0e4ec2b63..fe02aea9f 100644 --- 
a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java @@ -23,15 +23,15 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.haplotype; -import net.sf.picard.util.CigarUtil; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java index 43969c7a0..89d87a3c3 100644 --- a/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; import net.sf.samtools.*; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; From 0310499b656f4a6bc41b46c64abaabd5e473d984 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Mar 2013 09:36:14 -0400 Subject: [PATCH 140/226] System to merge multiple nearby alleles into block substitutions -- Block substitution algorithm that merges nearby events based on distance. 
-- Also does some cleanup of GenotypingEngine --- .../haplotypecaller/DeBruijnAssembler.java | 2 +- .../haplotypecaller/GenotypingEngine.java | 339 +++++++----------- .../haplotypecaller/HaplotypeResolver.java | 4 +- .../GenotypingEngineUnitTest.java | 3 +- .../sting/utils/haplotype/EventExtractor.java | 307 ++++++++++++++++ .../sting/utils/haplotype/Haplotype.java | 12 +- .../sting/utils/sam/AlignmentUtils.java | 61 ++++ .../variant/GATKVariantContextUtils.java | 17 + .../haplotype/EventExtractorUnitTest.java | 171 +++++++++ 9 files changed, 707 insertions(+), 209 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 9bc0713c0..1fd2b9c00 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -432,7 +432,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // for GGA mode, add the desired allele into the haplotype if it isn't already present if( !activeAllelesToGenotype.isEmpty() ) { - final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place + final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final 
VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 34d81d405..8e76b6ea6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -58,6 +58,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.EventExtractor; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; @@ -72,7 +73,6 @@ public class GenotypingEngine { private final boolean DEBUG; private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied - private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); private final VariantAnnotatorEngine annotationEngine; public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ) { @@ -145,99 +145,26 @@ public class GenotypingEngine { final GenomeLocParser genomeLocParser, final List activeAllelesToGenotype ) { // sanity check input arguments - if (UG_engine == null) - throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); - if (haplotypes == null || haplotypes.isEmpty()) - throw new IllegalArgumentException("haplotypes input should be non-empty 
and non-null, got "+haplotypes); - if (samples == null || samples.isEmpty()) - throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples); - if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) - throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); - if (ref == null || ref.length == 0 ) - throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); - if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) - throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); - if (activeRegionWindow == null ) - throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); - if (activeAllelesToGenotype == null ) - throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); - if (genomeLocParser == null ) - throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); + if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); + if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); + if (samples == null || samples.isEmpty()) throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples); + if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); + if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); + if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, 
got "+refLoc); + if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); + if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); + if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); - final List returnCalls = new ArrayList(); - final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); - int count = 0; - if( DEBUG ) { logger.info("=== Best Haplotypes ==="); } - for( final Haplotype h : haplotypes ) { - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); } - if( DEBUG ) { - logger.info(h.toString()); - logger.info("> Cigar = " + h.getCigar()); - logger.info(">> Events = " + h.getEventMap()); - } - } - - cleanUpSymbolicUnassembledEvents( haplotypes ); - if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure - mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc ); - cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events - } - if( in_GGA_mode ) { - for( final VariantContext compVC : activeAllelesToGenotype ) { - startPosKeySet.add( compVC.getStart() ); - } - } - - final Set calledHaplotypes = new HashSet(); + // update the haplotypes so we're ready to call, getting the ordered list of positions on the 
reference + // that carry events among the haplotypes + final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, samples, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); // Walk along each position in the key set and create each event to be outputted + final Set calledHaplotypes = new HashSet(); + final List returnCalls = new ArrayList(); for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region - final List eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view - final List priorityList = new ArrayList(); // used to merge overlapping events into common reference view - - if( !in_GGA_mode ) { - for( final Haplotype h : haplotypes ) { - final Map eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - priorityList.add(vc.getSource()); - } - } - } else { // we are in GGA mode! 
- int compCount = 0; - for( final VariantContext compVC : activeAllelesToGenotype ) { - if( compVC.getStart() == loc ) { - int alleleCount = 0; - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - List alleleSet = new ArrayList(2); - alleleSet.add(compVC.getReference()); - alleleSet.add(compAltAllele); - final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; - // check if this event is already in the list of events due to a repeat in the input alleles track - final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); - boolean alreadyExists = false; - for( final VariantContext eventToTest : eventsAtThisLoc ) { - if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { - alreadyExists = true; - } - } - if( !alreadyExists ) { - priorityList.add(vcSourceName); - eventsAtThisLoc.add(candidateEventToAdd); - } - alleleCount++; - } - } - compCount++; - } - } + final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); if( eventsAtThisLoc.isEmpty() ) { continue; } @@ -245,7 +172,7 @@ public class GenotypingEngine { final Map> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes); // Sanity check the priority list for mistakes - validatePriorityList( priorityList, eventsAtThisLoc ); + final List priorityList = makePriorityList(eventsAtThisLoc); // Merge the event to find a common reference representation final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); @@ -264,7 +191,6 @@ public class GenotypingEngine { if( DEBUG ) { logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - //System.out.println("Event/haplotype allele mapping = " + alleleMapper); } final Map alleleReadMap = 
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); @@ -277,7 +203,6 @@ public class GenotypingEngine { final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); VariantContext annotatedCall = call; - // TODO -- should be before annotated call, so that QDL works correctly if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); } @@ -295,6 +220,117 @@ public class GenotypingEngine { return new CalledHaplotypes(returnCalls, calledHaplotypes); } + /** + * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts + * + * @param haplotypes the list of haplotypes we're working with + * @param samples the samples we're working with + * @param haplotypeReadMap map from samples -> the per read allele likelihoods + * @param ref the reference bases (over the same interval as the haplotypes) + * @param refLoc the span of the reference bases + * @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode) + * @return + */ + private TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, + final List samples, + final Map haplotypeReadMap, + final byte[] ref, + final GenomeLoc refLoc, + final List activeAllelesToGenotype) { + final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); + int hapNumber = 0; + + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = new TreeSet(); + + if( DEBUG ) logger.info("=== Best Haplotypes ==="); + for( final Haplotype h : haplotypes ) { + // Walk along the alignment and turn any difference from the reference into an event + 
h.setEventMap( new EventExtractor( h, ref, refLoc, "HC" + hapNumber++ ) ); + if( ! in_GGA_mode ) { + startPosKeySet.addAll(h.getEventMap().getStartPositions()); + } + + if( DEBUG ) { + logger.info(h.toString()); + logger.info("> Cigar = " + h.getCigar()); + logger.info(">> Events = " + h.getEventMap()); + } + } + + cleanUpSymbolicUnassembledEvents( haplotypes ); + if ( !in_GGA_mode && samples.size() >= 10 ) { + // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure + mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc ); + cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events + } + + if ( in_GGA_mode ) { + for( final VariantContext compVC : activeAllelesToGenotype ) { + startPosKeySet.add( compVC.getStart() ); + } + } + + return startPosKeySet; + } + + /** + * Get the priority list (just the list of sources for these variant context) used to merge overlapping events into common reference view + * @param vcs a list of variant contexts + * @return the list of the sources of vcs in the same order + */ + private List makePriorityList(final List vcs) { + final List priorityList = new LinkedList(); + for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource()); + + return priorityList; + } + + private List getVCsAtThisLocation(final List haplotypes, + final int loc, + final List activeAllelesToGenotype) { + // the overlapping events to merge into a common reference view + final List eventsAtThisLoc = new ArrayList(); + + if( activeAllelesToGenotype.isEmpty() ) { + for( final Haplotype h : haplotypes ) { + final EventExtractor eventMap = h.getEventMap(); + final VariantContext vc = eventMap.get(loc); + if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { + eventsAtThisLoc.add(vc); + } + } + } else { // we are in GGA mode! 
+ int compCount = 0; + for( final VariantContext compVC : activeAllelesToGenotype ) { + if( compVC.getStart() == loc ) { + int alleleCount = 0; + for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { + List alleleSet = new ArrayList(2); + alleleSet.add(compVC.getReference()); + alleleSet.add(compAltAllele); + final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; + // check if this event is already in the list of events due to a repeat in the input alleles track + final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make(); + boolean alreadyExists = false; + for( final VariantContext eventToTest : eventsAtThisLoc ) { + if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) { + alreadyExists = true; + } + } + if( !alreadyExists ) { + eventsAtThisLoc.add(candidateEventToAdd); + } + alleleCount++; + } + } + compCount++; + } + } + + return eventsAtThisLoc; + } + /** * For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele * @param samples List of samples to genotype @@ -322,23 +358,6 @@ public class GenotypingEngine { return genotypes; } - private void validatePriorityList( final List priorityList, final List eventsAtThisLoc ) { - for( final VariantContext vc : eventsAtThisLoc ) { - if( !priorityList.contains(vc.getSource()) ) { - throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles."); - } - } - for( final String name : priorityList ) { - boolean found = false; - for( final VariantContext vc : eventsAtThisLoc ) { - if(vc.getSource().equals(name)) { found = true; break; } - } - if( !found ) { - throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. 
Something went wrong in the merging of alleles."); - } - } - } - private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, final Map perSampleReadMap, final Map> perSampleFilteredReadList, @@ -382,10 +401,10 @@ public class GenotypingEngine { protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final List haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { - for( final VariantContext vc : h.getEventMap().values() ) { + for( final VariantContext vc : h.getEventMap().getVariantContexts() ) { if( vc.isSymbolic() ) { for( final Haplotype h2 : haplotypes ) { - for( final VariantContext vc2 : h2.getEventMap().values() ) { + for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) { if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events haplotypesToRemove.add(h); break; @@ -512,11 +531,10 @@ public class GenotypingEngine { // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event for( final Haplotype h : haplotypes ) { - final Map eventMap = h.getEventMap(); - if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) { - eventMap.remove(thisStart); - eventMap.remove(nextStart); - eventMap.put(mergedVC.getStart(), mergedVC); + if( h.getEventMap().containsKey(thisStart) && h.getEventMap().containsKey(nextStart) ) { + h.getEventMap().remove(thisStart); + h.getEventMap().remove(nextStart); + h.getEventMap().put(mergedVC.getStart(), mergedVC); } } startPosKeySet.add(mergedVC.getStart()); @@ -697,92 +715,9 @@ public class GenotypingEngine { return eventAllelesForSample; } - protected static Map generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) { - final Map vcs = new 
LinkedHashMap(); - - int refPos = alignmentStartHapwrtRef; - if( refPos < 0 ) { return null; } // Protection against SW failures - int alignmentPos = 0; - - for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { - final CigarElement ce = cigar.getCigarElement(cigarIndex); - final int elementLength = ce.getLength(); - switch( ce.getOperator() ) { - case I: - { - if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig - final List insertionAlleles = new ArrayList(); - final int insertionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) ) { - insertionAlleles.add( Allele.create(refByte, true) ); - } - if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele - insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); - } else { - byte[] insertionBases = new byte[]{}; - insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base - insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength )); - if( BaseUtils.isAllRegularBases(insertionBases) ) { - insertionAlleles.add( Allele.create(insertionBases, false) ); - } - } - if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele - vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); - } - } - alignmentPos += elementLength; - break; - } - case S: - { - alignmentPos += elementLength; - break; - } - case D: - { - if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig - final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base - final List deletionAlleles = new ArrayList(); 
- final int deletionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { - deletionAlleles.add( Allele.create(deletionBases, true) ); - deletionAlleles.add( Allele.create(refByte, false) ); - vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); - } - } - refPos += elementLength; - break; - } - case M: - case EQ: - case X: - { - for( int iii = 0; iii < elementLength; iii++ ) { - final byte refByte = ref[refPos]; - final byte altByte = alignment[alignmentPos]; - if( refByte != altByte ) { // SNP! - if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { - final List snpAlleles = new ArrayList(); - snpAlleles.add( Allele.create( refByte, true ) ); - snpAlleles.add( Allele.create( altByte, false ) ); - vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); - } - } - refPos++; - alignmentPos++; - } - break; - } - case N: - case H: - case P: - default: - throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); - } - } - return vcs; + @Deprecated + protected static Map generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { + return new EventExtractor(haplotype, ref, refLoc, sourceNameToAdd); } protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index 03af9b59b..134863b8b 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -360,8 +360,8 @@ public class HaplotypeResolver extends RodWalker { } // order results by start position - final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype), 0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1)); - final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype), 0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2)); + final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1)); + final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2)); if ( source1Map.size() == 0 || source2Map.size() == 0 ) { // TODO -- handle errors appropriately logger.debug("No source alleles; aborting at " + refContext.getLocus()); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java index 2be42337d..9fb75463a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -199,7 +199,8 @@ public class GenotypingEngineUnitTest extends BaseTest { public Map calcAlignment() { final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap); - 
return GenotypingEngine.generateVCsFromAlignment( new Haplotype(hap), alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); + final Haplotype h = new Haplotype(hap, false, alignment.getAlignmentStart2wrt1(), alignment.getCigar()); + return GenotypingEngine.generateVCsFromAlignment( h, ref, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java new file mode 100644 index 000000000..c32cde641 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.haplotype; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; + +import java.util.*; + +/** + * Extract simple VariantContext events from a single haplotype + * + * User: depristo + * Date: 3/27/13 + * Time: 8:35 AM + */ +public class EventExtractor extends TreeMap { + private final static Logger logger = Logger.getLogger(EventExtractor.class); + private final static boolean mergeClumpedEvents = true; + protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3; + public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); + + public EventExtractor( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { + super(); + + processCigarForInitialEvents(haplotype, ref, refLoc, sourceNameToAdd); + if ( mergeClumpedEvents && getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) { + replaceClumpedEventsWithBlockSubstititions(haplotype, ref, refLoc); + } + } + + /** + * For testing. Let's you set up a explicit configuration without having to process a haplotype and reference + * @param stateForTesting + */ + protected EventExtractor(final Map stateForTesting) { + super(stateForTesting); + } + + /** + * For testing. 
Let's you set up a explicit configuration without having to process a haplotype and reference + * @param stateForTesting + */ + protected EventExtractor(final Collection stateForTesting) { + for ( final VariantContext vc : stateForTesting ) + addVC(vc); + } + + protected void processCigarForInitialEvents(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) { + final Cigar cigar = haplotype.getCigar(); + final byte[] alignment = haplotype.getBases(); + + int refPos = haplotype.getAlignmentStartHapwrtRef(); + if( refPos < 0 ) { + return; + } // Protection against SW failures + + int alignmentPos = 0; + + for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { + final CigarElement ce = cigar.getCigarElement(cigarIndex); + final int elementLength = ce.getLength(); + switch( ce.getOperator() ) { + case I: + { + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final List insertionAlleles = new ArrayList(); + final int insertionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) ) { + insertionAlleles.add( Allele.create(refByte, true) ); + } + if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele + insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); + } else { + byte[] insertionBases = new byte[]{}; + insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]); // add the padding base + insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange(alignment, alignmentPos, alignmentPos + elementLength)); + if( BaseUtils.isAllRegularBases(insertionBases) ) { + insertionAlleles.add( Allele.create(insertionBases, false) ); + } + } + if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele + addVC(new 
VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); + } + } + alignmentPos += elementLength; + break; + } + case S: + { + alignmentPos += elementLength; + break; + } + case D: + { + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base + final List deletionAlleles = new ArrayList(); + final int deletionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { + deletionAlleles.add( Allele.create(deletionBases, true) ); + deletionAlleles.add( Allele.create(refByte, false) ); + addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); + } + } + refPos += elementLength; + break; + } + case M: + case EQ: + case X: + { + for( int iii = 0; iii < elementLength; iii++ ) { + final byte refByte = ref[refPos]; + final byte altByte = alignment[alignmentPos]; + if( refByte != altByte ) { // SNP! 
+ if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { + final List snpAlleles = new ArrayList(); + snpAlleles.add( Allele.create( refByte, true ) ); + snpAlleles.add( Allele.create( altByte, false ) ); + addVC(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); + } + } + refPos++; + alignmentPos++; + } + break; + } + case N: + case H: + case P: + default: + throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); + } + } + } + + private void addVC(final VariantContext vc) { + addVC(vc, true); + } + + private void addVC(final VariantContext vc, final boolean merge) { + if ( containsKey(vc.getStart()) ) { + if ( merge ) { + final VariantContext prev = get(vc.getStart()); + put(vc.getStart(), makeBlock(prev, vc)); + } else { + throw new IllegalStateException("Will not merge previously bound variant contexts as merge is false at " + vc); + } + } else + put(vc.getStart(), vc); + } + + private VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) { + if ( ! vc1.isSNP() ) throw new IllegalArgumentException("vc1 must be a snp"); + + Allele ref, alt; + final VariantContextBuilder b = new VariantContextBuilder(vc1); + if ( vc1.getReference().equals(vc2.getReference()) ) { + // we've got an insertion, so we just update the alt to have the prev alt + ref = vc1.getReference(); + alt = Allele.create(vc1.getAlternateAllele(0).getDisplayString() + vc2.getAlternateAllele(0).getDisplayString().substring(1), false); + } else { + // we're dealing with a deletion, so we patch the ref + ref = vc2.getReference(); + alt = vc1.getAlternateAllele(0); + b.stop(vc2.getEnd()); + } + + return b.alleles(Arrays.asList(ref, alt)).make(); + } + + // TODO -- warning this is an O(N^3) algorithm because I'm just lazy. 
If it's valuable we need to reengineer it + @Requires("getNumberOfEvents() > 0") + protected void replaceClumpedEventsWithBlockSubstititions(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) { + int lastStart = -1; + for ( boolean foundOne = true; foundOne; ) { + foundOne = false; + for ( final VariantContext vc : getVariantContexts() ) { + if ( vc.getStart() > lastStart ) { + lastStart = vc.getStart(); + final List neighborhood = getNeighborhood(vc, 10); + if ( updateToBlockSubstitutionIfBetter(neighborhood, haplotype, ref, refLoc) ) { + foundOne = true; + break; + } + } + } + } + } + + protected boolean updateToBlockSubstitutionIfBetter(final List neighbors, final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) { + if (neighbors.size() < MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) + return false; + // TODO -- need more tests to decide if this is really so good + + final VariantContext first = neighbors.get(0); + final int refStartOffset = first.getStart() - refLoc.getStart(); + final int refEndOffset = neighbors.get(neighbors.size() - 1).getEnd() - refLoc.getStart(); + + final byte[] refBases = Arrays.copyOfRange(ref, refStartOffset, refEndOffset + 1); + final byte[] hapBases = AlignmentUtils.getBasesCoveringRefInterval(refStartOffset, refEndOffset, haplotype.getBases(), haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar()); + + final VariantContextBuilder builder = new VariantContextBuilder(first); + builder.stop(first.getStart() + refBases.length - 1); + builder.alleles(Arrays.asList(Allele.create(refBases, true), Allele.create(hapBases))); + final VariantContext block = builder.make(); + + // remove all merged events + for ( final VariantContext merged : neighbors ) { + if ( remove(merged.getStart()) == null ) + throw new IllegalArgumentException("Expected to remove variant context from the event map but remove said there wasn't any element there: " + merged); + } + + // note must be after we remove the 
previous events as the treeset only allows one key per start + logger.info("Transforming into block substitution at " + block); + addVC(block, false); + + return true; + } + + /** + * Get all of the variant contexts starting at leftMost that are within maxBP of each other + * + * @param leftMost the left most (smallest position) variant context that will start the neighborhood + * @param maxBPBetweenEvents the maximum distance in BP between the end of one event the start of the next + * to be included the the resulting list + * @return a list that contains at least one element (leftMost) + */ + @Requires({"leftMost != null", "maxBPBetweenEvents >= 0"}) + @Ensures({"result != null", "! result.isEmpty()"}) + protected List getNeighborhood(final VariantContext leftMost, final int maxBPBetweenEvents) { + final List neighbors = new LinkedList(); + + VariantContext left = leftMost; + for ( final VariantContext vc : getVariantContexts() ) { + if ( vc.getStart() < leftMost.getStart() ) + continue; + + if ( vc.getStart() - left.getEnd() < maxBPBetweenEvents ) { + // this vc is within max distance to the end of the left event, so accumulate it + neighbors.add(vc); + left = vc; + } + } + + return neighbors; + } + + public Set getStartPositions() { + return keySet(); + } + + public Collection getVariantContexts() { + return values(); + } + + public int getNumberOfEvents() { + return size(); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder("EventExtractor{"); + for ( final VariantContext vc : getVariantContexts() ) + b.append(String.format("%s:%d-%d %s,", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles())); + b.append("}"); + return b.toString(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java index 6dc223616..2e95fb03a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -43,7 +43,7 @@ import java.util.*; public class Haplotype extends Allele { private GenomeLoc genomeLocation = null; - private Map eventMap = null; + private EventExtractor eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; private Event artificialEvent = null; @@ -63,6 +63,12 @@ public class Haplotype extends Allele { this(bases, false); } + public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) { + this(bases, isRef); + this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; + this.cigar = cigar; + } + /** * Copy constructor. Note the ref state of the provided allele is ignored! * @@ -92,11 +98,11 @@ public class Haplotype extends Allele { return Arrays.hashCode(getBases()); } - public Map getEventMap() { + public EventExtractor getEventMap() { return eventMap; } - public void setEventMap( final Map eventMap ) { + public void setEventMap( final EventExtractor eventMap ) { this.eventMap = eventMap; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 58f70d4b6..9b25b00c6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -48,6 +48,67 @@ public final class AlignmentUtils { // cannot be instantiated private AlignmentUtils() { } + /** + * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the + * alignment of bases to the reference (basesToRefCigar) and the start offset of the bases on the reference + * + * refStart and refEnd are 0 based offsets that we want to obtain. In the client code, if the reference + * bases start at position X and you want Y -> Z, refStart should be Y - X and refEnd should be Z - X. 
+ * + * @param bases + * @param refStart + * @param refEnd + * @param basesStartOnRef where does the bases array start w.r.t. the reference start? For example, bases[0] of + * could be at refStart == 0 if basesStartOnRef == 0, but it could just as easily be at + * 10 (meaning bases doesn't fully span the reference), which would be indicated by basesStartOnRef == 10. + * It's not trivial to eliminate this parameter because it's tied up with the cigar + * @param basesToRefCigar the cigar that maps the bases to the reference genome + * @return a non-null byte[] + */ + public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) { + if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd); + if ( basesStartOnRef < 0 ) throw new IllegalArgumentException("BasesStartOnRef must be >= 0 but got " + basesStartOnRef); + if ( bases == null ) throw new IllegalArgumentException("Bases cannot be null"); + if ( basesToRefCigar == null ) throw new IllegalArgumentException("basesToRefCigar cannot be null"); + if ( bases.length != basesToRefCigar.getReadLength() ) throw new IllegalArgumentException("Mismatch in length between reference bases " + bases.length + " and cigar length " + basesToRefCigar); + + int refPos = basesStartOnRef; + int basesPos = 0; + + int basesStart = -1; + int basesStop = -1; + boolean done = false; + + for ( int iii = 0; ! 
done && iii < basesToRefCigar.numCigarElements(); iii++ ) { + final CigarElement ce = basesToRefCigar.getCigarElement(iii); + final int bInc, rInc; + switch ( ce.getOperator() ) { + case I: bInc = 1; rInc = 0; break; + case M: case X: case EQ: bInc = rInc = 1; break; + case D: bInc = 0; rInc = 1; break; + default: + throw new IllegalStateException("Unsupported operator " + ce); + } + + for ( int i = 0; i < ce.getLength(); i++ ) { + if ( refPos == refStart ) + basesStart = basesPos; + if ( refPos == refEnd ) { + basesStop = basesPos; + done = true; + break; + } + refPos += rInc; + basesPos += bInc; + } + } + + if ( basesStart == -1 || basesStop == -1 ) + throw new IllegalStateException("Never found start " + basesStart + " or stop " + basesStop + " given cigar " + basesToRefCigar); + + return Arrays.copyOfRange(bases, basesStart, basesStop + 1); + } + /** * Get the number of bases at which refSeq and readSeq differ, given their alignment * diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 0bd30c3a4..4565402b9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -1424,4 +1424,21 @@ public class GATKVariantContextUtils { return result; } + + /** + * Are vc1 and 2 equal including their position and alleles? 
+ * @param vc1 non-null VariantContext + * @param vc2 non-null VariantContext + * @return true if vc1 and vc2 are equal, false otherwise + */ + public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { + if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null"); + if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null"); + + if ( vc1.getStart() != vc2.getStart() ) return false; + if ( vc1.getEnd() != vc2.getEnd() ) return false; + if ( ! vc1.getChr().equals(vc2.getChr())) return false; + if ( ! vc1.getAlleles().equals(vc2.getAlleles()) ) return false; + return true; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java new file mode 100644 index 000000000..480f82a46 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.haplotype; + +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class EventExtractorUnitTest extends BaseTest { + private final static String CHR = "20"; + private final static String NAME = "foo"; + + @DataProvider(name = "MyDataProvider") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList(); + + final List SNP_ALLELES = Arrays.asList("A", "C"); + final List INS_ALLELES = Arrays.asList("A", "ACGTGA"); + final List DEL_ALLELES = Arrays.asList("ACGTA", "C"); + final List> allAlleles = Arrays.asList(SNP_ALLELES, INS_ALLELES, DEL_ALLELES); + for ( final int leftNotClump : Arrays.asList(-1, 3) ) { + for ( final int middleNotClump : Arrays.asList(-1, 10, 500) ) { + for ( final int rightNotClump : Arrays.asList(-1, 1000) ) { + for ( final int nClumped : Arrays.asList(3, 4) ) { + for ( final List> alleles : Utils.makePermutations(allAlleles, nClumped, true)) { + final List allVCS = new LinkedList(); + + if ( leftNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, leftNotClump, SNP_ALLELES)); + if ( middleNotClump != -1 
) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, middleNotClump, SNP_ALLELES)); + if ( rightNotClump != -1 ) allVCS.add(GATKVariantContextUtils.makeFromAlleles(NAME, CHR, rightNotClump, SNP_ALLELES)); + + int clumpStart = 50; + final List vcs = new LinkedList(); + for ( final List myAlleles : alleles ) { + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles(NAME, CHR, clumpStart, myAlleles); + clumpStart = vc.getEnd() + 3; + vcs.add(vc); + } + + tests.add(new Object[]{new EventExtractor(new LinkedList(allVCS)), Collections.emptyList()}); + allVCS.addAll(vcs); + tests.add(new Object[]{new EventExtractor(allVCS), vcs}); + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "MyDataProvider", enabled = true) // TODO == reenable + public void testGetNeighborhood(final EventExtractor eventExtractor, final List expectedNeighbors) { + final VariantContext leftOfNeighors = expectedNeighbors.isEmpty() ? null : expectedNeighbors.get(0); + + for ( final VariantContext vc : eventExtractor.getVariantContexts() ) { + final List n = eventExtractor.getNeighborhood(vc, 5); + if ( leftOfNeighors == vc ) + Assert.assertEquals(n, expectedNeighbors); + else if ( ! 
expectedNeighbors.contains(vc) ) + Assert.assertEquals(n, Collections.singletonList(vc), "Should only contain the original vc but " + n); + } + } + + @DataProvider(name = "BlockSubstitutionsData") + public Object[][] makeBlockSubstitutionsData() { + List tests = new ArrayList(); + + for ( int size = EventExtractor.MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION; size < 10; size++ ) { + final String ref = Utils.dupString("A", size); + final String alt = Utils.dupString("C", size); + tests.add(new Object[]{ref, alt, size + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList(ref, alt))}); + } + + tests.add(new Object[]{"AAAAAA", "GAGAGA", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAA", "GAGAG"))}); + tests.add(new Object[]{"AAAAAA", "GAGAGG", "6M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAAAA", "GAGAGG"))}); + + for ( int len = 0; len < 10; len++ ) { + final String s = len == 0 ? "" : Utils.dupString("A", len); + tests.add(new Object[]{s + "AACCCCAA", s + "GAAG", len + 2 + "M4D2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AACCCCAA", "GAAG"))}); + tests.add(new Object[]{s + "AAAA", s + "GACCCCAG", len + 2 + "M4I2M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1 + len, Arrays.asList("AAAA", "GACCCCAG"))}); + + tests.add(new Object[]{"AACCCCAA" + s, "GAAG" + s, "2M4D" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AACCCCAA", "GAAG"))}); + tests.add(new Object[]{"AAAA" + s, "GACCCCAG" + s, "2M4I" + (len + 2) + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList("AAAA", "GACCCCAG"))}); + } + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "BlockSubstitutionsData") + public void testBlockSubstitutionsData(final String refBases, final String haplotypeBases, final String cigar, final VariantContext 
expectedBlock) { + final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); + final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME); + Assert.assertEquals(ee.getNumberOfEvents(), 1); + final VariantContext actual = ee.getVariantContexts().iterator().next(); + Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual); + } + + @DataProvider(name = "AdjacentSNPIndelTest") + public Object[][] makeAdjacentSNPIndelTest() { + List tests = new ArrayList(); + + tests.add(new Object[]{"TT", "GCT", "1M1I1M", Arrays.asList(Arrays.asList("T", "GC"))}); + tests.add(new Object[]{"GCT", "TT", "1M1D", Arrays.asList(Arrays.asList("GC", "T"))}); + tests.add(new Object[]{"TT", "GCCT", "1M2I1M", Arrays.asList(Arrays.asList("T", "GCC"))}); + tests.add(new Object[]{"GCCT", "TT", "1M2D", Arrays.asList(Arrays.asList("GCC", "T"))}); + tests.add(new Object[]{"AAGCCT", "AATT", "3M2D", Arrays.asList(Arrays.asList("GCC", "T"))}); + tests.add(new Object[]{"AAGCCT", "GATT", "3M2D", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("GCC", "T"))}); + tests.add(new Object[]{"AAAAA", "AGACA", "5M", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("A", "C"))}); + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "AdjacentSNPIndelTest", enabled = true) + public void testAdjacentSNPIndelTest(final String refBases, final String haplotypeBases, final String cigar, final List> expectedAlleles) { + final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); + final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME); + 
Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size()); + final List actuals = new ArrayList(ee.getVariantContexts()); + for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) { + final VariantContext actual = actuals.get(i); + Assert.assertEquals(actual.getReference().getDisplayString(), expectedAlleles.get(i).get(0)); + Assert.assertEquals(actual.getAlternateAllele(0).getDisplayString(), expectedAlleles.get(i).get(1)); + } + } +} From 67cd407854d5d0acaa06861745c64e9ba65e8d82 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 27 Mar 2013 09:41:04 -0400 Subject: [PATCH 141/226] The GenotypingEngine now uses the samples from the mapping of Samples -> PerReadAllele likelihoods instead of passing around a redundant list of samples --- .../haplotypecaller/GenotypingEngine.java | 28 ++++++++----------- .../haplotypecaller/HaplotypeCaller.java | 1 - 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 8e76b6ea6..59cadbdf9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -119,9 +119,10 @@ public class GenotypingEngine { * Main entry point of class - given a particular set of haplotypes, samples and reference context, compute * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling * + * The list of samples we're working with is obtained from the haplotypeReadMap + * * @param UG_engine UG Engine with basic input parameters * @param haplotypes Haplotypes to assign likelihoods to - * @param samples Samples to genotype * @param haplotypeReadMap Map from reads->(haplotypes,likelihoods) * @param perSampleFilteredReadList * @param ref Reference bytes at 
active region @@ -136,7 +137,6 @@ public class GenotypingEngine { // TODO - can this be refactored? this is hard to follow! public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, final List haplotypes, - final List samples, final Map haplotypeReadMap, final Map> perSampleFilteredReadList, final byte[] ref, @@ -147,7 +147,6 @@ public class GenotypingEngine { // sanity check input arguments if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); - if (samples == null || samples.isEmpty()) throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples); if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); @@ -157,7 +156,7 @@ public class GenotypingEngine { // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference // that carry events among the haplotypes - final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, samples, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); + final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); // Walk along each position in the key set and create each event to be outputted final Set calledHaplotypes = new HashSet(); @@ -195,7 +194,7 @@ public class GenotypingEngine { final 
Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); - final GenotypesContext genotypes = calculateGLsForThisEvent( samples, alleleReadMap, mergedVC ); + final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL); if( call != null ) { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : @@ -224,7 +223,6 @@ public class GenotypingEngine { * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts * * @param haplotypes the list of haplotypes we're working with - * @param samples the samples we're working with * @param haplotypeReadMap map from samples -> the per read allele likelihoods * @param ref the reference bases (over the same interval as the haplotypes) * @param refLoc the span of the reference bases @@ -232,7 +230,6 @@ public class GenotypingEngine { * @return */ private TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, - final List samples, final Map haplotypeReadMap, final byte[] ref, final GenomeLoc refLoc, @@ -259,9 +256,9 @@ public class GenotypingEngine { } cleanUpSymbolicUnassembledEvents( haplotypes ); - if ( !in_GGA_mode && samples.size() >= 10 ) { + if ( !in_GGA_mode && haplotypeReadMap.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure - mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc ); + mergeConsecutiveEventsBasedOnLD( haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc ); cleanUpSymbolicUnassembledEvents( haplotypes ); // 
the newly created merged events could be overlapping the unassembled events } @@ -282,7 +279,6 @@ public class GenotypingEngine { private List makePriorityList(final List vcs) { final List priorityList = new LinkedList(); for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource()); - return priorityList; } @@ -333,17 +329,16 @@ public class GenotypingEngine { /** * For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele - * @param samples List of samples to genotype * @param alleleReadMap Allele map describing mapping from reads to alleles and corresponding likelihoods * @param mergedVC Input VC with event to genotype * @return GenotypesContext object wrapping genotype objects with PLs */ - @Requires({"samples != null","alleleReadMap!= null", "mergedVC != null"}) + @Requires({"alleleReadMap!= null", "mergedVC != null"}) @Ensures("result != null") - private GenotypesContext calculateGLsForThisEvent( final List samples, final Map alleleReadMap, final VariantContext mergedVC ) { - final GenotypesContext genotypes = GenotypesContext.create(samples.size()); + private GenotypesContext calculateGLsForThisEvent( final Map alleleReadMap, final VariantContext mergedVC ) { + final GenotypesContext genotypes = GenotypesContext.create(alleleReadMap.size()); // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - for( final String sample : samples ) { + for( final String sample : alleleReadMap.keySet() ) { final int numHaplotypes = mergedVC.getAlleles().size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles()); @@ -448,14 +443,12 @@ public class GenotypingEngine { /** * TODO - comment me, 
clean me, refactor me! * @param haplotypes - * @param samples * @param haplotypeReadMap * @param startPosKeySet * @param ref * @param refLoc */ protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, - final List samples, final Map haplotypeReadMap, final TreeSet startPosKeySet, final byte[] ref, @@ -465,6 +458,7 @@ public class GenotypingEngine { final double MERGE_EVENTS_R2_THRESHOLD = 0.95; if( startPosKeySet.size() <= 1 ) { return; } + final Set samples = haplotypeReadMap.keySet(); boolean mapWasUpdated = true; while( mapWasUpdated ) { mapWasUpdated = false; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index d77caa2a2..a6b19826b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -559,7 +559,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, bestHaplotypes, - samplesList, stratifiedReadMap, perSampleFilteredReadList, fullReferenceWithPadding, From 8656bd5e29980029e997e0339e732e6379034f75 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 28 Mar 2013 09:40:08 -0400 Subject: [PATCH 142/226] Haplotype now consolidates cigars in setCigar -- This fixes edge base bugs where non-consolidated cigars are causing problems in users of the Haplotype object. 
Input arguments are now checks (let's see if we blow up) --- .../sting/utils/haplotype/Haplotype.java | 43 ++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java index 2e95fb03a..a94c08198 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -36,10 +36,12 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; -import java.util.*; +import java.util.Arrays; +import java.util.Comparator; +import java.util.LinkedHashMap; +import java.util.List; public class Haplotype extends Allele { private GenomeLoc genomeLocation = null; @@ -52,21 +54,36 @@ public class Haplotype extends Allele { /** * Main constructor * - * @param bases bases - * @param isRef is reference allele? + * @param bases a non-null array of bases + * @param isRef is this the reference haplotype? */ public Haplotype( final byte[] bases, final boolean isRef ) { super(bases.clone(), isRef); } + /** + * Create a new non-ref haplotype + * + * @param bases a non-null array of bases + */ public Haplotype( final byte[] bases ) { this(bases, false); } + /** + * Create a new haplotype with bases + * + * Requires bases.length == cigar.getReadLength() + * + * @param bases a non-null array of bases + * @param isRef is this the reference haplotype? + * @param alignmentStartHapwrtRef offset of this haplotype w.r.t. 
the reference + * @param cigar the cigar that maps this haplotype to the reference sequence + */ public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) { this(bases, isRef); this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; - this.cigar = cigar; + setCigar(cigar); } /** @@ -127,6 +144,11 @@ public class Haplotype extends Allele { this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; } + /** + * Get the cigar for this haplotype. Note that cigar is guarenteed to be consolidated + * in that multiple adjacent equal operates will have been merged + * @return the cigar of this haplotype + */ public Cigar getCigar() { return cigar; } @@ -144,8 +166,17 @@ public class Haplotype extends Allele { return AlignmentUtils.consolidateCigar(extendedHaplotypeCigar); } + /** + * Set the cigar of this haplotype to cigar. + * + * Note that this function consolidates the cigar, so that 1M1M1I1M1M => 2M1I2M + * + * @param cigar a cigar whose readLength == length() + */ public void setCigar( final Cigar cigar ) { - this.cigar = cigar; + this.cigar = AlignmentUtils.consolidateCigar(cigar); + if ( this.cigar.getReadLength() != length() ) + throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength()); } public boolean isArtificialHaplotype() { From 167cd49e710f8a37e37ba28528b93722f50e6253 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 28 Mar 2013 17:35:00 -0400 Subject: [PATCH 143/226] Added -forceActive argument to ActiveRegionWalkers -- Causes the ART tool to treat all bases as active. 
Useful for debugging --- .../sting/gatk/traversals/TraverseActiveRegions.java | 1 + .../sting/gatk/walkers/ActiveRegionWalker.java | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 7b831db32..908755a24 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -487,6 +487,7 @@ public class TraverseActiveRegions extends TraversalEngine extends Walker Date: Thu, 28 Mar 2013 18:17:27 -0400 Subject: [PATCH 144/226] LD-based merging algorithm for nearby events in the haplotypes -- Moved R^2 LD haplotype merging system to the utils.haplotype package -- New LD merging only enabled with HC argument. -- EventExtractor and EventExtractorUnitTest refactors so we can test the block substitution code without having to enabled it via a static variable -- A few misc. bug fixes in LDMerger itself -- Refactoring of Haplotype event splitting and merging code -- Renamed EventExtractor to EventMap -- EventMap has a static method that computes the event maps among n haplotypes -- Refactor Haplotype score and base comparators into their own classes and unit tested them -- Refactored R^2 based LD merging code into its own class HaplotypeR2Calculator and unit tested much of it. -- LDMerger now uses the HaplotypeR2Calculator, which cleans up the code a bunch and allowed me to easily test that code with a MockHaplotypeR2Calculator. For those who haven't seen this testing idiom, have a look, and very useful -- New algorithm uses a likelihood-ratio test to compute the probability that only the phased haplotypes exist in the population. 
-- Fixed fundamental bug in the way the previous R^2 implementation worked -- Optimizations for HaplotypeLDCalculator: only compute the per sample per haplotype summed likelihoods once, regardless of how many calls there are -- Previous version would enter infinite loop if it merged two events but the second event had other low likelihood events in other haplotypes that didn't get removed. Now when events are removed they are removed from all event maps, regardless of whether the haplotypes carry both events -- Bugfixes for EventMap in the HaplotypeCaller as well. Previous version was overly restrictive, requiring that the first event to make into a block substitution was a snp. In some cases we need to merge an insertion with a deletion, such as when the cigar is 10M2I3D4M. The new code supports this. UnitTested and documented as well. LDMerger handles case where merging two alleles results in a no-op event. Merging CA/C + A/AA -> CAA/CAA -> no op. Handles this case by removing the two events. UnitTested -- Turn off debugging output for the LDMerger in the HaplotypeCaller unless -debug was enabled -- This new version does a much more specific test (that's actually right). Here's the new algorithm: * Compute probability that two variants are in phase with each other and that no * compound hets exist in the population. * * Implemented as a likelihood ratio test of the hypothesis: * * x11 and x22 are the only haplotypes in the populations * * vs. * * all four haplotype combinations (x11, x12, x21, and x22) all exist in the population. * * Now, since we have to have both variants in the population, we exclude the x11 & x11 state. So the * p of having just x11 and x22 is P(x11 & x22) + p(x22 & x22). 
* * Alternatively, we might have any configuration that gives us both 1 and 2 alts, which are: * * - P(x11 & x12 & x21) -- we have hom-ref and both hets * - P(x22 & x12 & x21) -- we have hom-alt and both hets * - P(x22 & x12) -- one haplotype is 22 and the other is het 12 * - P(x22 & x21) -- one haplotype is 22 and the other is het 21 --- .../haplotypecaller/GenotypingEngine.java | 203 ++--------- .../haplotypecaller/HaplotypeCaller.java | 13 +- .../LikelihoodCalculationEngine.java | 14 +- .../haplotype/HaplotypeLDCalculator.java | 194 ++++++++++ .../sting/utils/haplotype/LDMerger.java | 303 ++++++++++++++++ .../GenotypingEngineUnitTest.java | 142 -------- .../HaplotypeBaseComparatorUnitTest.java | 77 ++++ .../HaplotypeLDCalculatorUnitTest.java | 118 +++++++ .../HaplotypeScoreComparatorUnitTest.java | 76 ++++ .../utils/haplotype/LDMergerUnitTest.java | 334 ++++++++++++++++++ .../{EventExtractor.java => EventMap.java} | 221 ++++++++---- .../sting/utils/haplotype/Haplotype.java | 38 +- .../haplotype/HaplotypeBaseComparator.java | 42 +++ .../haplotype/HaplotypeScoreComparator.java | 39 ++ ...torUnitTest.java => EventMapUnitTest.java} | 112 +++--- 15 files changed, 1451 insertions(+), 475 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java create mode 100644 protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java create mode 100644 protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java rename public/java/src/org/broadinstitute/sting/utils/haplotype/{EventExtractor.java => EventMap.java} (58%) create mode 100644 
public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java rename public/java/test/org/broadinstitute/sting/utils/haplotype/{EventExtractorUnitTest.java => EventMapUnitTest.java} (61%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 59cadbdf9..7cdc57464 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -48,18 +48,18 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.EventExtractor; +import org.broadinstitute.sting.utils.haplotype.EventMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotype.LDMerger; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import 
org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; @@ -74,12 +74,16 @@ public class GenotypingEngine { private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final VariantAnnotatorEngine annotationEngine; + private final LDMerger ldMerger; - public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ) { + public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, + final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, + final LDMerger ldMerger) { this.DEBUG = DEBUG; this.annotationEngine = annotationEngine; this.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; noCall.add(Allele.NO_CALL); + this.ldMerger = ldMerger; } /** @@ -235,31 +239,18 @@ public class GenotypingEngine { final GenomeLoc refLoc, final List activeAllelesToGenotype) { final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); - int hapNumber = 0; // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); + final TreeSet startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); - if( DEBUG ) logger.info("=== Best Haplotypes ==="); - for( final Haplotype h : haplotypes ) { - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( new EventExtractor( h, ref, refLoc, "HC" + hapNumber++ ) ); - if( ! 
in_GGA_mode ) { - startPosKeySet.addAll(h.getEventMap().getStartPositions()); - } - - if( DEBUG ) { - logger.info(h.toString()); - logger.info("> Cigar = " + h.getCigar()); - logger.info(">> Events = " + h.getEventMap()); - } - } + if ( in_GGA_mode ) startPosKeySet.clear(); cleanUpSymbolicUnassembledEvents( haplotypes ); - if ( !in_GGA_mode && haplotypeReadMap.size() >= 10 ) { + if ( !in_GGA_mode ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure - mergeConsecutiveEventsBasedOnLD( haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc ); - cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events + final boolean mergedAnything = ldMerger.mergeConsecutiveEventsBasedOnLD( haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc ); + if ( mergedAnything ) + cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events } if ( in_GGA_mode ) { @@ -290,7 +281,7 @@ public class GenotypingEngine { if( activeAllelesToGenotype.isEmpty() ) { for( final Haplotype h : haplotypes ) { - final EventExtractor eventMap = h.getEventMap(); + final EventMap eventMap = h.getEventMap(); final VariantContext vc = eventMap.get(loc); if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { eventsAtThisLoc.add(vc); @@ -341,14 +332,14 @@ public class GenotypingEngine { for( final String sample : alleleReadMap.keySet() ) { final int numHaplotypes = mergedVC.getAlleles().size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles()); + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), 
true); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC } } - genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); + genotypes.add(new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make()); } return genotypes; } @@ -440,156 +431,6 @@ public class GenotypingEngine { return alleleReadMap; } - /** - * TODO - comment me, clean me, refactor me! - * @param haplotypes - * @param haplotypeReadMap - * @param startPosKeySet - * @param ref - * @param refLoc - */ - protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, - final Map haplotypeReadMap, - final TreeSet startPosKeySet, - final byte[] ref, - final GenomeLoc refLoc ) { - - final int MAX_SIZE_TO_COMBINE = 15; - final double MERGE_EVENTS_R2_THRESHOLD = 0.95; - if( startPosKeySet.size() <= 1 ) { return; } - - final Set samples = haplotypeReadMap.keySet(); - boolean mapWasUpdated = true; - while( mapWasUpdated ) { - mapWasUpdated = false; - - // loop over the set of start locations and consider pairs that start near each other - final Iterator iter = startPosKeySet.iterator(); - int thisStart = iter.next(); - while( iter.hasNext() ) { - final int nextStart = iter.next(); - if( nextStart - thisStart < MAX_SIZE_TO_COMBINE) { - boolean isBiallelic = true; - VariantContext thisVC = null; - VariantContext nextVC = null; - double x11 = Double.NEGATIVE_INFINITY; - double x12 = Double.NEGATIVE_INFINITY; - double x21 = Double.NEGATIVE_INFINITY; - double x22 = Double.NEGATIVE_INFINITY; - - for( final Haplotype h : haplotypes ) { - // only make complex substitutions out of consecutive biallelic sites - final VariantContext thisHapVC = h.getEventMap().get(thisStart); - if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype - if( thisVC == null ) { 
- thisVC = thisHapVC; - } else if( !thisHapVC.hasSameAllelesAs( thisVC ) ) { - isBiallelic = false; - break; - } - } - final VariantContext nextHapVC = h.getEventMap().get(nextStart); - if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype - if( nextVC == null ) { - nextVC = nextHapVC; - } else if( !nextHapVC.hasSameAllelesAs( nextVC ) ) { - isBiallelic = false; - break; - } - } - // count up the co-occurrences of the events for the R^2 calculation - for( final String sample : samples ) { - final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( Collections.singleton(sample), haplotypeReadMap, Collections.singletonList(Allele.create(h, true)) )[0][0]; - if( thisHapVC == null ) { - if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } - else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } - } else { - if( nextHapVC == null ) { x21 = MathUtils.approximateLog10SumLog10(x21, haplotypeLikelihood); } - else { x22 = MathUtils.approximateLog10SumLog10(x22, haplotypeLikelihood); } - } - } - } - if( thisVC == null || nextVC == null ) { - continue; - } - if( isBiallelic ) { - final double R2 = calculateR2LD( Math.pow(10.0, x11), Math.pow(10.0, x12), Math.pow(10.0, x21), Math.pow(10.0, x22) ); - if( DEBUG ) { - logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2)); - logger.info("-- " + thisVC); - logger.info("-- " + nextVC); - } - if( R2 > MERGE_EVENTS_R2_THRESHOLD ) { - - final VariantContext mergedVC = createMergedVariantContext(thisVC, nextVC, ref, refLoc); - - // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event - for( final Haplotype h : haplotypes ) { - if( h.getEventMap().containsKey(thisStart) && h.getEventMap().containsKey(nextStart) ) { - h.getEventMap().remove(thisStart); - 
h.getEventMap().remove(nextStart); - h.getEventMap().put(mergedVC.getStart(), mergedVC); - } - } - startPosKeySet.add(mergedVC.getStart()); - boolean containsStart = false; - boolean containsNext = false; - for( final Haplotype h : haplotypes ) { - final Map eventMap = h.getEventMap(); - if( eventMap.containsKey(thisStart) ) { containsStart = true; } - if( eventMap.containsKey(nextStart) ) { containsNext = true; } - } - if(!containsStart) { startPosKeySet.remove(thisStart); } - if(!containsNext) { startPosKeySet.remove(nextStart); } - - if( DEBUG ) { logger.info("====> " + mergedVC); } - mapWasUpdated = true; - break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events - } - } - } - thisStart = nextStart; - } - } - } - - // BUGBUG: make this merge function more general - protected static VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) { - final int thisStart = thisVC.getStart(); - final int nextStart = nextVC.getStart(); - byte[] refBases = new byte[]{}; - byte[] altBases = new byte[]{}; - refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases()); - altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases()); - int locus; - for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) { - final byte refByte = ref[locus - refLoc.getStart()]; - refBases = ArrayUtils.add(refBases, refByte); - altBases = ArrayUtils.add(altBases, refByte); - } - refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 
1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel - altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases()); - - int iii = 0; - if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele - while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; } - } - final List mergedAlleles = new ArrayList(); - mergedAlleles.add( Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ) ); - mergedAlleles.add( Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ) ); - return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), mergedAlleles).make(); - } - - protected static double calculateR2LD( final double x11, final double x12, final double x21, final double x22 ) { - final double total = x11 + x12 + x21 + x22; - final double pa1b1 = x11 / total; - final double pa1b2 = x12 / total; - final double pa2b1 = x21 / total; - final double pa1 = pa1b1 + pa1b2; - final double pb1 = pa1b1 + pa2b1; - return ((pa1b1 - pa1*pb1) * (pa1b1 - pa1*pb1)) / ( pa1 * (1.0 - pa1) * pb1 * (1.0 - pb1) ); - } - protected static Map> createAlleleMapper( final Map mergeMap, final Map> eventMap ) { final Map> alleleMapper = new LinkedHashMap>(); for( final Map.Entry entry : mergeMap.entrySet() ) { @@ -616,8 +457,8 @@ public class GenotypingEngine { alleles.add(h.getArtificialRefAllele()); alleles.add(h.getArtificialAltAllele()); final Event artificialVC = new Event( (new VariantContextBuilder()).source("artificialHaplotype") - .alleles(alleles) - .loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + h.getArtificialRefAllele().length() - 1).make() ); + .alleles(alleles) + .loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + h.getArtificialRefAllele().length() - 1).make() ); if( 
eventMapper.containsKey(artificialVC) ) { eventMapper.get(artificialVC).add(h); } @@ -711,7 +552,7 @@ public class GenotypingEngine { @Deprecated protected static Map generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { - return new EventExtractor(haplotype, ref, refLoc, sourceNameToAdd); + return new EventMap(haplotype, ref, refLoc, sourceNameToAdd); } protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index a6b19826b..53fffec61 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -78,6 +78,8 @@ import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotype.HaplotypeBaseComparator; +import org.broadinstitute.sting.utils.haplotype.LDMerger; import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; @@ -302,6 +304,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) protected boolean useLowQualityBasesForAssembly = false; + @Hidden + @Argument(fullName="useNewLDMerger", 
shortName="useNewLDMerger", doc="If specified, we will include low quality bases when doing the assembly", required = false) + protected boolean useNewLDMerger = false; + // the UG engines private UnifiedGenotyperEngine UG_engine = null; private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; @@ -412,7 +418,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); + + final LDMerger ldMerger = new LDMerger(DEBUG, useNewLDMerger ? 10 : 10, useNewLDMerger ? 1 : 10); + + genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, ldMerger ); if ( bamWriter != null ) haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); @@ -545,7 +554,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! 
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM - Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); + Collections.sort( haplotypes, new HaplotypeBaseComparator() ); if (dontGenotype) return 1; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index df1c9aabc..543b23d9c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -158,17 +158,17 @@ public class LikelihoodCalculationEngine { @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map stratifiedReadMap, - final List alleleOrdering ) { - final TreeSet sampleSet = new TreeSet(); - sampleSet.add(sample); - return computeDiploidHaplotypeLikelihoods(sampleSet, stratifiedReadMap, alleleOrdering); + final List alleleOrdering, + final boolean normalize ) { + return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize); } @Requires({"alleleOrdering.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final Map stratifiedReadMap, - final List alleleOrdering ) { + final List alleleOrdering, + final boolean normalize) { final int numHaplotypes = alleleOrdering.size(); final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; @@ -195,7 +195,7 @@ public class LikelihoodCalculationEngine { } // normalize the diploid likelihoods matrix - return 
normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + return normalize ? normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ) : haplotypeLikelihoodMatrix; } @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) @@ -230,7 +230,7 @@ public class LikelihoodCalculationEngine { final List haplotypesAsAlleles = new ArrayList(); for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h, true)); } - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles ); // all samples pooled together + final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles, true ); // all samples pooled together int hap1 = 0; int hap2 = 0; diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java b/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java new file mode 100644 index 000000000..4609c3209 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java @@ -0,0 +1,194 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.*; + +/** + * Computes the likelihood based probability that haplotypes for first and second variant contexts + * only appear in their fully linked form (x11 and x22) given a set of haplotypes where they might occur + * and read likelihoods per sample + * + * User: depristo + * Date: 3/29/13 + * Time: 9:23 AM + */ +public class HaplotypeLDCalculator { + private final List haplotypes; + private final Map haplotypeReadMap; + private List> haplotypeLikelihoodsPerSample = null; + + // linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] + private final double[] table = new double[4]; + + /** + * For testing + */ + protected HaplotypeLDCalculator() { + haplotypes = Collections.emptyList(); + haplotypeReadMap = Collections.emptyMap(); + } + + public HaplotypeLDCalculator(List 
haplotypes, Map haplotypeReadMap) { + this.haplotypes = haplotypes; + this.haplotypeReadMap = haplotypeReadMap; + } + + /** + * Construct the cached list of summed haplotype likelihoods per sample if it + * hasn't already been computed. This data structure is lazy created but only + * needs to be made once when we make 1 merge decision as the data doesn't change + * no matter how many calls to computeProbOfBeingPhased + */ + private void buildHaplotypeLikelihoodsPerSampleIfNecessary() { + if ( haplotypeLikelihoodsPerSample == null ) { + // do the lazy computation + final Set samples = haplotypeReadMap.keySet(); + haplotypeLikelihoodsPerSample = new LinkedList>(); + for( final String sample : samples ) { + final Map map = new HashMap(haplotypes.size()); + for( final Haplotype h : haplotypes ) { + // count up the co-occurrences of the events for the R^2 calculation + final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, haplotypeReadMap, Collections.singletonList(Allele.create(h, true)), false)[0][0]; + map.put(h, haplotypeLikelihood); + } + haplotypeLikelihoodsPerSample.add(map); + } + } + } + + /** + * Compute the likelihood based probability that that haplotypes for first and second are only x11 and x22 + * + * As opposed to the hypothesis that all four haplotypes (x11, x12, x21, and x22) exist in the population + * + * @param first a non-null VariantContext + * @param second a non-null VariantContext + * @return the probability that only x11 and x22 exist among the samples + */ + protected double computeProbOfBeingPhased(final VariantContext first, final VariantContext second) { + buildHaplotypeLikelihoodsPerSampleIfNecessary(); + + Arrays.fill(table, Double.NEGATIVE_INFINITY); + + for ( final Map entry : haplotypeLikelihoodsPerSample ) { + for ( final Map.Entry haplotypeLikelihood : entry.entrySet() ) { + final Haplotype h = haplotypeLikelihood.getKey(); + // count up the co-occurrences of the events for the 
R^2 calculation + final VariantContext thisHapVC = h.getEventMap().get(first.getStart()); + final VariantContext nextHapVC = h.getEventMap().get(second.getStart()); // TODO -- add function to take a VC + final int i = thisHapVC == null ? 0 : 1; + final int j = nextHapVC == null ? 0 : 1; + final int index = 2 * i + j; + table[index] = MathUtils.approximateLog10SumLog10(table[index], haplotypeLikelihood.getValue()); + } + } + + return pPhased(table); + } + + /** + * Compute probability that two variants are in phase with each other and that no + * compound hets exist in the population. + * + * Implemented as a likelihood ratio test of the hypothesis: + * + * x11 and x22 are the only haplotypes in the populations + * + * vs. + * + * all four haplotype combinations (x11, x12, x21, and x22) all exist in the population. + * + * Now, since we have to have both variants in the population, we exclude the x11 & x11 state. So the + * p of having just x11 and x22 is P(x11 & x22) + p(x22 & x22). + * + * Alternatively, we might have any configuration that gives us both 1 and 2 alts, which are: + * + * - P(x11 & x12 & x21) -- we have hom-ref and both hets + * - P(x22 & x12 & x21) -- we have hom-alt and both hets + * - P(x22 & x12) -- one haplotype is 22 and the other is het 12 + * - P(x22 & x21) -- one haplotype is 22 and the other is het 21 + * + * The probability is just p11_22 / (p11_22 + p hets) + * + * @table linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] + * doesn't have to be normalized as this function does the normalization internally + * @return the real space probability that the data is phased + */ + @Requires("table.length == 4") + protected double pPhased( double[] table ) { + final double[] normTable = MathUtils.normalizeFromLog10(table, true); + + final double x11 = normTable[0], x12 = normTable[1], x21 = normTable[2], x22 = normTable[3]; + + // probability that we are only x11 && x22 + final double p11_22 
= MathUtils.approximateLog10SumLog10(x11 + x22, x22 + x22); + + // probability of having any of the other pairs + final double p11_12_21 = MathUtils.approximateLog10SumLog10(x11 + x12, x11 + x21, x12 + x21); + final double p22_12_21 = MathUtils.approximateLog10SumLog10(x22 + x12, x22 + x21, x12 + x21); + final double p22_12 = x22 + x12; + final double p22_21 = x22 + x21; + final double pOthers = MathUtils.approximateLog10SumLog10(new double[]{p11_12_21, p22_12_21, p22_12, p22_21}); + + // probability of being phases is the ratio of p11_22 / pOthers which in log space is just a substraction + final double log10phased = p11_22 - (MathUtils.approximateLog10SumLog10(p11_22, pOthers)); + + return Math.pow(10.0, log10phased); + } + + protected double pPhasedTest( final double x11, final double x12, final double x21, final double x22 ) { + return pPhased(new double[]{x11, x12, x21, x22}); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java b/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java new file mode 100644 index 000000000..ea00a1901 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java @@ -0,0 +1,303 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; + +import java.util.*; + +/** + * Merges VariantContexts in a series of haplotypes according to their pairwise LD + * + * User: depristo + * Date: 3/28/13 + * Time: 6:17 PM + */ +public class LDMerger { + private final static Logger logger = Logger.getLogger(LDMerger.class); + + private final boolean DEBUG; + private final int minSamplesToMergeSNPs; + private final int minSamplesToMergeOtherEvents; + + public LDMerger(boolean DEBUG, int minSamplesToMergeSNPs, int minSamplesToMergeOtherEvents) { + this.DEBUG = DEBUG; + this.minSamplesToMergeSNPs = minSamplesToMergeSNPs; + this.minSamplesToMergeOtherEvents = minSamplesToMergeOtherEvents; + } + + protected LDMerger() { + this(false, 1, 1); + } + + // TODO -- should be class arguments and static variables in HC + protected final static int MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE = 6; + 
protected final static int MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE = 25; + + /** + * We require 99% confidence that only the phased haplotypes exist in the population to merge the records + */ + protected final static double MERGE_EVENTS_PROB_PHASED_THRESHOLD = 0.99; + + /** + * Merge as many events among the haplotypes as possible based on pairwise LD among variants + * + * @param haplotypes a list of haplotypes whose events we want to merge + * @param haplotypeReadMap map from sample name -> read likelihoods for each haplotype + * @param startPosKeySet a set of starting positions of all events among the haplotypes + * @param ref the reference bases + * @param refLoc the span of the reference bases + */ + public boolean mergeConsecutiveEventsBasedOnLD( final List haplotypes, + final Map haplotypeReadMap, + final TreeSet startPosKeySet, + final byte[] ref, + final GenomeLoc refLoc ) { + if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null"); + if ( haplotypeReadMap == null ) throw new IllegalArgumentException("haplotypeReadMap cannot be null"); + if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null"); + if ( ref == null ) throw new IllegalArgumentException("ref cannot be null"); + if ( refLoc == null ) throw new IllegalArgumentException("refLoc cannot be null"); + if ( refLoc.size() != ref.length ) throw new IllegalArgumentException("refLoc size " + refLoc.size() + " != ref.length " + ref.length + " at " + refLoc); + + if( startPosKeySet.size() <= 1 ) { return false; } + + final int nSamples = haplotypeReadMap.keySet().size(); + final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, haplotypeReadMap); + boolean somethingWasMerged = false; + boolean mapWasUpdated = true; + while( mapWasUpdated ) { + mapWasUpdated = mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calculator, nSamples, startPosKeySet, ref, refLoc); + somethingWasMerged |= mapWasUpdated; + } + 
return somethingWasMerged; + } + + /** + * Merge the next pair of events, if possible + * + * @param haplotypes a list of haplotypes whose events we want to merge + * @param ldCalculator calculates R^2 for pairs of events on demand + * @param startPosKeySet a set of starting positions of all events among the haplotypes + * @param ref the reference bases + * @param refLoc the span of the reference bases + * @return true if something was merged, false otherwise + */ + protected boolean mergeConsecutiveEventsBasedOnLDOnce( final List haplotypes, + final HaplotypeLDCalculator ldCalculator, + final int nSamples, + final TreeSet startPosKeySet, + final byte[] ref, + final GenomeLoc refLoc ) { + // loop over the set of start locations and consider pairs that start near each other + final Iterator iter = startPosKeySet.iterator(); + int thisStart = iter.next(); + while( iter.hasNext() ) { + final int nextStart = iter.next(); + final LDMergeData toMerge = getPairOfEventsToMerge(haplotypes, thisStart, nextStart); + + if ( toMerge.canBeMerged(nSamples) ) { + final double pPhased = ldCalculator.computeProbOfBeingPhased(toMerge.firstVC, toMerge.secondVC); + + if( DEBUG ) { + logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", pPhased)); + logger.info("-- " + toMerge.firstVC); + logger.info("-- " + toMerge.secondVC); + } + + if( pPhased > MERGE_EVENTS_PROB_PHASED_THRESHOLD) { + final VariantContext mergedVC = createMergedVariantContext(toMerge.firstVC, toMerge.secondVC, ref, refLoc); + // if for some reason the merging resulting in a bad allele, mergedVC will be null, and we will just remove first and second + replaceVariantContextsInMap(haplotypes, startPosKeySet, mergedVC, toMerge.firstVC, toMerge.secondVC); + return true; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events + } + } + + thisStart = nextStart; + } + + return false; + } + + /** + * Info about potential LD merge of two 
variant contexts + */ + private class LDMergeData { + VariantContext firstVC = null, secondVC = null; + boolean canBeMerged = true; + + /** Tell this object that it cant be merged for some reason */ + public LDMergeData cantBeMerged() { + canBeMerged = false; + return this; + } + + /** + * Can these two events be merged + * @param nSamples the number of samples we're considering + * @return true if we can merge our two variant contexts + */ + public boolean canBeMerged(final int nSamples) { + if ( ! canBeMerged || firstVC == null || secondVC == null ) + return false; + + final int distance = secondVC.getStart() - firstVC.getEnd(); + if ( firstVC.isSNP() && secondVC.isSNP() ) { + return nSamples >= minSamplesToMergeSNPs && distance <= MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE; + } else { + return nSamples >= minSamplesToMergeOtherEvents && distance <= MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE; + } + } + } + + /** + * Get the information about the potential merge of two events starting at thisStart and nextStart + * @param haplotypes our haplotypes + * @param thisStart the starting position of the first event to merge + * @param nextStart the starting position of the next event to merge + * @return + */ + private LDMergeData getPairOfEventsToMerge(final List haplotypes, final int thisStart, final int nextStart) { + final LDMergeData mergeData = new LDMergeData(); + + for( final Haplotype h : haplotypes ) { + // only make complex substitutions out of consecutive biallelic sites + final VariantContext thisHapVC = h.getEventMap().get(thisStart); + if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype + if( mergeData.firstVC == null ) { + mergeData.firstVC = thisHapVC; + } else if( !thisHapVC.hasSameAllelesAs( mergeData.firstVC) ) { + return mergeData.cantBeMerged(); + } + } + final VariantContext nextHapVC = h.getEventMap().get(nextStart); + if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at 
the next location on this haplotype + if( mergeData.secondVC == null ) { + mergeData.secondVC = nextHapVC; + } else if( !nextHapVC.hasSameAllelesAs( mergeData.secondVC) ) { + return mergeData.cantBeMerged(); + } + } + } + + // don't try to merge overlapping events + if ( mergeData.firstVC != null && mergeData.secondVC != null && mergeData.firstVC.getEnd() >= mergeData.secondVC.getStart() ) + return mergeData.cantBeMerged(); + + return mergeData; + } + + // BUGBUG: make this merge function more general + protected VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) { + final int thisStart = thisVC.getStart(); + final int nextStart = nextVC.getStart(); + byte[] refBases = new byte[]{}; + byte[] altBases = new byte[]{}; + refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases()); + altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases()); + int locus; + for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) { + final byte refByte = ref[locus - refLoc.getStart()]; + refBases = ArrayUtils.add(refBases, refByte); + altBases = ArrayUtils.add(altBases, refByte); + } + refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 
1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel + altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases()); + + int iii = 0; + if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele + while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; } + if ( iii == refBases.length ) { + // we've become a null allele, such as with CA/C + A/AA -> CA/CA => after trimming there's nothing left + // so return a null variant context so we can eliminate the variants from consideration + return null; + } + } + + + final Allele refAllele = Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ); + final Allele altAllele = Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ); + return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), Arrays.asList(refAllele, altAllele)).make(); + } + + /** + * Update the event maps in all haplotypes to replace a replacement of update1 and 2 with replacement + * + * @param haplotypes the haplotypes whose event maps we need to update + * @param startPosKeySet a sorted set of start positions that we must update + * @param replacement a VariantContext to replace update1 and update2 with. 
Can be null, indicating that we just want to remove update1 and update2 + * @param update1 the first VC we want to update + * @param update2 the second VC we want to update + */ + private void replaceVariantContextsInMap(final List haplotypes, + final TreeSet startPosKeySet, + final VariantContext replacement, + final VariantContext update1, final VariantContext update2) { + // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event + for( final Haplotype h : haplotypes ) { + // if we had both events, add replacement. In some cases the haplotype may not have both + // events but they were still merged because the haplotype isn't a particularly informative + // haplotype in any case. The order of operations here is important because we are modifying the map + final boolean shouldAdd = h.getEventMap().containsKey(update1.getStart()) && h.getEventMap().containsKey(update2.getStart()); + h.getEventMap().remove(update1.getStart()); + h.getEventMap().remove(update2.getStart()); + if ( shouldAdd && replacement != null ) { + h.getEventMap().addVC(replacement, false); // cannot merge we other events at the same position + } + } + + startPosKeySet.remove(update1.getStart()); + startPosKeySet.remove(update2.getStart()); + if ( replacement != null ) startPosKeySet.add(replacement.getStart()); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java index 9fb75463a..6a66d9845 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -279,148 +279,6 @@ public class GenotypingEngineUnitTest extends BaseTest { Assert.assertTrue(compareVCMaps(calculatedMap, expectedMap)); } - /** - * Tests 
that we get the right values from the R^2 calculation - */ - @Test - public void testCalculateR2LD() { - logger.warn("Executing testCalculateR2LD"); - - Assert.assertEquals(GenotypingEngine.calculateR2LD(1,1,1,1), 0.0, 0.00001); - Assert.assertEquals(GenotypingEngine.calculateR2LD(100,100,100,100), 0.0, 0.00001); - Assert.assertEquals(GenotypingEngine.calculateR2LD(1,0,0,1), 1.0, 0.00001); - Assert.assertEquals(GenotypingEngine.calculateR2LD(100,0,0,100), 1.0, 0.00001); - Assert.assertEquals(GenotypingEngine.calculateR2LD(1,2,3,4), (0.1 - 0.12) * (0.1 - 0.12) / (0.3 * 0.7 * 0.4 * 0.6), 0.00001); - } - - @Test - public void testCreateMergedVariantContext() { - logger.warn("Executing testCreateMergedVariantContext"); - - final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes(); - final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); - - // SNP + SNP = simple MNP - VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make(); - VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make(); - VariantContext mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // SNP + ref + SNP = MNP with ref base gap - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + 
mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + SNP - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // SNP + insertion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CAAAAA").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // deletion + SNP - thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","T").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - 
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // SNP + deletion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + deletion = MNP - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + deletion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + insertion - thisVC = new 
VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CA").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // deletion + deletion - thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // deletion + insertion (abutting) - thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); - nextVC = new VariantContextBuilder().loc("2", 1702, 1702).alleles("T","GCGCGC").make(); - truthVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","AGCGCGC").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // complex + complex - thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make(); - nextVC = new VariantContextBuilder().loc("2", 
1706, 1707).alleles("GG","AC").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make(); - mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - } - /** * Private function to compare Map of VCs, it only checks the types and start locations of the VariantContext */ diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java new file mode 100644 index 000000000..26384c190 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparatorUnitTest.java @@ -0,0 +1,77 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.haplotype; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class HaplotypeBaseComparatorUnitTest extends BaseTest { + @Test + public void testComparison() { + final List rawStrings = Arrays.asList("A", "C", "AC", "CT", "GTC", "ACGT"); + final List lexStrings = new ArrayList(rawStrings); + Collections.sort(lexStrings); + + for ( final List seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) { + final List haps = new ArrayList(seqs.size()); + for ( final String seq : seqs ) { + haps.add(new Haplotype(seq.getBytes(), false)); + } + + Collections.sort(haps, new HaplotypeBaseComparator()); + for ( int i = 0; i < lexStrings.size(); i++ ) + Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java new file mode 100644 index 000000000..3c3452bbf --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculatorUnitTest.java @@ -0,0 +1,118 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class HaplotypeLDCalculatorUnitTest extends BaseTest { + HaplotypeLDCalculator calculator; + + @BeforeMethod + public void setUp() throws Exception { + calculator = new HaplotypeLDCalculator(); + } + + /** + * Tests that we get the right values from the R^2 calculation + */ + @Test + public void computeProbOfBeingPhased() { + logger.warn("Executing testCalculateR2LD"); + + // See AA, AB, and BA in population + Assert.assertEquals(calculator.pPhasedTest(0, 0, 0, -100), 0, 0.00001); + + // See AA, AB, BB in population + Assert.assertTrue(calculator.pPhasedTest(0, 0, -100, 0) < 0.5); + + // See AA and BB in population + Assert.assertEquals(calculator.pPhasedTest(0, -100, -100, 0), 1, 0.00001); + + // See AA, AB, and BA but no BBs in population + Assert.assertEquals(calculator.pPhasedTest(0, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); + + // See BB, AB, and BA but no AAs in population, so BB is the best explanation + Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, 0), 1, 0.00001); + + // See only AB and BA but no AAs nor BBs in 
population + Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); + + // Previously bad input + Assert.assertEquals(calculator.pPhasedTest(-400, -600, -1200, Double.NEGATIVE_INFINITY), 0, 0.00001); + + // first variant is just bad, so BA and BB are both very bad, shouldn't be phased + Assert.assertEquals(calculator.pPhasedTest(0, -1000, -100, -10000), 0, 0.00001); + + // second variant is just bad, so AB and BB are both very bad, shouldn't be phased + Assert.assertEquals(calculator.pPhasedTest(0, -100, -1000, -10000), 0, 0.00001); + + // AA is very good, all all others are quite poor. Shouldn't be phased + Assert.assertEquals(calculator.pPhasedTest(0, -1000, -1000, -10000), 0, 0.00001); + + + for ( int i = -10; i > -10000; i -= 10 ) { + // only bad het states + Assert.assertTrue(calculator.pPhasedTest(0, i, i, 0) > 0.99, "Failed for " + i); + + // BB state is terrible + Assert.assertTrue(calculator.pPhasedTest(0, 0, 0, i) < 0.5, "Failed for " + i); + + // truth is AB, BA, and BB + Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, 0) < 0.5, "Failed for " + i); + + // truth is AB, BA + Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, i) < 0.5, "Failed for " + i); + + // Only good signal is AB, so we shouldn't be phased + Assert.assertTrue(calculator.pPhasedTest(i, i, 0, i) < 0.5, "Failed for " + i); + Assert.assertTrue(calculator.pPhasedTest(i, 0, i, i) < 0.5, "Failed for " + i); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java new file mode 100644 index 000000000..64a62bc02 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparatorUnitTest.java @@ -0,0 +1,76 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE 
AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class HaplotypeScoreComparatorUnitTest extends BaseTest { + @Test + public void testComparison() { + final List scores = Arrays.asList(3.0, 2.0, 1.0); + for ( final List myScores : Utils.makePermutations(scores, scores.size(), false) ) { + final List haps = new ArrayList(myScores.size()); + for ( final double score : myScores ) { + final Haplotype h = new Haplotype("ACT".getBytes(), false); + h.setScore(score); + haps.add(h); + } + + Collections.sort(haps, new HaplotypeScoreComparator()); + for ( int i = 0; i < myScores.size(); i++ ) + Assert.assertEquals(haps.get(i).getScore(), scores.get(i)); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java new file mode 100644 index 000000000..a2c69e535 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotype/LDMergerUnitTest.java @@ -0,0 +1,334 @@ +/* +* 
By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.TreeSet; + +public class LDMergerUnitTest extends BaseTest { + LDMerger merger; + GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); + } + + @BeforeMethod + public void setUp() throws Exception { + merger = new LDMerger(); + } + + @Test + public void testCreateMergedVariantContext() { + logger.warn("Executing testCreateMergedVariantContext"); + + final byte[] ref = 
"AATTCCGGAATTCCGGAATT".getBytes(); + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); + + // SNP + SNP = simple MNP + VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make(); + VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make(); + VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // SNP + ref + SNP = MNP with ref base gap + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + SNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + 
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // SNP + insertion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CAAAAA").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // deletion + SNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","T").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // SNP + deletion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + deletion = MNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); + 
nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + deletion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + insertion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CA").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // deletion + deletion + thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1701, 
1706).alleles("ATTCCG","ATCC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // deletion + insertion (abutting) + thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); + nextVC = new VariantContextBuilder().loc("2", 1702, 1702).alleles("T","GCGCGC").make(); + truthVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","AGCGCGC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // complex + complex + thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + } + + @Test + public void testInsertionDeletionBecomingNullAllele() { + final byte[] ref = "CAAA".getBytes(); + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); + + // insertion + deletion results in a null allele, should return false + final VariantContext thisVC = new VariantContextBuilder().loc("2", 1700, 1701).alleles("CA","C").make(); + final 
VariantContext nextVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("A","AA").make(); + final VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + Assert.assertNull(mergedVC, "Insertion deletion becoming a null allele should return a null variant context"); + } + + /** + * Just returns a given R2 value for testing + */ + private static class MockLDCalculator extends HaplotypeLDCalculator { + private final double R2; + + private MockLDCalculator(double r2) { + R2 = r2; + } + + @Override + protected double computeProbOfBeingPhased(VariantContext first, VariantContext second) { + return R2; + } + } + + @DataProvider(name = "R2MergerData") + public Object[][] makeR2MergerData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final double thres = LDMerger.MERGE_EVENTS_PROB_PHASED_THRESHOLD; + for ( final double r2 : Arrays.asList(0.0, thres - 0.01, thres + 0.01, 1.0) ) { + tests.add(new Object[]{"ACGT", "CCGC", 2, "4M", "ACGT", "CCGC", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "AGGC", 2, "4M", "CGT", "GGC", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "ACCC", 2, "4M", "GT", "CC", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "ACCGTT", 2, "2M1I1M1I1M", "CG", "CCGT", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "AGCT", 2, "4M", "CG", "GC", r2, r2 >= thres}); + tests.add(new Object[]{"ACAGT", "AAGC", 2, "1M1D3M", "ACAGT", "AAGC", r2, r2 >= thres}); + tests.add(new Object[]{"ACAGT", "AAT", 2, "1M1D1M1D1M", "ACAG", "AA", r2, r2 >= thres}); + + // cannot be merged -- only 1 event + tests.add(new Object[]{"AAA", "ACA", 1, "3M", null, null, r2, false}); + + final int dist = LDMerger.MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE + 2; + tests.add(new Object[]{Utils.dupString("A", dist), "C" + Utils.dupString("A", dist - 2) + "C", 2, dist + "M", null, null, r2, false}); + } + + return tests.toArray(new Object[][]{}); + } 
+ + @Test(dataProvider = "R2MergerData") + public void testR2Merger(final String refS, final String hapS, int nEvents, final String cigar, final String expectedMergedRef, final String expectedMergedAlt, final double r2, final boolean expectMerge) { + final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); + final Haplotype hap = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); + + final List haplotypes = Arrays.asList(ref, hap); + final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); + final MockLDCalculator r2Calc = new MockLDCalculator(r2); + + Assert.assertEquals(vcStarts.size(), nEvents); + final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); + Assert.assertEquals(merged, expectMerge); + Assert.assertEquals(vcStarts.size(), expectMerge ? 
1 : nEvents); + if ( expectMerge ) { + final VariantContext vc = hap.getEventMap().getVariantContexts().iterator().next(); + Assert.assertTrue(vc.isBiallelic()); + Assert.assertEquals(vc.getReference().getDisplayString(), expectedMergedRef); + Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), expectedMergedAlt); + } + } + + @Test + public void testR2MergerWithThirdHapWithoutEvent() { + final String refS = "ACGT"; + final String hapS = "CCGA"; + final String cigar = "4M"; + final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); + final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final Haplotype hap2 = new Haplotype("ACGA".getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); + + final List haplotypes = Arrays.asList(ref, hap1, hap2); + final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); + final MockLDCalculator r2Calc = new MockLDCalculator(1.0); + + Assert.assertEquals(vcStarts.size(), 2); + final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); + Assert.assertEquals(merged, true); + Assert.assertEquals(vcStarts.size(), 1); + + final VariantContext vc = hap1.getEventMap().getVariantContexts().iterator().next(); + Assert.assertTrue(vc.isBiallelic()); + Assert.assertEquals(vc.getReference().getDisplayString(), "ACGT"); + Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), "CCGA"); + + Assert.assertEquals(hap2.getEventMap().size(), 0); + } + + @Test + public void testR2MergerWithMultipleAllelesAtSites() { + final String refS = "ACGT"; + final String hapS = "TCGA"; + final String cigar = "4M"; + final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + 
"M")); + final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + + final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); + for (final String hap2S : Arrays.asList("GCGA", "TCGG")) { + final Haplotype hap2 = new Haplotype(hap2S.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + + final List haplotypes = Arrays.asList(ref, hap1, hap2); + final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); + final MockLDCalculator r2Calc = new MockLDCalculator(1.0); + + Assert.assertEquals(vcStarts.size(), 2); + final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); + Assert.assertEquals(merged, false); + Assert.assertEquals(vcStarts.size(), 2); + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java similarity index 58% rename from public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java rename to public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java index c32cde641..7bc6acbfe 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventExtractor.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java @@ -1,27 +1,27 @@ /* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright 
notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ package org.broadinstitute.sting.utils.haplotype; @@ -35,7 +35,6 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; @@ -49,39 +48,40 @@ import java.util.*; * Date: 3/27/13 * Time: 8:35 AM */ -public class EventExtractor extends TreeMap { - private final static Logger logger = Logger.getLogger(EventExtractor.class); - private final static boolean mergeClumpedEvents = true; +public class EventMap extends TreeMap { + private final static Logger logger = Logger.getLogger(EventMap.class); protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3; public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - public EventExtractor( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { + private final Haplotype haplotype; + private final byte[] ref; + private final GenomeLoc refLoc; + private final String sourceNameToAdd; + + public EventMap(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) { super(); + this.haplotype = haplotype; + this.ref = ref; + this.refLoc = refLoc; + this.sourceNameToAdd = sourceNameToAdd; - processCigarForInitialEvents(haplotype, ref, refLoc, sourceNameToAdd); - if ( mergeClumpedEvents && getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) { - replaceClumpedEventsWithBlockSubstititions(haplotype, ref, refLoc); - } + processCigarForInitialEvents(); } /** * For testing. 
Let's you set up a explicit configuration without having to process a haplotype and reference * @param stateForTesting */ - protected EventExtractor(final Map stateForTesting) { - super(stateForTesting); - } - - /** - * For testing. Let's you set up a explicit configuration without having to process a haplotype and reference - * @param stateForTesting - */ - protected EventExtractor(final Collection stateForTesting) { + protected EventMap(final Collection stateForTesting) { + haplotype = null; + ref = null; + refLoc = null; + sourceNameToAdd = null; for ( final VariantContext vc : stateForTesting ) addVC(vc); } - protected void processCigarForInitialEvents(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) { + protected void processCigarForInitialEvents() { final Cigar cigar = haplotype.getCigar(); final byte[] alignment = haplotype.getBases(); @@ -172,11 +172,22 @@ public class EventExtractor extends TreeMap { } } - private void addVC(final VariantContext vc) { + /** + * Add VariantContext vc to this map, merging events with the same start sites if necessary + * @param vc the variant context to add + */ + protected void addVC(final VariantContext vc) { addVC(vc, true); } - private void addVC(final VariantContext vc, final boolean merge) { + /** + * Add VariantContext vc to this map + * @param vc the variant context to add + * @param merge should we attempt to merge it with an already existing element, or should we throw an error in that case? + */ + protected void addVC(final VariantContext vc, final boolean merge) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + if ( containsKey(vc.getStart()) ) { if ( merge ) { final VariantContext prev = get(vc.getStart()); @@ -188,20 +199,46 @@ public class EventExtractor extends TreeMap { put(vc.getStart(), vc); } - private VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) { - if ( ! 
vc1.isSNP() ) throw new IllegalArgumentException("vc1 must be a snp"); + /** + * Create a block substitution out of two variant contexts that start at the same position + * + * vc1 can be SNP, and vc2 can then be either a insertion or deletion. + * If vc1 is an indel, then vc2 must be the opposite type (vc1 deletion => vc2 must be an insertion) + * + * @param vc1 the first variant context we want to merge + * @param vc2 the second + * @return a block substitution that represents the composite substitution implied by vc1 and vc2 + */ + protected VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) { + if ( vc1.getStart() != vc2.getStart() ) throw new IllegalArgumentException("vc1 and 2 must have the same start but got " + vc1 + " and " + vc2); + if ( ! vc1.isBiallelic() ) throw new IllegalArgumentException("vc1 must be biallelic"); + if ( ! vc1.isSNP() ) { + if ( ! ((vc1.isSimpleDeletion() && vc2.isSimpleInsertion()) || (vc1.isSimpleInsertion() && vc2.isSimpleDeletion()))) + throw new IllegalArgumentException("Can only merge single insertion with deletion (or vice versa) but got " + vc1 + " merging with " + vc2); + } else if ( vc2.isSNP() ) { + throw new IllegalArgumentException("vc1 is " + vc1 + " but vc2 is a SNP, which implies there's been some terrible bug in the cigar " + vc2); + } - Allele ref, alt; + final Allele ref, alt; final VariantContextBuilder b = new VariantContextBuilder(vc1); - if ( vc1.getReference().equals(vc2.getReference()) ) { - // we've got an insertion, so we just update the alt to have the prev alt - ref = vc1.getReference(); - alt = Allele.create(vc1.getAlternateAllele(0).getDisplayString() + vc2.getAlternateAllele(0).getDisplayString().substring(1), false); + if ( vc1.isSNP() ) { + // we have to repair the first base, so SNP case is special cased + if ( vc1.getReference().equals(vc2.getReference()) ) { + // we've got an insertion, so we just update the alt to have the prev alt + ref = vc1.getReference(); + alt = 
Allele.create(vc1.getAlternateAllele(0).getDisplayString() + vc2.getAlternateAllele(0).getDisplayString().substring(1), false); + } else { + // we're dealing with a deletion, so we patch the ref + ref = vc2.getReference(); + alt = vc1.getAlternateAllele(0); + b.stop(vc2.getEnd()); + } } else { - // we're dealing with a deletion, so we patch the ref - ref = vc2.getReference(); - alt = vc1.getAlternateAllele(0); - b.stop(vc2.getEnd()); + final VariantContext insertion = vc1.isSimpleInsertion() ? vc1 : vc2; + final VariantContext deletion = vc1.isSimpleInsertion() ? vc2 : vc1; + ref = deletion.getReference(); + alt = insertion.getAlternateAllele(0); + b.stop(deletion.getEnd()); } return b.alleles(Arrays.asList(ref, alt)).make(); @@ -209,24 +246,26 @@ public class EventExtractor extends TreeMap { // TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it @Requires("getNumberOfEvents() > 0") - protected void replaceClumpedEventsWithBlockSubstititions(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) { - int lastStart = -1; - for ( boolean foundOne = true; foundOne; ) { - foundOne = false; - for ( final VariantContext vc : getVariantContexts() ) { - if ( vc.getStart() > lastStart ) { - lastStart = vc.getStart(); - final List neighborhood = getNeighborhood(vc, 10); - if ( updateToBlockSubstitutionIfBetter(neighborhood, haplotype, ref, refLoc) ) { - foundOne = true; - break; + protected void replaceClumpedEventsWithBlockSubstititions() { + if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) { + int lastStart = -1; + for ( boolean foundOne = true; foundOne; ) { + foundOne = false; + for ( final VariantContext vc : getVariantContexts() ) { + if ( vc.getStart() > lastStart ) { + lastStart = vc.getStart(); + final List neighborhood = getNeighborhood(vc, 10); + if ( updateToBlockSubstitutionIfBetter(neighborhood) ) { + foundOne = true; + break; + } } } } } } - protected 
boolean updateToBlockSubstitutionIfBetter(final List neighbors, final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc) { + protected boolean updateToBlockSubstitutionIfBetter(final List neighbors) { if (neighbors.size() < MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) return false; // TODO -- need more tests to decide if this is really so good @@ -284,24 +323,70 @@ public class EventExtractor extends TreeMap { return neighbors; } + /** + * Get the starting positions of events in this event map + * @return + */ public Set getStartPositions() { return keySet(); } + /** + * Get the variant contexts in order of start position in this event map + * @return + */ public Collection getVariantContexts() { return values(); } + /** + * How many events do we have? + * @return + */ public int getNumberOfEvents() { return size(); } @Override public String toString() { - final StringBuilder b = new StringBuilder("EventExtractor{"); + final StringBuilder b = new StringBuilder("EventMap{"); for ( final VariantContext vc : getVariantContexts() ) b.append(String.format("%s:%d-%d %s,", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles())); b.append("}"); return b.toString(); } + + /** + * Build event maps for each haplotype, returning the sorted set of all of the starting positions of all + * events across all haplotypes + * + * @param haplotypes a list of haplotypes + * @param ref the reference bases + * @param refLoc the span of the reference bases + * @param debug if true, we'll emit debugging information during this operation + * @return a sorted set of start positions of all events among all haplotypes + */ + public static TreeSet buildEventMapsForHaplotypes( final List haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final boolean debug) { + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = new TreeSet(); + int hapNumber = 0; + + if( debug ) 
logger.info("=== Best Haplotypes ==="); + for( final Haplotype h : haplotypes ) { + // Walk along the alignment and turn any difference from the reference into an event + h.setEventMap( new EventMap( h, ref, refLoc, "HC" + hapNumber++ ) ); + startPosKeySet.addAll(h.getEventMap().getStartPositions()); + + if( debug ) { + logger.info(h.toString()); + logger.info("> Cigar = " + h.getCigar()); + logger.info(">> Events = " + h.getEventMap()); + } + } + + return startPosKeySet; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java index a94c08198..081fd14e0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -37,15 +37,13 @@ import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; -import java.io.Serializable; import java.util.Arrays; -import java.util.Comparator; import java.util.LinkedHashMap; import java.util.List; public class Haplotype extends Allele { private GenomeLoc genomeLocation = null; - private EventExtractor eventMap = null; + private EventMap eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; private Event artificialEvent = null; @@ -115,11 +113,11 @@ public class Haplotype extends Allele { return Arrays.hashCode(getBases()); } - public EventExtractor getEventMap() { + public EventMap getEventMap() { return eventMap; } - public void setEventMap( final EventExtractor eventMap ) { + public void setEventMap( final EventMap eventMap ) { this.eventMap = eventMap; } @@ -219,25 +217,6 @@ public class Haplotype extends Allele { return new Haplotype(newHaplotypeBases, new Event(refAllele, altAllele, genomicInsertLocation)); } - public static class HaplotypeBaseComparator implements Comparator, Serializable { - 
@Override - public int compare( final Haplotype hap1, final Haplotype hap2 ) { - return compareHaplotypeBases(hap1, hap2); - } - - public static int compareHaplotypeBases(final Haplotype hap1, final Haplotype hap2) { - final byte[] arr1 = hap1.getBases(); - final byte[] arr2 = hap2.getBases(); - // compares byte arrays using lexical ordering - final int len = Math.min(arr1.length, arr2.length); - for( int iii = 0; iii < len; iii++ ) { - final int cmp = arr1[iii] - arr2[iii]; - if (cmp != 0) { return cmp; } - } - return arr2.length - arr1.length; - } - } - public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, @@ -316,15 +295,4 @@ public class Haplotype extends Allele { public void setScore(double score) { this.score = this.isReference() ? Double.MAX_VALUE : score; } - - /** - * A comparator that sorts haplotypes in decreasing order of score, so that the best supported - * haplotypes are at the top - */ - public static class ScoreComparator implements Comparator { - @Override - public int compare(Haplotype o1, Haplotype o2) { - return -1 * Double.valueOf(o1.getScore()).compareTo(o2.getScore()); - } - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java new file mode 100644 index 000000000..191442e3e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeBaseComparator.java @@ -0,0 +1,42 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is 
furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import java.util.Comparator; + +/** + * Compares two haplotypes in the lexicographic order of their bases + * + * User: depristo + * Date: 3/29/13 + * Time: 11:09 AM + */ +public class HaplotypeBaseComparator implements Comparator { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + return hap1.getBaseString().compareTo(hap2.getBaseString()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java new file mode 100644 index 000000000..40146ba88 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeScoreComparator.java @@ -0,0 +1,39 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* 
+* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import java.util.Comparator; + +/** + * A comparator that sorts haplotypes in decreasing order of score, so that the best supported + * haplotypes are at the top + */ +public class HaplotypeScoreComparator implements Comparator { + @Override + public int compare(Haplotype o1, Haplotype o2) { + return -1 * Double.valueOf(o1.getScore()).compareTo(o2.getScore()); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java similarity index 61% rename from public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java index 480f82a46..d0b418b96 100644 --- a/public/java/test/org/broadinstitute/sting/utils/haplotype/EventExtractorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/haplotype/EventMapUnitTest.java @@ -1,27 +1,27 @@ /* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, 
merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ package org.broadinstitute.sting.utils.haplotype; @@ -31,16 +31,14 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; -public class EventExtractorUnitTest extends BaseTest { +public class EventMapUnitTest extends BaseTest { private final static String CHR = "20"; private final static String NAME = "foo"; @@ -71,9 +69,9 @@ public class EventExtractorUnitTest extends BaseTest { vcs.add(vc); } - tests.add(new Object[]{new EventExtractor(new LinkedList(allVCS)), Collections.emptyList()}); + tests.add(new Object[]{new EventMap(new LinkedList(allVCS)), Collections.emptyList()}); allVCS.addAll(vcs); - tests.add(new Object[]{new EventExtractor(allVCS), vcs}); + tests.add(new Object[]{new EventMap(allVCS), vcs}); } } } @@ -86,12 +84,12 @@ public class EventExtractorUnitTest extends BaseTest { /** * Example testng test using MyDataProvider */ - @Test(dataProvider = "MyDataProvider", enabled = true) // TODO == reenable - public void testGetNeighborhood(final EventExtractor eventExtractor, final List expectedNeighbors) { + @Test(dataProvider = "MyDataProvider", enabled = true) + public void testGetNeighborhood(final EventMap eventMap, final List expectedNeighbors) { final VariantContext leftOfNeighors = expectedNeighbors.isEmpty() ? 
null : expectedNeighbors.get(0); - for ( final VariantContext vc : eventExtractor.getVariantContexts() ) { - final List n = eventExtractor.getNeighborhood(vc, 5); + for ( final VariantContext vc : eventMap.getVariantContexts() ) { + final List n = eventMap.getNeighborhood(vc, 5); if ( leftOfNeighors == vc ) Assert.assertEquals(n, expectedNeighbors); else if ( ! expectedNeighbors.contains(vc) ) @@ -103,7 +101,7 @@ public class EventExtractorUnitTest extends BaseTest { public Object[][] makeBlockSubstitutionsData() { List tests = new ArrayList(); - for ( int size = EventExtractor.MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION; size < 10; size++ ) { + for ( int size = EventMap.MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION; size < 10; size++ ) { final String ref = Utils.dupString("A", size); final String alt = Utils.dupString("C", size); tests.add(new Object[]{ref, alt, size + "M", GATKVariantContextUtils.makeFromAlleles(NAME, CHR, 1, Arrays.asList(ref, alt))}); @@ -131,7 +129,8 @@ public class EventExtractorUnitTest extends BaseTest { public void testBlockSubstitutionsData(final String refBases, final String haplotypeBases, final String cigar, final VariantContext expectedBlock) { final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); - final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME); + final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME); + ee.replaceClumpedEventsWithBlockSubstititions(); Assert.assertEquals(ee.getNumberOfEvents(), 1); final VariantContext actual = ee.getVariantContexts().iterator().next(); Assert.assertTrue(GATKVariantContextUtils.equalSites(actual, expectedBlock), "Failed with " + actual); @@ -142,11 +141,11 @@ public class EventExtractorUnitTest extends BaseTest { List tests = new ArrayList(); tests.add(new Object[]{"TT", "GCT", "1M1I1M", 
Arrays.asList(Arrays.asList("T", "GC"))}); - tests.add(new Object[]{"GCT", "TT", "1M1D", Arrays.asList(Arrays.asList("GC", "T"))}); + tests.add(new Object[]{"GCT", "TT", "1M1D1M", Arrays.asList(Arrays.asList("GC", "T"))}); tests.add(new Object[]{"TT", "GCCT", "1M2I1M", Arrays.asList(Arrays.asList("T", "GCC"))}); - tests.add(new Object[]{"GCCT", "TT", "1M2D", Arrays.asList(Arrays.asList("GCC", "T"))}); - tests.add(new Object[]{"AAGCCT", "AATT", "3M2D", Arrays.asList(Arrays.asList("GCC", "T"))}); - tests.add(new Object[]{"AAGCCT", "GATT", "3M2D", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("GCC", "T"))}); + tests.add(new Object[]{"GCCT", "TT", "1M2D1M", Arrays.asList(Arrays.asList("GCC", "T"))}); + tests.add(new Object[]{"AAGCCT", "AATT", "3M2D1M", Arrays.asList(Arrays.asList("GCC", "T"))}); + tests.add(new Object[]{"AAGCCT", "GATT", "3M2D1M", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("GCC", "T"))}); tests.add(new Object[]{"AAAAA", "AGACA", "5M", Arrays.asList(Arrays.asList("A", "G"), Arrays.asList("A", "C"))}); return tests.toArray(new Object[][]{}); @@ -155,11 +154,12 @@ public class EventExtractorUnitTest extends BaseTest { /** * Example testng test using MyDataProvider */ - @Test(dataProvider = "AdjacentSNPIndelTest", enabled = true) + @Test(dataProvider = "AdjacentSNPIndelTest") public void testAdjacentSNPIndelTest(final String refBases, final String haplotypeBases, final String cigar, final List> expectedAlleles) { final Haplotype hap = new Haplotype(haplotypeBases.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); final GenomeLoc loc = new UnvalidatingGenomeLoc(CHR, 0, 1, refBases.length()); - final EventExtractor ee = new EventExtractor(hap, refBases.getBytes(), loc, NAME); + final EventMap ee = new EventMap(hap, refBases.getBytes(), loc, NAME); + ee.replaceClumpedEventsWithBlockSubstititions(); Assert.assertEquals(ee.getNumberOfEvents(), expectedAlleles.size()); final List actuals = new ArrayList(ee.getVariantContexts()); 
for ( int i = 0; i < ee.getNumberOfEvents(); i++ ) { @@ -168,4 +168,36 @@ public class EventExtractorUnitTest extends BaseTest { Assert.assertEquals(actual.getAlternateAllele(0).getDisplayString(), expectedAlleles.get(i).get(1)); } } + + @DataProvider(name = "MakeBlockData") + public Object[][] makeMakeBlockData() { + List tests = new ArrayList(); + + tests.add(new Object[]{Arrays.asList("A", "G"), Arrays.asList("AGT", "A"), Arrays.asList("AGT", "G")}); + tests.add(new Object[]{Arrays.asList("A", "G"), Arrays.asList("A", "AGT"), Arrays.asList("A", "GGT")}); + + tests.add(new Object[]{Arrays.asList("AC", "A"), Arrays.asList("A", "AGT"), Arrays.asList("AC", "AGT")}); + tests.add(new Object[]{Arrays.asList("ACGTA", "A"), Arrays.asList("A", "AG"), Arrays.asList("ACGTA", "AG")}); + tests.add(new Object[]{Arrays.asList("AC", "A"), Arrays.asList("A", "AGCGT"), Arrays.asList("AC", "AGCGT")}); + tests.add(new Object[]{Arrays.asList("A", "ACGTA"), Arrays.asList("AG", "A"), Arrays.asList("AG", "ACGTA")}); + tests.add(new Object[]{Arrays.asList("A", "AC"), Arrays.asList("AGCGT", "A"), Arrays.asList("AGCGT", "AC")}); + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "MakeBlockData", enabled = true) + public void testGetNeighborhood(final List firstAlleles, final List secondAlleles, final List expectedAlleles) { + final VariantContext vc1 = GATKVariantContextUtils.makeFromAlleles("x", "20", 10, firstAlleles); + final VariantContext vc2 = GATKVariantContextUtils.makeFromAlleles("x", "20", 10, secondAlleles); + final VariantContext expected = GATKVariantContextUtils.makeFromAlleles("x", "20", 10, expectedAlleles); + + final EventMap eventMap = new EventMap(Collections.emptyList()); + final VariantContext block = eventMap.makeBlock(vc1, vc2); + + Assert.assertEquals(block.getStart(), expected.getStart()); + Assert.assertEquals(block.getAlleles(), expected.getAlleles()); + } } From 
7105ad65a6ab37a675d74cd468316f122cb3c40c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 2 Apr 2013 15:57:45 -0400 Subject: [PATCH 145/226] Remove the capability of EventMap to emit symbolic alleles for unassembled events -- These events always occur on the very edge of the haplotypes, and are intrinsically dodgy. So instead of emitting them and then potentially having to deal with merging real basepair events into them we just no longer emit those events. --- .../org/broadinstitute/sting/utils/haplotype/EventMap.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java index 7bc6acbfe..1d33e328d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java @@ -105,8 +105,9 @@ public class EventMap extends TreeMap { if( BaseUtils.isRegularBase(refByte) ) { insertionAlleles.add( Allele.create(refByte, true) ); } - if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele - insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); + if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { + // if the insertion isn't completely resolved in the haplotype, skip it + // note this used to emit SYMBOLIC_UNASSEMBLED_EVENT_ALLELE but that seems dangerous } else { byte[] insertionBases = new byte[]{}; insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]); // add the padding base From 2aac9e2782aaac2aaf60e06ca6734415c8d06743 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 31 Mar 2013 14:40:14 -0400 Subject: [PATCH 146/226] More efficient ZipLinearChains algorithm -- Goes through the graph looking for chains to zip, accumulates the vertices of the chains, and then 
finally go through and updates the graph in one big go. Vastly more efficient than the previous version, but unfortunately doesn't actually work now -- Also incorporate edge weight propagation into SeqGraph zipLinearChains. The edge weights for all incoming and outgoing edges are now their previous value, plus the sum of the internal chain edges / n such edges --- .../haplotypecaller/graphs/SeqGraph.java | 208 +++++++++++++----- .../graphs/SeqGraphUnitTest.java | 177 ++++++++++++++- 2 files changed, 328 insertions(+), 57 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index 400b5c7ee..d08c2f211 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -46,10 +46,13 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; -import org.apache.commons.lang.ArrayUtils; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import java.io.File; import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; import java.util.Set; /** @@ -58,7 +61,7 @@ import java.util.Set; * @author: depristo * @since 03/2013 */ -public class SeqGraph extends BaseGraph { +public final class SeqGraph extends BaseGraph { private final static boolean PRINT_SIMPLIFY_GRAPHS = false; private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5; @@ -118,18 +121,8 @@ public class SeqGraph extends BaseGraph { /** * Zip up all of the simple linear chains present in this graph. 
- */ - public boolean zipLinearChains() { - boolean foundOne = false; - while( zipOneLinearChain() ) { - // just keep going until zipOneLinearChain says its done - foundOne = true; - } - return foundOne; - } - - /** - * Merge together two vertices in the graph v1 -> v2 into a single vertex v' containing v1 + v2 sequence + * + * Merges together all pairs of vertices in the graph v1 -> v2 into a single vertex v' containing v1 + v2 sequence * * Only works on vertices where v1's only outgoing edge is to v2 and v2's only incoming edge is from v1. * @@ -137,44 +130,153 @@ public class SeqGraph extends BaseGraph { * * @return true if any such pair of vertices could be found, false otherwise */ - protected boolean zipOneLinearChain() { - for( final BaseEdge e : edgeSet() ) { - final SeqVertex outgoingVertex = getEdgeTarget(e); - final SeqVertex incomingVertex = getEdgeSource(e); - if( !outgoingVertex.equals(incomingVertex) - && outDegreeOf(incomingVertex) == 1 && inDegreeOf(outgoingVertex) == 1 - && isReferenceNode(incomingVertex) == isReferenceNode(outgoingVertex) ) { - - final Set outEdges = outgoingEdgesOf(outgoingVertex); - final Set inEdges = incomingEdgesOf(incomingVertex); - final BaseEdge singleOutEdge = outEdges.isEmpty() ? null : outEdges.iterator().next(); - final BaseEdge singleInEdge = inEdges.isEmpty() ? 
null : inEdges.iterator().next(); - - if( inEdges.size() == 1 && outEdges.size() == 1 ) { - singleInEdge.setMultiplicity( singleInEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - singleOutEdge.setMultiplicity( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() / 2 ) ); - } else if( inEdges.size() == 1 ) { - singleInEdge.setMultiplicity( Math.max(singleInEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) ); - } else if( outEdges.size() == 1 ) { - singleOutEdge.setMultiplicity( Math.max( singleOutEdge.getMultiplicity() + ( e.getMultiplicity() - 1 ), 0) ); - } - - final SeqVertex addedVertex = new SeqVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSequence()) ); - addVertex(addedVertex); - for( final BaseEdge edge : outEdges ) { - addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge.isRef(), edge.getMultiplicity())); - } - for( final BaseEdge edge : inEdges ) { - addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge.isRef(), edge.getMultiplicity())); - } - - removeVertex(incomingVertex); - removeVertex(outgoingVertex); - return true; - } + public boolean zipLinearChains() { + // create the list of start sites [doesn't modify graph yet] + final List zipStarts = new LinkedList(); + for ( final SeqVertex source : vertexSet() ) { + if ( isLinearChainStart(source) ) + zipStarts.add(source); } - return false; + if ( zipStarts.isEmpty() ) // nothing to do, as nothing could start a chain + return false; + + // At this point, zipStarts contains all of the vertices in this graph that might start some linear + // chain of vertices. 
We walk through each start, building up the linear chain of vertices and then + // zipping them up with mergeLinearChain, if possible + boolean mergedOne = false; + for ( final SeqVertex zipStart : zipStarts ) { + final LinkedList linearChain = traceLinearChain(zipStart); + + // merge the linearized chain, recording if we actually did some useful work + mergedOne |= mergeLinearChain(linearChain); + } + + return mergedOne; + } + + /** + * Is source vertex potentially a start of a linear chain of vertices? + * + * We are a start of a zip chain if our out degree is 1 and either the + * the vertex has no incoming connections or 2 or more (we must start a chain) or + * we have exactly one incoming vertex and that one has out-degree > 1 (i.e., source's incoming + * vertex couldn't be a start itself + * + * @param source a non-null vertex + * @return true if source might start a linear chain + */ + @Requires("source != null") + private boolean isLinearChainStart(final SeqVertex source) { + return outDegreeOf(source) == 1 + && ( inDegreeOf(source) != 1 + || outDegreeOf(incomingVerticesOf(source).iterator().next()) > 1 ); + } + + /** + * Get all of the vertices in a linear chain of vertices starting at zipStart + * + * Build a list of vertices (in order) starting from zipStart such that each sequential pair of vertices + * in the chain A and B can be zipped together. + * + * @param zipStart a vertex that starts a linear chain + * @return a list of vertices that comprise a linear chain starting with zipStart. The resulting + * list will always contain at least zipStart as the first element. 
+ */ + @Requires("isLinearChainStart(zipStart)") + @Ensures({"result != null", "result.size() >= 1"}) + private LinkedList traceLinearChain(final SeqVertex zipStart) { + final LinkedList linearChain = new LinkedList(); + linearChain.add(zipStart); + + boolean lastIsRef = isReferenceNode(zipStart); // remember because this calculation is expensive + SeqVertex last = zipStart; + while (true) { + if ( outDegreeOf(last) != 1 ) + // cannot extend a chain from last if last has multiple outgoing branches + break; + + // there can only be one (outgoing edge of last) by contract + final SeqVertex target = getEdgeTarget(outgoingEdgeOf(last)); + + if ( inDegreeOf(target) != 1 || last.equals(target) ) + // cannot zip up a target that has multiple incoming nodes or that's a cycle to the last node + break; + + final boolean targetIsRef = isReferenceNode(target); + if ( lastIsRef != targetIsRef ) // both our isRef states must be equal + break; + + linearChain.add(target); // extend our chain by one + + // update our last state to be the current state, and continue + last = target; + lastIsRef = targetIsRef; + } + + return linearChain; + } + + /** + * Merge a linear chain of vertices into a single combined vertex, and update this graph to such that + * the incoming edges into the first element of the linearChain and the outgoing edges from linearChain.getLast() + * all point to this new combined vertex. 
+ * + * @param linearChain a non-empty chain of vertices that can be zipped up into a single vertex + * @return true if we actually merged at least two vertices together + */ + protected boolean mergeLinearChain(final LinkedList linearChain) { + if ( linearChain.isEmpty() ) throw new IllegalArgumentException("BUG: cannot have linear chain with 0 elements but got " + linearChain); + + final SeqVertex first = linearChain.getFirst(); + final SeqVertex last = linearChain.getLast(); + + if ( first == last ) return false; // only one element in the chain, cannot be extended + + // create the combined vertex, and add it to the graph + // TODO -- performance problem -- can be optimized if we want + final List seqs = new LinkedList(); + for ( SeqVertex v : linearChain ) seqs.add(v.getSequence()); + final byte[] seqsCat = org.broadinstitute.sting.utils.Utils.concat(seqs.toArray(new byte[][]{})); + final SeqVertex addedVertex = new SeqVertex( seqsCat ); + addVertex(addedVertex); + + final Set inEdges = incomingEdgesOf(first); + final Set outEdges = outgoingEdgesOf(last); + + final int nEdges = inEdges.size() + outEdges.size(); + int sharedWeightAmongEdges = nEdges == 0 ? 
0 : sumEdgeWeightAlongChain(linearChain) / nEdges; + final BaseEdge inc = new BaseEdge(false, sharedWeightAmongEdges); // template to make .add function call easy + + // update the incoming and outgoing edges to point to the new vertex + for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge).add(inc)); } + for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge).add(inc)); } + + removeAllVertices(linearChain); + return true; + } + + /** + * Get the sum of the edge weights on a linear chain of at least 2 elements + * + * @param chain a linear chain of vertices with at least 2 vertices + * @return the sum of the multiplicities along all edges connecting vertices within the chain + */ + @Requires({"chain != null", "chain.size() >= 2"}) + private int sumEdgeWeightAlongChain(final LinkedList chain) { + int sum = 0; + SeqVertex prev = null; + + for ( final SeqVertex v : chain ) { + if ( prev != null ) { + final BaseEdge e = getEdge(prev, v); + if ( e == null ) throw new IllegalStateException("Something wrong with the linear chain, got a null edge between " + prev + " and " + v); + sum += e.getMultiplicity(); + } + prev = v; + } + + return sum; } /** @@ -241,7 +343,7 @@ public class SeqGraph extends BaseGraph { protected class MergeDiamonds extends VertexBasedTransformer { @Override protected boolean tryToTransform(final SeqVertex top) { - final Set middles = outgoingVerticesOf(top); + final List middles = outgoingVerticesOf(top); if ( middles.size() <= 1 ) // we can only merge if there's at least two middle nodes return false; @@ -295,7 +397,7 @@ public class SeqGraph extends BaseGraph { protected class MergeTails extends VertexBasedTransformer { @Override protected boolean tryToTransform(final SeqVertex top) { - final Set tails = outgoingVerticesOf(top); + final List tails = outgoingVerticesOf(top); if ( tails.size() <= 1 ) return false; @@ -379,7 +481,7 @@ public class SeqGraph 
extends BaseGraph { protected class MergeHeadlessIncomingSources extends VertexBasedTransformer { @Override boolean tryToTransform(final SeqVertex bottom) { - final Set incoming = incomingVerticesOf(bottom); + final List incoming = incomingVerticesOf(bottom); if ( incoming.size() <= 1 ) return false; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index cbd7b1063..698b83199 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -51,11 +51,15 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.ArrayList; import java.util.Arrays; +import java.util.LinkedList; import java.util.List; public class SeqGraphUnitTest extends BaseTest { + private final static boolean DEBUG = true; + private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { public byte[] sequence; public int KMER_LENGTH; @@ -98,7 +102,7 @@ public class SeqGraphUnitTest extends BaseTest { return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class); } - @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = true) + @Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = !DEBUG) public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) { logger.warn(String.format("Test: %s", cfg.toString())); @@ -178,7 +182,7 @@ public class SeqGraphUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "IsDiamondData", enabled = true) + @Test(dataProvider = "IsDiamondData", enabled = !DEBUG) public void testIsDiamond(final SeqGraph graph, 
final SeqVertex v, final boolean isRootOfDiamond) { final SeqGraph.MergeDiamonds merger = graph.new MergeDiamonds(); merger.setDontModifyGraphEvenIfPossible(); @@ -311,7 +315,7 @@ public class SeqGraphUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "MergingData", enabled = true) + @Test(dataProvider = "MergingData", enabled = !DEBUG) public void testMerging(final SeqGraph graph, final SeqGraph expected) { final SeqGraph merged = (SeqGraph)graph.clone(); merged.simplifyGraph(1); @@ -333,7 +337,7 @@ public class SeqGraphUnitTest extends BaseTest { // // Should become A -> ACT -> C [ref and non-ref edges] // - @Test + @Test(enabled = !DEBUG) public void testBubbleSameBasesWithRef() { final SeqGraph graph = new SeqGraph(); final SeqVertex top = new SeqVertex("A"); @@ -351,4 +355,169 @@ public class SeqGraphUnitTest extends BaseTest { actual.simplifyGraph(); Assert.assertTrue(BaseGraph.graphEquals(actual, expected), "Wrong merging result after complete merging"); } + + @DataProvider(name = "LinearZipData") + public Object[][] makeLinearZipData() throws Exception { + List tests = new ArrayList(); + + SeqGraph graph = new SeqGraph(); + SeqGraph expected = new SeqGraph(); + + // empty graph => empty graph + tests.add(new Object[]{graph.clone(), expected.clone()}); + + SeqVertex a1 = new SeqVertex("A"); + SeqVertex c1 = new SeqVertex("C"); + SeqVertex ac1 = new SeqVertex("AC"); + + // just a single vertex + graph.addVertices(a1, c1); + expected.addVertices(a1, c1); + + tests.add(new Object[]{graph.clone(), expected.clone()}); + + graph.addEdges(a1, c1); + expected = new SeqGraph(); + expected.addVertices(ac1); + tests.add(new Object[]{graph.clone(), expected.clone()}); + + // three long chain merged corrected + SeqVertex g1 = new SeqVertex("G"); + graph.addVertices(g1); + graph.addEdges(c1, g1); + expected = new SeqGraph(); + expected.addVertex(new SeqVertex("ACG")); + tests.add(new Object[]{graph.clone(), expected.clone()}); + + 
// adding something that isn't connected isn't a problem + SeqVertex t1 = new SeqVertex("T"); + graph.addVertices(t1); + expected = new SeqGraph(); + expected.addVertices(new SeqVertex("ACG"), new SeqVertex("T")); + tests.add(new Object[]{graph.clone(), expected.clone()}); + + // splitting chain with branch produces the correct zipped subgraphs + final SeqVertex a2 = new SeqVertex("A"); + final SeqVertex c2 = new SeqVertex("C"); + graph = new SeqGraph(); + graph.addVertices(a1, c1, g1, t1, a2, c2); + graph.addEdges(a1, c1, g1, t1, a2); + graph.addEdges(g1, c2); + expected = new SeqGraph(); + SeqVertex acg = new SeqVertex("ACG"); + SeqVertex ta = new SeqVertex("TA"); + expected.addVertices(acg, ta, c2); + expected.addEdges(acg, ta); + expected.addEdges(acg, c2); + tests.add(new Object[]{graph.clone(), expected.clone()}); + + // Can merge chains with loops in them + { + graph = new SeqGraph(); + graph.addVertices(a1, c1, g1); + graph.addEdges(a1, c1, g1); + graph.addEdges(a1, a1); + expected = new SeqGraph(); + + SeqVertex ac = new SeqVertex("AC"); + SeqVertex cg = new SeqVertex("CG"); + + expected.addVertices(a1, cg); + expected.addEdges(a1, cg); + expected.addEdges(a1, a1); + tests.add(new Object[]{graph.clone(), expected.clone()}); + + graph.removeEdge(a1, a1); + graph.addEdges(c1, c1); + tests.add(new Object[]{graph.clone(), graph.clone()}); + + graph.removeEdge(c1, c1); + graph.addEdges(g1, g1); + expected = new SeqGraph(); + expected.addVertices(ac, g1); + expected.addEdges(ac, g1, g1); + tests.add(new Object[]{graph.clone(), expected.clone()}); + } + + // check building n element long chains + { + final List bases = Arrays.asList("A", "C", "G", "T", "TT", "GG", "CC", "AA"); + for ( final int len : Arrays.asList(1, 2, 10, 100, 1000)) { + graph = new SeqGraph(); + expected = new SeqGraph(); + SeqVertex last = null; + String expectedBases = ""; + for ( int i = 0; i < len; i++ ) { + final String seq = bases.get(i % bases.size()); + expectedBases += seq; + 
SeqVertex a = new SeqVertex(seq); + graph.addVertex(a); + if ( last != null ) graph.addEdge(last, a); + last = a; + } + expected.addVertex(new SeqVertex(expectedBases)); + tests.add(new Object[]{graph.clone(), expected.clone()}); + } + } + + // check that edge connections are properly maintained + { + int edgeWeight = 1; + for ( final int nIncoming : Arrays.asList(0, 2, 5, 10) ) { + for ( final int nOutgoing : Arrays.asList(0, 2, 5, 10) ) { + graph = new SeqGraph(); + expected = new SeqGraph(); + + graph.addVertices(a1, c1, g1); + graph.addEdges(a1, c1, g1); + expected.addVertex(acg); + + for ( final SeqVertex v : makeVertices(nIncoming) ) { + final BaseEdge e = new BaseEdge(false, edgeWeight++); + graph.addVertices(v); + graph.addEdge(v, a1, e); + expected.addVertex(v); + expected.addEdge(v, acg, e); + } + + for ( final SeqVertex v : makeVertices(nOutgoing) ) { + final BaseEdge e = new BaseEdge(false, edgeWeight++); + graph.addVertices(v); + graph.addEdge(g1, v, e); + expected.addVertex(v); + expected.addEdge(acg, v, e); + } + + tests.add(new Object[]{graph, expected}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private List makeVertices(final int n) { + final List vs = new LinkedList(); + final List bases = Arrays.asList("A", "C", "G", "T", "TT", "GG", "CC", "AA"); + + for ( int i = 0; i < n; i++ ) + vs.add(new SeqVertex(bases.get(i % bases.size()))); + return vs; + } + + @Test(dataProvider = "LinearZipData", enabled = true) + public void testLinearZip(final SeqGraph graph, final SeqGraph expected) { + final SeqGraph merged = (SeqGraph)graph.clone(); + merged.zipLinearChains(); + try { + Assert.assertTrue(SeqGraph.graphEquals(merged, expected)); + } catch (AssertionError e) { + if ( ! 
SeqGraph.graphEquals(merged, expected) ) { + graph.printGraph(new File("graph.dot"), 0); + merged.printGraph(new File("merged.dot"), 0); + expected.printGraph(new File("expected.dot"), 0); + } + throw e; + } + } } From e9169987843d69fbca6b986f0d342fdb654f43a1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 3 Apr 2013 10:39:45 -0400 Subject: [PATCH 147/226] Bugfix for head and tail merging code in SeqGraph -- The previous version of the head merging (and tail merging to a lesser degree) would inappropriately merge source and sinks without sufficient evidence to do so. This would introduce large deletion events at the start / end of the assemblies. Refcatored code to require 20 bp of overlap in the head or tail nodes, as well as unit tested functions to support this. --- .../haplotypecaller/graphs/SeqGraph.java | 33 ++++++++++---- .../graphs/SharedVertexSequenceSplitter.java | 43 ++++++++++++++++--- .../SharedVertexSequenceSplitterUnitTest.java | 41 ++++++++++++++++++ 3 files changed, 102 insertions(+), 15 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index d08c2f211..4cc7aae2a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -63,7 +63,14 @@ import java.util.Set; */ public final class SeqGraph extends BaseGraph { private final static boolean PRINT_SIMPLIFY_GRAPHS = false; - private final static int MIN_SUFFIX_TO_MERGE_TAILS = 5; + + /** + * The minimum number of common bp from the prefix (head merging) or suffix (tail merging) + * required before we'll merge in such configurations. 
A large value here is critical to avoid + * merging inappropriate head or tail nodes, which introduces large insertion / deletion events + * as the merge operation creates a link among the non-linked sink / source vertices + */ + private final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10; /** * Construct an empty SeqGraph @@ -103,15 +110,15 @@ public final class SeqGraph extends BaseGraph { //logger.info("simplifyGraph iteration " + i); // iterate until we haven't don't anything useful didSomeWork = false; - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".1.dot"), 0); didSomeWork |= new MergeDiamonds().transformUntilComplete(); didSomeWork |= new MergeTails().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".diamonds_and_tails.dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".2.diamonds_and_tails.dot"), 0); didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".split_suffix.dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".3.split_suffix.dot"), 0); didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".merge_suffix.dot"), 0); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." 
+ i + ".4.merge_suffix.dot"), 0); didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); didSomeWork |= zipLinearChains(); @@ -375,7 +382,10 @@ public final class SeqGraph extends BaseGraph { // actually do the merging, returning true if at least 1 base was successfully split final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, middles); - return splitter.splitAndUpdate(top, bottom, 1); + if (splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(1)) + return splitter.splitAndUpdate(top, bottom); + else + return false; } } @@ -408,7 +418,11 @@ public final class SeqGraph extends BaseGraph { if ( dontModifyGraphEvenIfPossible() ) return true; final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, tails); - return splitter.splitAndUpdate(top, null, MIN_SUFFIX_TO_MERGE_TAILS); + + if (splitter.meetsMinMergableSequenceForSuffix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES)) + return splitter.splitAndUpdate(top, null); + else + return false; } } @@ -492,7 +506,10 @@ public final class SeqGraph extends BaseGraph { if ( dontModifyGraphEvenIfPossible() ) return true; final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming); - return splitter.splitAndUpdate(null, bottom, 1); + if (splitter.meetsMinMergableSequenceForPrefix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES)) + return splitter.splitAndUpdate(null, bottom); + else + return false; } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java index 9834653a6..ca7faa444 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java @@ -133,6 +133,14 @@ public class SharedVertexSequenceSplitter { suffixV = prefixAndSuffix.getSecond(); } + /** + * Given sequencing that are all equal, does this splitter make those into prefix or suffix nodes? + * @return true if we merge equal nodes into prefix nodes or suffix nodes + */ + protected static boolean prefersPrefixMerging() { + return true; + } + /** * Simple single-function interface to split and then update a graph * @@ -140,20 +148,41 @@ public class SharedVertexSequenceSplitter { * * @param top the top vertex, may be null * @param bottom the bottom vertex, may be null - * @param minCommonSequence the minimum prefix or suffix size necessary among the vertices to split up - * before we'll go ahead and actually do the splitting. Allows one to determine - * whether there's actually any useful splitting to do, as well as protect - * yourself against spurious splitting of nodes based on trivial amounts of overall * @return true if some useful splitting was done, false otherwise */ - public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom, final int minCommonSequence) { - if ( prefixV.length() < minCommonSequence && suffixV.length() < minCommonSequence ) - return false; + public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom) { split(); updateGraph(top, bottom); return true; } + /** + * Does either the common suffix or prefix have at least minCommonSequence bases in it? 
+ * @param minCommonSequence a minimum length of the common sequence, must be >= 0 + * @return true if either suffix or prefix length >= minCommonSequence + */ + public boolean meetsMinMergableSequenceForEitherPrefixOrSuffix(final int minCommonSequence) { + return meetsMinMergableSequenceForPrefix(minCommonSequence) || meetsMinMergableSequenceForSuffix(minCommonSequence); + } + + /** + * Does the common prefix have at least minCommonSequence bases in it? + * @param minCommonSequence a minimum length of the common sequence, must be >= 0 + * @return true if prefix length >= minCommonSequence + */ + public boolean meetsMinMergableSequenceForPrefix(final int minCommonSequence) { + return prefixV.length() >= minCommonSequence; + } + + /** + * Does the common suffix have at least minCommonSequence bases in it? + * @param minCommonSequence a minimum length of the common sequence, must be >= 0 + * @return true if suffix length >= minCommonSequence + */ + public boolean meetsMinMergableSequenceForSuffix(final int minCommonSequence) { + return suffixV.length() >= minCommonSequence; + } + /** * Actually do the splitting up of the vertices * diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index 77857c367..0930d497f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -250,4 +250,45 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { } } } + + @DataProvider(name = "MeetsMinSequenceData") + public Object[][] makeMeetsMinSequenceData() { + List tests = new ArrayList(); + + final boolean prefixBiased = 
SharedVertexSequenceSplitter.prefersPrefixMerging(); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 0, true, true}); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 1, prefixBiased, ! prefixBiased}); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 2, prefixBiased, ! prefixBiased}); + tests.add(new Object[]{Arrays.asList("AC", "AC"), 3, false, false}); + tests.add(new Object[]{Arrays.asList("A", "AC"), 1, true, false}); + tests.add(new Object[]{Arrays.asList("A", "AC"), 2, false, false}); + tests.add(new Object[]{Arrays.asList("AT", "AC"), 1, true, false}); + tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 1, true, false}); + tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 2, true, false}); + tests.add(new Object[]{Arrays.asList("AAT", "AAC"), 3, false, false}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 1, true, true}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 2, true, true}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 3, false, true}); + tests.add(new Object[]{Arrays.asList("AATCCC", "AACCCC"), 4, false, false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MeetsMinSequenceData") + public void testSplitterCompleteCycle(final List mids, final int minSeqLength, final boolean prefixMeets, final boolean suffixMeets) { + final SeqGraph graph = new SeqGraph(); + + final SeqVertex top = new SeqVertex("AAAAAAAA"); + final SeqVertex bot = new SeqVertex("GGGGGGGG"); + final List v = new ArrayList(); + for ( final String s : mids ) { v.add(new SeqVertex(s)); } + graph.addVertices(v.toArray(new SeqVertex[]{})); + graph.addVertices(top, bot); + for ( final SeqVertex vi : v ) { graph.addEdge(top, vi); graph.addEdge(vi, bot); } + + final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); + Assert.assertEquals(splitter.meetsMinMergableSequenceForPrefix(minSeqLength), prefixMeets, "Prefix failed"); + 
Assert.assertEquals(splitter.meetsMinMergableSequenceForSuffix(minSeqLength), suffixMeets, "Suffix failed"); + Assert.assertEquals(splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(minSeqLength), suffixMeets || prefixMeets, "Either prefix or suffix failed"); + } } From 4d389a823467e355d502ee77056ed4434a04e6e3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 31 Mar 2013 16:57:36 -0400 Subject: [PATCH 148/226] Optimizations for HC infrastructure -- outgoingVerticesOf and incomingVerticesOf return a list not a set now, as the corresponding values must be unique since our super directed graph doesn't allow multiple edges between vertices -- Make DeBruijnGraph, SeqGraph, SeqVertex, and DeBruijnVertex all final -- Cache HashCode calculation in BaseVertex -- Better docs before the pruneGraph call --- .../gatk/walkers/haplotypecaller/DeBruijnAssembler.java | 8 ++++++++ .../gatk/walkers/haplotypecaller/graphs/BaseVertex.java | 7 +++++-- .../walkers/haplotypecaller/graphs/DeBruijnGraph.java | 2 +- .../walkers/haplotypecaller/graphs/DeBruijnVertex.java | 2 +- .../gatk/walkers/haplotypecaller/graphs/SeqVertex.java | 2 +- .../haplotypecaller/graphs/SharedSequenceMerger.java | 2 +- .../walkers/haplotypecaller/graphs/BaseGraphUnitTest.java | 6 ++++-- .../walkers/haplotypecaller/graphs/SeqGraphUnitTest.java | 2 +- 8 files changed, 22 insertions(+), 9 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 1fd2b9c00..5d8113212 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -185,6 +185,14 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph(); if ( 
debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor); + // TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm + // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect + // TODO -- to anything from one that's actuall has good support along the chain but just happens + // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately + // TODO -- the pruning algorithm really should be an error correction algorithm that knows more + // TODO -- about the structure of the data and can differeniate between an infrequent path but + // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through) + // TODO -- from a error with lots of weight going along another similar path // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive seqGraph.zipLinearChains(); if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java index f50b4a155..65643a2cc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java @@ -58,6 +58,7 @@ import java.util.Arrays; */ public class BaseVertex { final byte[] sequence; + int cachedHashCode = -1; /** * Create a new sequence vertex with sequence @@ -128,8 +129,10 @@ public class BaseVertex { */ @Override public int hashCode() { - // TODO -- optimization, could compute upfront once and cached in debruijn graph - return Arrays.hashCode(sequence); + if ( cachedHashCode == -1 ) { + cachedHashCode = Arrays.hashCode(sequence); + } + 
return cachedHashCode; } @Override diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java index 109598029..66085fcad 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java @@ -59,7 +59,7 @@ import java.util.Map; * User: rpoplin * Date: 2/6/13 */ -public class DeBruijnGraph extends BaseGraph { +public final class DeBruijnGraph extends BaseGraph { /** * Create an empty DeBruijnGraph with default kmer size */ diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java index 4d9441efe..c240949d9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java @@ -54,7 +54,7 @@ import com.google.java.contract.Ensures; * User: ebanks, mdepristo * Date: Mar 23, 2011 */ -public class DeBruijnVertex extends BaseVertex { +public final class DeBruijnVertex extends BaseVertex { private final static byte[][] sufficesAsByteArray = new byte[256][]; static { for ( int i = 0; i < sufficesAsByteArray.length; i++ ) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java index cfc2abfdc..f192b54aa 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java @@ -70,7 +70,7 @@ 
import java.util.Arrays; * @author: depristo * @since 03/2013 */ -public class SeqVertex extends BaseVertex { +public final class SeqVertex extends BaseVertex { private static int idCounter = 0; public final int id; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java index 1c53f2332..28734e505 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java @@ -75,7 +75,7 @@ public class SharedSequenceMerger { if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex " + v); - final Set prevs = graph.incomingVerticesOf(v); + final List prevs = graph.incomingVerticesOf(v); if ( ! canMerge(graph, v, prevs) ) return false; else { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java index 9737f72f5..c829488ba 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java @@ -241,9 +241,11 @@ public class BaseGraphUnitTest extends BaseTest { graph.printGraph(tmp, 10); } - private void assertVertexSetEquals(final Set actual, final SeqVertex ... expected) { + private void assertVertexSetEquals(final Collection actual, final SeqVertex ... 
expected) { + final Set actualSet = new HashSet(actual); + Assert.assertEquals(actualSet.size(), actual.size(), "Duplicate elements found in vertex list"); final Set expectedSet = expected == null ? Collections.emptySet() : new HashSet(Arrays.asList(expected)); - Assert.assertEquals(actual, expectedSet); + Assert.assertEquals(actualSet, expectedSet); } @Test(enabled = true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index 698b83199..ca43ced69 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -58,7 +58,7 @@ import java.util.LinkedList; import java.util.List; public class SeqGraphUnitTest extends BaseTest { - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; private class MergeNodesWithNoVariationTestProvider extends TestDataProvider { public byte[] sequence; From af593094a2ddcb598b00e649d90aa95a4c500df5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 30 Mar 2013 14:22:45 -0400 Subject: [PATCH 149/226] Major improvements to HC that trims down active regions before genotyping -- Trims down active regions and associated reads and haplotypes to a smaller interval based on the events actually in the haplotypes within the original active region (without extension). Radically speeds up calculations when using large active region extensions. The ActiveRegion.trim algorithm does the best job it can of trimming an active region down to a requested interval while ensuring the resulting active region has a region (and extension) no bigger than the original while spanning as much of the requested extend as possible. 
The trimming results in an active region that is a subset of the previous active region based on the position and types of variants found among the haplotypes -- Retire error corrector, archive old code and repurpose subsystem into a general kmer counter. The previous error corrector was just broken (conceptually) and was disabled by default in the engine. Now turning on error correction throws a UserException. Old part of the error corrector that counts kmers was extracted and put into KMerCounter.java -- Add final simplify graph call after we prune away the non-reference paths in DeBruijnAssembler --- .../haplotypecaller/DeBruijnAssembler.java | 52 ++--- .../haplotypecaller/GenotypingEngine.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 217 +++++++++++++++--- ...erErrorCorrector.java => KMerCounter.java} | 215 ++--------------- .../haplotypecaller/graphs/DeBruijnGraph.java | 1 - .../haplotypecaller/KMerCounterUnitTest.java | 84 +++++++ .../KMerErrorCorrectorUnitTest.java | 66 ------ .../utils/activeregion/ActiveRegion.java | 61 ++++- .../sting/utils/haplotype/EventMap.java | 23 ++ .../sting/utils/haplotype/Haplotype.java | 46 ++++ .../sting/utils/sam/AlignmentUtils.java | 64 ++++-- .../activeregion/ActiveRegionUnitTest.java | 73 +++++- .../utils/haplotype/HaplotypeUnitTest.java | 71 +++++- .../utils/sam/AlignmentUtilsUnitTest.java | 85 +++++++ 14 files changed, 703 insertions(+), 357 deletions(-) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{KMerErrorCorrector.java => KMerCounter.java} (50%) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java delete mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 5d8113212..40a6a79e0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -55,6 +55,7 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SWPairwiseAlignment; @@ -161,8 +162,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); if ( shouldErrorCorrectKmers() ) { - graph = errorCorrect(graph); - if ( debugGraphTransformations ) graph.printGraph(new File("errorCorrected.dot"), pruneFactor); + throw new UserException("Error correction no longer supported because of the " + + "incredibly naive way this was implemented. The command line argument remains because some" + + " future subsystem will actually go and error correct the reads"); } final SeqGraph seqGraph = toSeqGraph(graph); @@ -214,6 +216,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return null; seqGraph.removePathsNotConnectedToRef(); + seqGraph.simplifyGraph(); + if ( seqGraph.vertexSet().size() == 1 ) { + // we've perfectly assembled into a single reference haplotype, add an empty seq vertex to stop
+ // TODO -- ref properties should really be on the vertices, not the graph itself + final SeqVertex complete = seqGraph.vertexSet().iterator().next(); + final SeqVertex dummy = new SeqVertex(""); + seqGraph.addVertex(dummy); + seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); + } if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor); return seqGraph; @@ -332,39 +344,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { return true; } - /** - * Error correct the kmers in this graph, returning a new graph built from those error corrected kmers - * @return an error corrected version of this (freshly allocated graph) or simply this graph if for some reason - * we cannot actually do the error correction - */ - public DeBruijnGraph errorCorrect(final DeBruijnGraph graph) { - final KMerErrorCorrector corrector = new KMerErrorCorrector(graph.getKmerSize(), 1, 1, 5); // TODO -- should be static variables - - for( final BaseEdge e : graph.edgeSet() ) { - for ( final byte[] kmer : Arrays.asList(graph.getEdgeSource(e).getSequence(), graph.getEdgeTarget(e).getSequence())) { - // TODO -- need a cleaner way to deal with the ref weight - corrector.addKmer(kmer, e.isRef() ? 
1000 : e.getMultiplicity()); - } - } - - if ( corrector.computeErrorCorrectionMap() ) { - final DeBruijnGraph correctedGraph = new DeBruijnGraph(graph.getKmerSize()); - - for( final BaseEdge e : graph.edgeSet() ) { - final byte[] source = corrector.getErrorCorrectedKmer(graph.getEdgeSource(e).getSequence()); - final byte[] target = corrector.getErrorCorrectedKmer(graph.getEdgeTarget(e).getSequence()); - if ( source != null && target != null ) { - correctedGraph.addKmersToGraph(source, target, e.isRef(), e.getMultiplicity()); - } - } - - return correctedGraph; - } else { - // the error correction wasn't possible, simply return this graph - return graph; - } - } - protected void printGraphs(final List graphs) { final int writeFirstGraphWithSizeSmallerThan = 50; @@ -461,6 +440,9 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } } + // add genome locs to the haplotypes + for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow); + if ( returnHaplotypes.size() < returnHaplotypes.size() ) logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 7cdc57464..abd502c2b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -153,7 +153,7 @@ public class GenotypingEngine { if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be 
non-empty and non-null, got "+haplotypeReadMap); if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref); - if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); + if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype); if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 53fffec61..bce179ee1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -77,6 +77,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.EventMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.haplotype.HaplotypeBaseComparator; import org.broadinstitute.sting.utils.haplotype.LDMerger; @@ -300,6 +301,7 @@ public class HaplotypeCaller extends ActiveRegionWalker 
implem @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) protected int debugGraphTransformations = -1; + // TODO -- not currently useful @Hidden @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) protected boolean useLowQualityBasesForAssembly = false; @@ -308,6 +310,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="useNewLDMerger", shortName="useNewLDMerger", doc="If specified, we will include low quality bases when doing the assembly", required = false) protected boolean useNewLDMerger = false; + @Hidden + @Argument(fullName="trimActiveRegions", shortName="trimActiveRegions", doc="If specified, we will trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) + protected boolean trimActiveRegions = false; + // the UG engines private UnifiedGenotyperEngine UG_engine = null; private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; @@ -329,6 +335,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // reference base padding size private static final int REFERENCE_PADDING = 500; + // include at least this many bases around an event for calling it + private final static int PADDING_AROUND_SNPS_FOR_CALLING = 20; + private final static int PADDING_AROUND_OTHERS_FOR_CALLING = 150; + + // the maximum extent into the full active region extension that we're willing to go in genotyping our events + private final static int MAX_GENOTYPING_ACTIVE_REGION_EXTENSION = 25; + private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an 
argument @@ -490,7 +503,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final byte qual = p.getQual(); if( p.isDeletion() || qual > (byte) 18) { int AA = 0; final int AB = 1; int BB = 2; - if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { + if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { AA = 2; BB = 0; if( p.isNextToSoftClip() ) { @@ -521,58 +534,53 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- @Override - public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { + public Integer map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) { if ( justDetermineActiveRegions ) // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return 1; - final List activeAllelesToGenotype = new ArrayList(); + if( !originalActiveRegion.isActive() ) { return 0; } // Not active so nothing to do! + final List activeAllelesToGenotype = new ArrayList(); if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { for( final VariantContext vc : allelesToGenotype ) { - if( activeRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) { + if( originalActiveRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) { activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode } } allelesToGenotype.removeAll( activeAllelesToGenotype ); + // No alleles found in this region so nothing to do! 
+ if ( activeAllelesToGenotype.isEmpty() ) { return 0; } + } else { + if( originalActiveRegion.size() == 0 ) { return 0; } // No reads here so nothing to do! } - if( !activeRegion.isActive() ) { return 0; } // Not active so nothing to do! - if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! - if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! + // run the local assembler, getting back a collection of information on how we should proceed + final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); - finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails - - final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region - final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); - final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); - - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); - if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! - - final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria - if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! 
- - // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM - Collections.sort( haplotypes, new HaplotypeBaseComparator() ); - - if (dontGenotype) - return 1; + // abort early if something is out of the acceptable range + if( assemblyResult.haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! + if( assemblyResult.regionForGenotyping.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! + if (dontGenotype) return 1; // user requested we not proceed // evaluate each sample's reads against all haplotypes - final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) ); + //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); + final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) ); + + // filter out reads from genotyping which fail mapping quality based criteria + final List filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final List bestHaplotypes = selectBestHaplotypesForGenotyping(haplotypes, stratifiedReadMap); + final List bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap); final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, bestHaplotypes, stratifiedReadMap, perSampleFilteredReadList, - fullReferenceWithPadding, - paddedReferenceLoc, - activeRegion.getLocation(), + assemblyResult.fullReferenceWithPadding, + assemblyResult.paddedReferenceLoc, + 
assemblyResult.regionForGenotyping.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ); @@ -583,7 +591,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } if ( bamWriter != null ) { - haplotypeBAMWriter.writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, bestHaplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); + haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc, + bestHaplotypes, + calledHaplotypes.getCalledHaplotypes(), + stratifiedReadMap); } if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } @@ -591,6 +602,152 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return 1; // One active region was processed during this map call } + private final static class AssemblyResult { + final List haplotypes; + final ActiveRegion regionForGenotyping; + final byte[] fullReferenceWithPadding; + final GenomeLoc paddedReferenceLoc; + + private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc) { + this.haplotypes = haplotypes; + this.regionForGenotyping = regionForGenotyping; + this.fullReferenceWithPadding = fullReferenceWithPadding; + this.paddedReferenceLoc = paddedReferenceLoc; + } + } + + /** + * High-level function that runs the assembler on the active region reads, + * returning a data structure with the resulting information needed + * for further HC steps + * + * @param activeRegion the region we should assemble + * @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty) + * @return the AssemblyResult describing how to proceed with genotyping + */ + protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { + // Create the reference haplotype which is the bases from the reference that make up the 
active region + finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails + + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); + final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); + final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); + + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); + + if ( trimActiveRegions ) { + return trimActiveRegion(activeRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc); + } else { + // we don't want to or cannot create a trimmed active region, so go ahead and use the old one + return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc); + } + } + + /** + * Trim down the active region to just enough to properly genotype the events among the haplotypes + * + * This function merely creates the region, but it doesn't populate the reads back into the region + * + * @param region our full active region + * @param haplotypes the list of haplotypes we've created from assembly + * @param ref the reference bases over the full padded location + * @param refLoc the span of the reference bases + * @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully + */ + private ActiveRegion createTrimmedRegion(final ActiveRegion region, final List haplotypes, final byte[] ref, final GenomeLoc refLoc) { + EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG); + final TreeSet allContexts = EventMap.getAllVariantContexts(haplotypes); + final GenomeLocParser parser = getToolkit().getGenomeLocParser(); + + if ( allContexts.isEmpty() ) // no variants, so just return the current region + return null; + + final List withinActiveRegion = new 
LinkedList(); + int pad = PADDING_AROUND_SNPS_FOR_CALLING; + GenomeLoc trimLoc = null; + for ( final VariantContext vc : allContexts ) { + final GenomeLoc vcLoc = parser.createGenomeLoc(vc); + if ( region.getLocation().overlapsP(vcLoc) ) { + if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding + pad = PADDING_AROUND_OTHERS_FOR_CALLING; + trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc); + withinActiveRegion.add(vc); + } + } + + // we don't actually have anything in the region after removing variants that don't overlap the region's full location + if ( trimLoc == null ) return null; + + final GenomeLoc maxSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(region.getLocation(), MAX_GENOTYPING_ACTIVE_REGION_EXTENSION); + final GenomeLoc idealSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(trimLoc, pad); + final GenomeLoc finalSpan = maxSpan.intersect(idealSpan); + + final ActiveRegion trimmedRegion = region.trim(finalSpan); + if ( DEBUG ) { + logger.info("events : " + withinActiveRegion); + logger.info("trimLoc : " + trimLoc); + logger.info("pad : " + pad); + logger.info("idealSpan : " + idealSpan); + logger.info("maxSpan : " + maxSpan); + logger.info("finalSpan : " + finalSpan); + logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size()); + } + return trimmedRegion; + } + + /** + * Trim down the active region to just enough to properly genotype the events among the haplotypes + * + * @param originalActiveRegion our full active region + * @param haplotypes the list of haplotypes we've created from assembly + * @param fullReferenceWithPadding the reference bases over the full padded location + * @param paddedReferenceLoc the span of the reference bases + * @return an AssemblyResult containing the trimmed active region with all of the reads we should use + * trimmed down as well, and a revised set of haplotypes. 
If trimming failed this function + * may choose to use the originalActiveRegion without modification + */ + private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion, + final List haplotypes, + final byte[] fullReferenceWithPadding, + final GenomeLoc paddedReferenceLoc) { + final ActiveRegion trimmedActiveRegion = createTrimmedRegion(originalActiveRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc); + + if ( trimmedActiveRegion == null ) + return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc); + + // trim down the haplotypes + final Set haplotypeSet = new HashSet(haplotypes.size()); + for ( final Haplotype h : haplotypes ) { + final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc()); + if ( trimmed != null ) { + haplotypeSet.add(trimmed); + } else if ( DEBUG ) { + logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + " because it starts with or ends with an insertion or deletion when trimmed to " + trimmedActiveRegion.getExtendedLoc()); + } + } + + // create the final list of trimmed haplotypes + final List trimmedHaplotypes = new ArrayList(haplotypeSet); + + // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM + Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() ); + + if ( DEBUG ) logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); + + // trim down the reads and add them to the trimmed active region + final List trimmedReads = new ArrayList(originalActiveRegion.getReads().size()); + for( final GATKSAMRecord read : originalActiveRegion.getReads() ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() ); + if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 
) { + trimmedReads.add(clippedRead); + } + } + trimmedActiveRegion.clearReads(); + trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads)); + + return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc); + } + /** * Select the best N haplotypes according to their likelihoods, if appropriate * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java similarity index 50% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java index b051e5411..1f0903753 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrector.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java @@ -51,110 +51,31 @@ import org.apache.log4j.Logger; import java.util.*; /** - * generic utility function that error corrects kmers based on counts - * - * This class provides a generic facility for remapping kmers (byte[] of constant size) - * that occur infrequently to those that occur frequently, based on their simple edit distance - * as measured by mismatches. - * - * The overall workflow of using this class is simple. First, you create the class with - * parameters determining how the error correction should proceed. Next, you provide all - * of the kmers you see in your data. Once all kmers have been added, you call computeErrorCorrectionMap - * to tell this class that all kmers have been added and its time to determine error correcting - * mapping from observed kmers to corrected kmers. 
This correction looks for low-count (as determined - * by maxCountToCorrect) kmers and chooses the best kmer (minimizing mismatches) among those - * with at least minCountOfKmerToBeCorrection occurrences to error correct the kmer to. If - * there is no kmer with less than maxMismatchesToCorrect then the kmer will be mapped to - * null, indicating the kmer should not be used. - * - * TODO -- for ease of implementation this class uses strings instead of byte[] as those cannot - * TODO -- be added to hashmaps (more specifically, those don't implement .equals). A more efficient - * TODO -- version would use the byte[] directly - * - * TODO -- this is just not the right way to implement error correction in the graph. Basically, the - * right way to think about this is error correcting reads: - * - * * - * ACTGAT - * ACT - * CTG - * TGA - * GAT - * - * Now suppose the G is an error. What you are doing is asking for each 3mer in the read whether it's high quality - * or not. Suppose the answer is - * - * * - * ACTGAT - * ACT -- yes - * CTG -- no [CTG is unusual] - * TGA -- no [TGA is unusual] - * GAT -- yes [maybe GAT is just common, even through its an error] - * - * As we do this process it's clear how we can figure out which positions in the read likely harbor errors, and - * then go search around those bases in the read in an attempt to fix the read. We don't have to compute for - * every bad kmer it's best match, as that's just not the problem we are thinking looking to solve. We are actually - * looking for a change to a read such that all spanning kmers are well-supported. This class is being disabled - * until we figure implement this change. + * generic utility class that counts kmers * + * Basically you add kmers to the counter, and it tells you how many occurrences of each kmer it's seen. 
* * User: depristo * Date: 3/8/13 * Time: 1:16 PM */ -public class KMerErrorCorrector { - private final static Logger logger = Logger.getLogger(KMerErrorCorrector.class); - - /** - * The maximum number of bad kmer -> good kmer correction operations we'll consider doing before - * aborting for efficiency reasons. Basically, the current algorithm sucks, and is O(n^2), and - * so we cannot simply error correct 10K bad kmers against a db of 100K kmers if we ever want - * to finish running in a reasonable amount of time. This isn't worth fixing because fundamentally - * the entire error correction algorithm is just not right (i.e., it's correct but not ideal conceptually - * so we'll just fix the conceptual problem than the performance issue). - */ - private final static int MAX_CORRECTION_OPS_TO_ALLOW = 5000 * 1000; +public class KMerCounter { + private final static Logger logger = Logger.getLogger(KMerCounter.class); /** * A map of for each kmer to its num occurrences in addKmers */ - Map countsByKMer = new HashMap(); + private final Map countsByKMer = new HashMap(); + private final int kmerLength; /** - * A map from raw kmer -> error corrected kmer - */ - Map rawToErrorCorrectedMap = null; - - final int kmerLength; - final int maxCountToCorrect; - final int maxMismatchesToCorrect; - final int minCountOfKmerToBeCorrection; - - /** - * Create a new kmer corrector + * Create a new kmer counter * * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1 - * @param maxCountToCorrect kmers with < maxCountToCorrect will try to be error corrected to another kmer, must be >= 0 - * @param maxMismatchesToCorrect the maximum number of mismatches between a to-be-corrected kmer and its - * best match that we attempt to error correct. If no sufficiently similar - * kmer exists, it will be remapped to null. Must be >= 1 - * @param minCountOfKmerToBeCorrection the minimum count of a kmer to be considered a target for correction. 
- * That is, kmers that need correction will only be matched with kmers - * with at least minCountOfKmerToBeCorrection occurrences. Must be >= 1 */ - public KMerErrorCorrector(final int kmerLength, - final int maxCountToCorrect, - final int maxMismatchesToCorrect, - final int minCountOfKmerToBeCorrection) { + public KMerCounter(final int kmerLength) { if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength); - if ( maxCountToCorrect < 0 ) throw new IllegalArgumentException("maxCountToCorrect must be >= 0 but got " + maxCountToCorrect); - if ( maxMismatchesToCorrect < 1 ) throw new IllegalArgumentException("maxMismatchesToCorrect must be >= 1 but got " + maxMismatchesToCorrect); - if ( minCountOfKmerToBeCorrection < 1 ) throw new IllegalArgumentException("minCountOfKmerToBeCorrection must be >= 1 but got " + minCountOfKmerToBeCorrection); - this.kmerLength = kmerLength; - this.maxCountToCorrect = maxCountToCorrect; - this.maxMismatchesToCorrect = maxMismatchesToCorrect; - this.minCountOfKmerToBeCorrection = minCountOfKmerToBeCorrection; } /** @@ -165,7 +86,17 @@ public class KMerErrorCorrector { protected void addKmers(final String ... kmers) { for ( final String kmer : kmers ) addKmer(kmer, 1); - computeErrorCorrectionMap(); + } + + /** + * Get the count of kmer in this kmer counter + * @param kmer a non-null counter to get + * @return a positive integer + */ + public int getKmerCount(final byte[] kmer) { + if ( kmer == null ) throw new IllegalArgumentException("kmer cannot be null"); + final CountedKmer counted = countsByKMer.get(new String(kmer)); + return counted == null ? 0 : counted.count; } /** @@ -178,68 +109,9 @@ public class KMerErrorCorrector { addKmer(new String(rawKmer), kmerCount); } - - /** - * Get the error corrected kmer for rawKmer - * - * @param rawKmer a kmer that was already added that we want to get an error corrected version for - * @return an error corrected kmer to use instead of rawKmer. 
May be == rawKmer if no error correction - * is not necessary. May be null, indicating the rawKmer shouldn't be used at all - */ - public byte[] getErrorCorrectedKmer(final byte[] rawKmer) { - final String result = getErrorCorrectedKmer(new String(rawKmer)); - return result == null ? null : result.getBytes(); - } - - /** - * Indicate that no more kmers will be added to the kmer error corrector, so that the - * error correction data structure should be computed from the added kmers. Enabled calls - * to getErrorCorrectedKmer, and disable calls to addKmer. - * - * @return true if the error correction map could actually be computed, false if for any reason - * (efficiency, memory, we're out to lunch) a correction map couldn't be created. - */ - public boolean computeErrorCorrectionMap() { - if ( countsByKMer == null ) - throw new IllegalStateException("computeErrorCorrectionMap can only be called once"); - - final LinkedList needsCorrection = new LinkedList(); - final List goodKmers = new ArrayList(countsByKMer.size()); - - rawToErrorCorrectedMap = new HashMap(countsByKMer.size()); - for ( final CountedKmer countedKmer: countsByKMer.values() ) { - if ( countedKmer.count <= maxCountToCorrect ) - needsCorrection.add(countedKmer); - else { - // todo -- optimization could make not in map mean == - rawToErrorCorrectedMap.put(countedKmer.kmer, countedKmer.kmer); - - // only allow corrections to kmers with at least this count - if ( countedKmer.count >= minCountOfKmerToBeCorrection ) - goodKmers.add(countedKmer); - } - } - - // cleanup memory -- we don't need the counts for each kmer any longer - countsByKMer = null; - - if ( goodKmers.size() * needsCorrection.size() > MAX_CORRECTION_OPS_TO_ALLOW ) - return false; - else { - Collections.sort(goodKmers); - for ( final CountedKmer toCorrect : needsCorrection ) { - final String corrected = findClosestKMer(toCorrect, goodKmers); - rawToErrorCorrectedMap.put(toCorrect.kmer, corrected); - } - - return true; - } - } - protected 
void addKmer(final String rawKmer, final int kmerCount) { if ( rawKmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + rawKmer + " expected size " + kmerLength); if ( kmerCount < 0 ) throw new IllegalArgumentException("bad kmerCount " + kmerCount); - if ( countsByKMer == null ) throw new IllegalStateException("Cannot add kmers to an already finalized error corrector"); CountedKmer countFromMap = countsByKMer.get(rawKmer); if ( countFromMap == null ) { @@ -249,55 +121,10 @@ public class KMerErrorCorrector { countFromMap.count += kmerCount; } - protected String findClosestKMer(final CountedKmer kmer, final Collection goodKmers) { - String bestMatch = null; - int minMismatches = Integer.MAX_VALUE; - - for ( final CountedKmer goodKmer : goodKmers ) { - final int mismatches = countMismatches(kmer.kmer, goodKmer.kmer, minMismatches); - if ( mismatches < minMismatches ) { - minMismatches = mismatches; - bestMatch = goodKmer.kmer; - } - - // if we find an edit-distance 1 result, abort early, as we know there can be no edit distance 0 results - if ( mismatches == 1 ) - break; - } - - return minMismatches > maxMismatchesToCorrect ? null : bestMatch; - } - - protected int countMismatches(final String one, final String two, final int currentBest) { - int mismatches = 0; - for ( int i = 0; i < one.length(); i++ ) { - mismatches += one.charAt(i) == two.charAt(i) ? 
0 : 1; - if ( mismatches > currentBest ) - break; - if ( mismatches > maxMismatchesToCorrect ) - return Integer.MAX_VALUE; - } - return mismatches; - } - - protected String getErrorCorrectedKmer(final String rawKmer) { - if ( rawToErrorCorrectedMap == null ) throw new IllegalStateException("Cannot get error corrected kmers until after computeErrorCorrectionMap has been called"); - if ( rawKmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + rawKmer + " expected size " + kmerLength); - return rawToErrorCorrectedMap.get(rawKmer); - } - @Override public String toString() { - final StringBuilder b = new StringBuilder("KMerErrorCorrector{"); - if ( rawToErrorCorrectedMap == null ) { - b.append("counting ").append(countsByKMer.size()).append(" distinct kmers"); - } else { - for ( Map.Entry toCorrect : rawToErrorCorrectedMap.entrySet() ) { - final boolean correcting = ! toCorrect.getKey().equals(toCorrect.getValue()); - if ( correcting ) - b.append(String.format("%n\tCorrecting %s -> %s", toCorrect.getKey(), toCorrect.getValue())); - } - } + final StringBuilder b = new StringBuilder("KMerCounter{"); + b.append("counting ").append(countsByKMer.size()).append(" distinct kmers"); b.append("\n}"); return b.toString(); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java index 66085fcad..c11841dac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerErrorCorrector; import java.util.Arrays; import java.util.HashMap; diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java new file mode 100644 index 000000000..56197047b --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java @@ -0,0 +1,84 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class KMerCounterUnitTest extends BaseTest { + @Test + public void testMyData() { + final KMerCounter counter = new KMerCounter(3); + + Assert.assertNotNull(counter.toString()); + + counter.addKmers( + "ATG", "ATG", "ATG", "ATG", + "ACC", "ACC", "ACC", + "AAA", "AAA", + "CTG", + "NNA", + "CCC" + ); + + testCounting(counter, "ATG", 4); + testCounting(counter, "ACC", 3); + testCounting(counter, "AAA", 2); + testCounting(counter, "CTG", 1); + testCounting(counter, "NNA", 1); + testCounting(counter, "CCC", 1); + testCounting(counter, "NNN", 0); + testCounting(counter, "NNC", 0); + + Assert.assertNotNull(counter.toString()); + } + + private void testCounting(final KMerCounter counter, final String in, final int expectedCount) { + Assert.assertEquals(counter.getKmerCount(in.getBytes()), expectedCount); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java deleted file mode 100644 index f8a540b70..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerErrorCorrectorUnitTest.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission 
notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; - -public class KMerErrorCorrectorUnitTest extends BaseTest { - @Test - public void testMyData() { - final KMerErrorCorrector corrector = new KMerErrorCorrector(3, 1, 2, 2); - - Assert.assertNotNull(corrector.toString()); - - corrector.addKmers( - "ATG", "ATG", "ATG", "ATG", - "ACC", "ACC", "ACC", - "AAA", "AAA", - "CTG", // -> ATG - "NNA", // -> AAA - "CCC", // => ACC - "NNN", // => null - "NNC" // => ACC [because of min count won't go to NNA] - ); - - testCorrection(corrector, "ATG", "ATG"); - testCorrection(corrector, "ACC", "ACC"); - testCorrection(corrector, "AAA", "AAA"); - testCorrection(corrector, "CTG", "ATG"); - testCorrection(corrector, "NNA", "AAA"); - testCorrection(corrector, "CCC", "ACC"); - testCorrection(corrector, "NNN", null); - testCorrection(corrector, "NNC", "ACC"); - - Assert.assertNotNull(corrector.toString()); - } - - private void testCorrection(final KMerErrorCorrector corrector, final String in, final String out) { - Assert.assertEquals(corrector.getErrorCorrectedKmer(in), out); - Assert.assertEquals(corrector.getErrorCorrectedKmer(in.getBytes()), out == null ? 
null : out.getBytes()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index b38d6575e..2f4c1b55d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -149,7 +149,7 @@ public class ActiveRegion implements HasGenomeLocation { @Override public String toString() { - return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size() + " "; + return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size(); } /** @@ -374,6 +374,8 @@ public class ActiveRegion implements HasGenomeLocation { * * Note that the returned list may be empty, if this active region doesn't overlap the set at all * + * Note that the resulting regions are all empty, regardless of whether the current active region has reads + * * @param intervals a non-null set of intervals that are allowed * @return an ordered list of active region where each interval is contained within intervals */ @@ -383,14 +385,59 @@ public class ActiveRegion implements HasGenomeLocation { final List clippedRegions = new LinkedList(); for ( final GenomeLoc overlapping : allOverlapping ) { - final GenomeLoc subLoc = getLocation().intersect(overlapping); - final int subStart = subLoc.getStart() - getLocation().getStart(); - final int subEnd = subStart + subLoc.size(); - final List subStates = supportingStates.isEmpty() ? 
supportingStates : supportingStates.subList(subStart, subEnd); - final ActiveRegion clipped = new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, extension ); - clippedRegions.add(clipped); + clippedRegions.add(trim(overlapping, extension)); } return clippedRegions; } + + /** + * Trim this active to just the newExtent, producing a new active region without any reads that has only + * the extent of newExtend intersected with the current extent + * @param newExtent the new extend of the active region we want + * @param newExtension the extension size we want for the newly trimmed active region + * @return a non-null, empty active region + */ + public ActiveRegion trim(final GenomeLoc newExtent, final int newExtension) { + if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); + + final GenomeLoc subLoc = getLocation().intersect(newExtent); + final int subStart = subLoc.getStart() - getLocation().getStart(); + final int subEnd = subStart + subLoc.size(); + final List subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd); + return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, newExtension ); + } + + /** + * Trim this active to no more than the newExtent, producing a new active region without any reads that + * attempts to provide the best possible representation of this active region covering the newExtent. + * + * The challenge here is that newExtent may (1) be larger than can be represented by this active region + * + its original extension and (2) the extension must be symmetric on both sides. This algorithm + * therefore determines how best to represent newExtent as a subset of the span of this + * region with a padding value that captures as much of the newExtent as possible. 
+ * + * For example, suppose this active region is + * + * Active: 100-200 with extension of 50, so that the true span is 50-250 + * NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225 + * + * Here we represent the active region as a active region from 150-200 with 25 bp of padding. + * + * The overall constraint is that the active region can never exceed the original active region, and + * the extension is chosen to maximize overlap with the desired region + * + * @param newExtent the new extend of the active region we want + * @return a non-null, empty active region + */ + public ActiveRegion trim(final GenomeLoc newExtent) { + if ( newExtent == null ) throw new IllegalArgumentException("Active region extent cannot be null"); + + final GenomeLoc subActive = getLocation().intersect(newExtent); + final int requiredOnRight = Math.max(newExtent.getStop() - subActive.getStop(), 0); + final int requiredOnLeft = Math.max(subActive.getStart() - newExtent.getStart(), 0); + final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension()); + + return new ActiveRegion( subActive, Collections.emptyList(), isActive, genomeLocParser, requiredExtension ); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java index 1d33e328d..ab5f23894 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java @@ -390,4 +390,27 @@ public class EventMap extends TreeMap { return startPosKeySet; } + + private static class VariantContextComparator implements Comparator { + @Override + public int compare(VariantContext vc1, VariantContext vc2) { + return vc1.getStart() - vc2.getStart(); + } + } + + /** + * Get all of the VariantContexts in the event maps for all haplotypes, sorted by their start position + * 
@param haplotypes the set of haplotypes to grab the VCs from + * @return a sorted set of variant contexts + */ + public static TreeSet getAllVariantContexts( final List haplotypes ) { + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet vcs = new TreeSet(new VariantContextComparator()); + + for( final Haplotype h : haplotypes ) { + vcs.addAll(h.getEventMap().getVariantContexts()); + } + + return vcs; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java index 081fd14e0..bacee7942 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -103,6 +103,40 @@ public class Haplotype extends Allele { this.genomeLocation = loc; } + /** + * Create a new Haplotype derived from this one that exactly spans the provided location + * + * Note that this haplotype must have a contain a genome loc for this operation to be successful. If no + * GenomeLoc is contained than @throws an IllegalStateException + * + * Also loc must be fully contained within this Haplotype's genomeLoc. If not an IllegalArgumentException is + * thrown. + * + * @param loc a location completely contained within this Haplotype's location + * @return a new Haplotype within only the bases spanning the provided location, or null for some reason the haplotype would be malformed if + */ + public Haplotype trim(final GenomeLoc loc) { + if ( loc == null ) throw new IllegalArgumentException("Loc cannot be null"); + if ( genomeLocation == null ) throw new IllegalStateException("Cannot trim a Haplotype without containing GenomeLoc"); + if ( ! genomeLocation.containsP(loc) ) throw new IllegalArgumentException("Can only trim a Haplotype to a containing span. 
My loc is " + genomeLocation + " but wanted trim to " + loc); + if ( getCigar() == null ) throw new IllegalArgumentException("Cannot trim haplotype without a cigar " + this); + + final int newStart = loc.getStart() - this.genomeLocation.getStart(); + final int newStop = newStart + loc.size() - 1; + final byte[] newBases = AlignmentUtils.getBasesCoveringRefInterval(newStart, newStop, getBases(), 0, getCigar()); + final Cigar newCigar = AlignmentUtils.trimCigarByReference(getCigar(), newStart, newStop); + + if ( newBases == null || AlignmentUtils.startsOrEndsWithInsertionOrDeletion(newCigar) ) + // we cannot meaningfully chop down the haplotype, so return null + return null; + + final Haplotype ret = new Haplotype(newBases, isReference()); + ret.setCigar(newCigar); + ret.setGenomeLocation(loc); + ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef()); + return ret; + } + @Override public boolean equals( Object h ) { return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases()); @@ -126,6 +160,18 @@ public class Haplotype extends Allele { return getDisplayString(); } + /** + * Get the span of this haplotype (may be null) + * @return a potentially null genome loc + */ + public GenomeLoc getGenomeLocation() { + return genomeLocation; + } + + public void setGenomeLocation(GenomeLoc genomeLocation) { + this.genomeLocation = genomeLocation; + } + public long getStartPosition() { return genomeLocation.getStart(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 9b25b00c6..2208302fb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -48,6 +48,24 @@ public final class AlignmentUtils { // cannot be instantiated private AlignmentUtils() { } + /** + * Does cigar start or end with a deletion operation? 
+ * + * @param cigar a non-null cigar to test + * @return true if the first or last operator of cigar is a D + */ + public static boolean startsOrEndsWithInsertionOrDeletion(final Cigar cigar) { + if ( cigar == null ) throw new IllegalArgumentException("Cigar cannot be null"); + + if ( cigar.isEmpty() ) + return false; + + final CigarOperator first = cigar.getCigarElement(0).getOperator(); + final CigarOperator last = cigar.getCigarElement(cigar.numCigarElements()-1).getOperator(); + return first == CigarOperator.D || first == CigarOperator.I || last == CigarOperator.D || last == CigarOperator.I; + } + + /** * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the * alignment of bases to the reference (basesToRefCigar) and the start offset of the bases on the reference @@ -55,6 +73,8 @@ public final class AlignmentUtils { * refStart and refEnd are 0 based offsets that we want to obtain. In the client code, if the reference * bases start at position X and you want Y -> Z, refStart should be Y - X and refEnd should be Z - X. * + * If refStart or refEnd would start or end the new bases within a deletion, this function will return null + * * @param bases * @param refStart * @param refEnd @@ -63,7 +83,7 @@ public final class AlignmentUtils { * 10 (meaning bases doesn't fully span the reference), which would be indicated by basesStartOnRef == 10. 
* It's not trivial to eliminate this parameter because it's tied up with the cigar * @param basesToRefCigar the cigar that maps the bases to the reference genome - * @return a non-null byte[] + * @return a byte[] containing the bases covering this interval, or null if we would start or end within a deletion */ public static byte[] getBasesCoveringRefInterval(final int refStart, final int refEnd, final byte[] bases, final int basesStartOnRef, final Cigar basesToRefCigar) { if ( refStart < 0 || refEnd < refStart ) throw new IllegalArgumentException("Bad start " + refStart + " and/or stop " + refEnd); @@ -74,33 +94,41 @@ public final class AlignmentUtils { int refPos = basesStartOnRef; int basesPos = 0; - int basesStart = -1; int basesStop = -1; boolean done = false; for ( int iii = 0; ! done && iii < basesToRefCigar.numCigarElements(); iii++ ) { final CigarElement ce = basesToRefCigar.getCigarElement(iii); - final int bInc, rInc; switch ( ce.getOperator() ) { - case I: bInc = 1; rInc = 0; break; - case M: case X: case EQ: bInc = rInc = 1; break; - case D: bInc = 0; rInc = 1; break; + case I: + basesPos += ce.getLength(); + break; + case M: case X: case EQ: + for ( int i = 0; i < ce.getLength(); i++ ) { + if ( refPos == refStart ) + basesStart = basesPos; + if ( refPos == refEnd ) { + basesStop = basesPos; + done = true; + break; + } + refPos++; + basesPos++; + } + break; + case D: + for ( int i = 0; i < ce.getLength(); i++ ) { + if ( refPos == refEnd || refPos == refStart ) { + // if we ever reach a ref position that is either a start or an end, we fail + return null; + } + refPos++; + } + break; default: throw new IllegalStateException("Unsupported operator " + ce); } - - for ( int i = 0; i < ce.getLength(); i++ ) { - if ( refPos == refStart ) - basesStart = basesPos; - if ( refPos == refEnd ) { - basesStop = basesPos; - done = true; - break; - } - refPos += rInc; - basesPos += bInc; - } } if ( basesStart == -1 || basesStop == -1 ) diff --git 
a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java index 7f0f93704..ad5fd3642 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java @@ -49,7 +49,7 @@ import java.util.*; public class ActiveRegionUnitTest extends BaseTest { - private final static boolean DEBUG = true; + private final static boolean DEBUG = false; private GenomeLocParser genomeLocParser; private IndexedFastaSequenceFile seq; private String contig; @@ -309,4 +309,75 @@ public class ActiveRegionUnitTest extends BaseTest { } } } + + // ----------------------------------------------------------------------------------------------- + // + // Make sure we can properly cut up an active region based on engine intervals + // + // ----------------------------------------------------------------------------------------------- + + @DataProvider(name = "TrimActiveRegionData") + public Object[][] makeTrimActiveRegionData() { + List tests = new ArrayList(); + + // fully enclosed within active region + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 15, 16), + genomeLocParser.createGenomeLoc("20", 15, 16), 0}); + + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 10, 15), + genomeLocParser.createGenomeLoc("20", 10, 15), 0}); + + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 15, 20), + genomeLocParser.createGenomeLoc("20", 15, 20), 0}); + + // needs extra padding on the right + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 15, 25), + genomeLocParser.createGenomeLoc("20", 15, 20), 
5}); + + // needs extra padding on the left + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 5, 15), + genomeLocParser.createGenomeLoc("20", 10, 15), 5}); + + // needs extra padding on both + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 7, 21), + genomeLocParser.createGenomeLoc("20", 10, 20), 3}); + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 9, 23), + genomeLocParser.createGenomeLoc("20", 10, 20), 3}); + + // desired span captures everything, so we're returning everything. Tests that extension is set correctly + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 10, 20), 10, + genomeLocParser.createGenomeLoc("20", 1, 50), + genomeLocParser.createGenomeLoc("20", 10, 20), 10}); + + // At the start of the chromosome, potentially a bit weird + tests.add(new Object[]{ + genomeLocParser.createGenomeLoc("20", 1, 10), 10, + genomeLocParser.createGenomeLoc("20", 1, 50), + genomeLocParser.createGenomeLoc("20", 1, 10), 10}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimActiveRegionData") + public void testTrimActiveRegion(final GenomeLoc regionLoc, final int extension, final GenomeLoc desiredSpan, final GenomeLoc expectedActiveRegion, final int expectedExtension) { + final ActiveRegion region = new ActiveRegion(regionLoc, Collections.emptyList(), true, genomeLocParser, extension); + final ActiveRegion trimmed = region.trim(desiredSpan); + Assert.assertEquals(trimmed.getLocation(), expectedActiveRegion, "Incorrect region"); + Assert.assertEquals(trimmed.getExtension(), expectedExtension, "Incorrect region"); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java 
index fe02aea9f..cfbc4a3e0 100644 --- a/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeUnitTest.java @@ -31,12 +31,15 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; @@ -45,10 +48,6 @@ import java.util.*; * Basic unit test for Haplotype Class */ public class HaplotypeUnitTest extends BaseTest { - @BeforeClass - public void init() { - } - @Test public void testSimpleInsertionAllele() { final String bases = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; @@ -183,4 +182,68 @@ public class HaplotypeUnitTest extends BaseTest { Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(1).toString(), "1M3I1M"); Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(2).toString(), "1M3I2M"); } + + @DataProvider(name = "TrimmingData") + public Object[][] makeTrimmingData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, 10, 20); + final String fullBases = "ACGTAACCGGT"; + for ( int trimStart = loc.getStart(); trimStart < loc.getStop(); trimStart++ ) { + for ( int trimStop = trimStart; trimStop <= loc.getStop(); trimStop++ ) { + final int start = 
trimStart - loc.getStart(); + final int stop = start + (trimStop - trimStart) + 1; + final GenomeLoc trimmedLoc = new UnvalidatingGenomeLoc("20", 0, start + loc.getStart(), stop + loc.getStart() - 1); + final String expectedBases = fullBases.substring(start, stop); + final Haplotype full = new Haplotype(fullBases.getBytes(), loc); + final Haplotype trimmed = new Haplotype(expectedBases.getBytes(), trimmedLoc); + + final int hapStart = 10; + full.setAlignmentStartHapwrtRef(hapStart); + full.setCigar(TextCigarCodec.getSingleton().decode(full.length() + "M")); + + trimmed.setAlignmentStartHapwrtRef(hapStart + start); + trimmed.setCigar(TextCigarCodec.getSingleton().decode(trimmed.length() + "M")); + + tests.add(new Object[]{full, trimmedLoc, trimmed}); + } + } + + final Haplotype full = new Haplotype("ACT".getBytes(), new UnvalidatingGenomeLoc("20", 0, 10, 14)); + full.setAlignmentStartHapwrtRef(10); + full.setCigar(TextCigarCodec.getSingleton().decode("1M2D2M")); + tests.add(new Object[]{full, new UnvalidatingGenomeLoc("20", 0, 11, 12), null}); + tests.add(new Object[]{full, new UnvalidatingGenomeLoc("20", 0, 10, 12), null}); + tests.add(new Object[]{full, new UnvalidatingGenomeLoc("20", 0, 11, 13), null}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimmingData") + public void testTrim(final Haplotype full, final GenomeLoc trimTo, final Haplotype expected) { + final Haplotype actual = full.trim(trimTo); + if ( expected != null ) { + Assert.assertEquals(actual.getBases(), expected.getBases()); + Assert.assertEquals(actual.getStartPosition(), trimTo.getStart()); + Assert.assertEquals(actual.getStopPosition(), trimTo.getStop()); + Assert.assertEquals(actual.getCigar(), expected.getCigar()); + Assert.assertEquals(actual.getAlignmentStartHapwrtRef(), expected.getAlignmentStartHapwrtRef()); + } else { + Assert.assertNull(actual); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testBadTrimLoc() { + final 
GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, 10, 20); + final Haplotype hap = new Haplotype("ACGTAACCGGT".getBytes(), loc); + hap.trim(new UnvalidatingGenomeLoc("20", 0, 1, 20)); + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testBadTrimNoLoc() { + final Haplotype hap = new Haplotype("ACGTAACCGGT".getBytes()); + hap.trim(new UnvalidatingGenomeLoc("20", 0, 1, 20)); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index 125450257..2a2d80206 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -948,4 +948,89 @@ public class AlignmentUtilsUnitTest { Assert.assertEquals(actualEndPos, pos + elt.getLength()); Assert.assertEquals(AlignmentUtils.consolidateCigar(new Cigar(elts)), expectedCigar); } + + @DataProvider(name = "GetBasesCoveringRefIntervalData") + public Object[][] makeGetBasesCoveringRefIntervalData() { + List tests = new ArrayList(); + + // matches + // 0123 + // ACGT + tests.add(new Object[]{"ACGT", 0, 3, "4M", "ACGT"}); + tests.add(new Object[]{"ACGT", 1, 3, "4M", "CGT"}); + tests.add(new Object[]{"ACGT", 1, 2, "4M", "CG"}); + tests.add(new Object[]{"ACGT", 1, 1, "4M", "C"}); + + // deletions + // 012345 + // AC--GT + tests.add(new Object[]{"ACGT", 0, 5, "2M2D2M", "ACGT"}); + tests.add(new Object[]{"ACGT", 1, 5, "2M2D2M", "CGT"}); + tests.add(new Object[]{"ACGT", 2, 5, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 3, 5, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 4, 5, "2M2D2M", "GT"}); + tests.add(new Object[]{"ACGT", 5, 5, "2M2D2M", "T"}); + tests.add(new Object[]{"ACGT", 0, 4, "2M2D2M", "ACG"}); + tests.add(new Object[]{"ACGT", 0, 3, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 0, 2, "2M2D2M", null}); + tests.add(new Object[]{"ACGT", 0, 1, 
"2M2D2M", "AC"}); + tests.add(new Object[]{"ACGT", 0, 0, "2M2D2M", "A"}); + + // insertions + // 01--23 + // ACTTGT + tests.add(new Object[]{"ACTTGT", 0, 3, "2M2I2M", "ACTTGT"}); + tests.add(new Object[]{"ACTTGT", 1, 3, "2M2I2M", "CTTGT"}); + tests.add(new Object[]{"ACTTGT", 2, 3, "2M2I2M", "GT"}); + tests.add(new Object[]{"ACTTGT", 3, 3, "2M2I2M", "T"}); + tests.add(new Object[]{"ACTTGT", 0, 2, "2M2I2M", "ACTTG"}); + tests.add(new Object[]{"ACTTGT", 0, 1, "2M2I2M", "AC"}); + tests.add(new Object[]{"ACTTGT", 1, 2, "2M2I2M", "CTTG"}); + tests.add(new Object[]{"ACTTGT", 2, 2, "2M2I2M", "G"}); + tests.add(new Object[]{"ACTTGT", 1, 1, "2M2I2M", "C"}); + + tests.add(new Object[]{"ACGT", 0, 1, "2M2I", "AC"}); + tests.add(new Object[]{"ACGT", 1, 1, "2M2I", "C"}); + tests.add(new Object[]{"ACGT", 0, 0, "2M2I", "A"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "GetBasesCoveringRefIntervalData", enabled = true) + public void testGetBasesCoveringRefInterval(final String basesString, final int refStart, final int refEnd, final String cigarString, final String expected) { + final byte[] actualBytes = AlignmentUtils.getBasesCoveringRefInterval(refStart, refEnd, basesString.getBytes(), 0, TextCigarCodec.getSingleton().decode(cigarString)); + if ( expected == null ) + Assert.assertNull(actualBytes); + else + Assert.assertEquals(new String(actualBytes), expected); + } + + @DataProvider(name = "StartsOrEndsWithInsertionOrDeletionData") + public Object[][] makeStartsOrEndsWithInsertionOrDeletionData() { + List tests = new ArrayList(); + + tests.add(new Object[]{"2M", false}); + tests.add(new Object[]{"1D2M", true}); + tests.add(new Object[]{"2M1D", true}); + tests.add(new Object[]{"2M1I", true}); + tests.add(new Object[]{"1I2M", true}); + tests.add(new Object[]{"1M1I2M", false}); + tests.add(new Object[]{"1M1D2M", false}); + tests.add(new Object[]{"1M1I2M1I", true}); + tests.add(new Object[]{"1M1I2M1D", true}); + tests.add(new Object[]{"1D1M1I2M", 
true}); + tests.add(new Object[]{"1I1M1I2M", true}); + tests.add(new Object[]{"1M1I2M1I1M", false}); + tests.add(new Object[]{"1M1I2M1D1M", false}); + tests.add(new Object[]{"1M1D2M1D1M", false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "StartsOrEndsWithInsertionOrDeletionData", enabled = true) + public void testStartsOrEndsWithInsertionOrDeletion(final String cigar, final boolean expected) { + Assert.assertEquals(AlignmentUtils.startsOrEndsWithInsertionOrDeletion(TextCigarCodec.getSingleton().decode(cigar)), expected); + } + + } From 9b5c55a84ab46a09471f79c0694f792af994fd58 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 3 Apr 2013 08:59:09 -0400 Subject: [PATCH 150/226] LikelihoodCalculationEngine will now only use reads longer than the minReadLength, which is currently fixed at 20 bp --- .../LikelihoodCalculationEngine.java | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 543b23d9c..a90f8959d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -69,6 +69,7 @@ public class LikelihoodCalculationEngine { private final byte constantGCP; private final boolean DEBUG; private final PairHMM pairHMM; + private final int minReadLength = 20; public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) { @@ -90,9 +91,16 @@ public class LikelihoodCalculationEngine { DEBUG = debug; } - public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { - - final Map stratifiedReadMap = new HashMap(); + /** + * Initialize 
our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate + * + * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples + * against the set of haplotypes + * + * @param haplotypes a non-null list of haplotypes + * @param perSampleReadList a mapping from sample -> reads + */ + private void initializePairHMM(final List haplotypes, final Map> perSampleReadList) { int X_METRIC_LENGTH = 0; for( final Map.Entry> sample : perSampleReadList.entrySet() ) { for( final GATKSAMRecord read : sample.getValue() ) { @@ -108,13 +116,20 @@ public class LikelihoodCalculationEngine { // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); + } - // for each sample's reads + public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { + // configure the HMM + initializePairHMM(haplotypes, perSampleReadList); + + // Add likelihoods for each sample's reads to our stratifiedReadMap + final Map stratifiedReadMap = new HashMap(); for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue())); } + return stratifiedReadMap; } @@ -128,6 +143,10 @@ public class LikelihoodCalculationEngine { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); for( final GATKSAMRecord read : reads ) { + if ( read.getReadLength() < minReadLength ) + // don't consider any reads that have a read length < the minimum + continue; + final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical 
estimates for this from the data? // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read From 15461567d77e9de85e339c5cbcd7ebf6357246a6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 3 Apr 2013 18:47:59 -0400 Subject: [PATCH 151/226] HaplotypeCaller no longer uses reads with poor likelihoods w.r.t. any haplotype -- The previous likelihood calculation proceeds as normal, but after each read has been evaluated against each haplotype we go through the read / allele / likelihoods map and eliminate all reads that have poor fit to any of the haplotypes. This functionality stops us from making a particular type of error in the HC, where we have a haplotype that's very far from the reference allele but not the right true haplotype. All of the reads that are slightly closer to this FP haplotype than the reference previously generated enormous likelihoods in favor of this FP haplotype because they were closer to it than the reference, even if each read had many mismatches w.r.t. the FP haplotype (and so the FP haplotype was a bad model for the true underlying haplotype). 
--- .../LikelihoodCalculationEngine.java | 17 +++- .../PerReadAlleleLikelihoodMapUnitTest.java | 94 +++++++++++++++---- .../genotyper/PerReadAlleleLikelihoodMap.java | 58 ++++++++++++ 3 files changed, 151 insertions(+), 18 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a90f8959d..1fb873e81 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -71,6 +71,13 @@ public class LikelihoodCalculationEngine { private final PairHMM pairHMM; private final int minReadLength = 20; + /** + * The expected rate of random sequencing errors for a read originating from its true haplotype. + * + * For example, if this is 0.01, then we'd expect 1 error per 100 bp. 
+ */ + private final double EXPECTED_ERROR_RATE_PER_BASE = 0.02; + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) { switch (hmmType) { @@ -127,7 +134,14 @@ public class LikelihoodCalculationEngine { for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue())); + final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); + + final List removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); +// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey()); +// for ( final GATKSAMRecord read : removedReads ) +// logger.info("\tRemoved " + read.getReadName()); + + stratifiedReadMap.put(sampleEntry.getKey(), map); } return stratifiedReadMap; @@ -170,6 +184,7 @@ public class LikelihoodCalculationEngine { perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); } } + return perReadAlleleLikelihoodMap; } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index 84bdfd19b..c50849a54 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.utils.genotyper; +import net.sf.samtools.*; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import 
org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.BaseUtils; @@ -54,33 +56,16 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.Utils; import java.util.Map; import java.util.List; import org.testng.Assert; import org.testng.annotations.Test; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.variant.vcf.VCFCodec; import java.io.File; import java.io.FileNotFoundException; import java.util.*; @@ -235,7 +220,82 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest { 
Assert.assertEquals(downsampledStrat.get(base_A).size(),(int) (pileup.depthOfCoverage()/2) - 1); Assert.assertEquals(downsampledStrat.get(base_C).size(),(int) (pileup.depthOfCoverage()/2)); Assert.assertEquals(downsampledStrat.get(base_T).size(),0); + } + + @DataProvider(name = "PoorlyModelledReadData") + public Object[][] makePoorlyModelledReadData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{10, 0.1, false, Arrays.asList(0.0)}); + tests.add(new Object[]{10, 0.1, true, Arrays.asList(-10.0)}); + tests.add(new Object[]{10, 0.1, false, Arrays.asList(0.0, -10.0)}); + tests.add(new Object[]{10, 0.1, true, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.1, false, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.01, true, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.01, false, Arrays.asList(-5.0, -10.0, -3.0)}); + tests.add(new Object[]{100, 0.01, false, Arrays.asList(-5.0, -10.0, -2.0)}); + tests.add(new Object[]{100, 0.01, true, Arrays.asList(-5.0, -10.0, -4.0)}); + tests.add(new Object[]{100, 0.001, true, Arrays.asList(-5.0, -10.0)}); + tests.add(new Object[]{100, 0.001, false, Arrays.asList(-5.0, -10.0, 0.0)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PoorlyModelledReadData") + public void testPoorlyModelledRead(final int readLen, final double maxErrorRatePerBase, final boolean expected, final List log10likelihoods) { + final byte[] bases = Utils.dupBytes((byte)'A', readLen); + final byte[] quals = Utils.dupBytes((byte) 30, readLen); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, readLen + "M"); + + final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap(); + final boolean actual = map.readIsPoorlyModelled(read, log10likelihoods, maxErrorRatePerBase); + Assert.assertEquals(actual, expected); + } + @DataProvider(name = 
"RemovingPoorlyModelledReadData") + public Object[][] makeRemovingPoorlyModelledReadData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final int readLen = 10; + for ( int nReads = 0; nReads < 4; nReads++ ) { + for ( int nBad = 0; nBad <= nReads; nBad++ ) { + final int nGood = nReads - nBad; + tests.add(new Object[]{readLen, nReads, nBad, nGood}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "RemovingPoorlyModelledReadData") + public void testRemovingPoorlyModelledReads(final int readLen, final int nReads, final int nBad, final int nGood) { + final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap(); + final Set goodReads = new HashSet(); + final Set badReads = new HashSet(); + for ( int readI = 0; readI < nReads; readI++ ) { + final boolean bad = readI < nBad; + final double likelihood = bad ? -100.0 : 0.0; + + final byte[] bases = Utils.dupBytes((byte)'A', readLen); + final byte[] quals = Utils.dupBytes((byte) 30, readLen); + + final Allele allele = Allele.create(Utils.dupString("A", readI+1)); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, readLen + "M"); + read.setReadName("readName" + readI); + map.add(read, allele, likelihood); + (bad ? 
badReads : goodReads).add(read); + } + + final List removedReads = map.filterPoorlyModelledReads(0.01); + Assert.assertEquals(removedReads.size(), nBad, "nBad " + nBad + " nGood " + nGood); + Assert.assertEquals(new HashSet(removedReads), badReads, "nBad " + nBad + " nGood " + nGood); + Assert.assertEquals(map.size(), nGood, "nBad " + nBad + " nGood " + nGood); + Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood); + Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 02618100d..201e3b9b4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -251,4 +251,62 @@ public class PerReadAlleleLikelihoodMap { } return sb.toString(); } + + /** + * Remove reads from this map that are poorly modelled w.r.t. their per allele likelihoods + * + * Goes through each read in this map, and if it is poorly modelled removes it from the map. + * + * @see #readIsPoorlyModelled(org.broadinstitute.sting.utils.sam.GATKSAMRecord, java.util.Collection, double) + * for more information about the poorly modelled test. 
+ * + * @param maxErrorRatePerBase see equivalent parameter in #readIsPoorlyModelled + * @return the list of reads removed from this map because they are poorly modelled + */ + public List filterPoorlyModelledReads(final double maxErrorRatePerBase) { + final List removedReads = new LinkedList(); + final Iterator>> it = likelihoodReadMap.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry> record = it.next(); + if ( readIsPoorlyModelled(record.getKey(), record.getValue().values(), maxErrorRatePerBase) ) { + it.remove(); + removedReads.add(record.getKey()); + } + } + + return removedReads; + } + + /** + * Is this read poorly modelled by any of the alleles in this map? + * + * A read is poorly modeled when it's likelihood is below what would be expected for a read + * originating from one of the alleles given the maxErrorRatePerBase of the reads in general. + * + * This function makes a number of key assumptions. First, that the likelihoods reflect the total likelihood + * of the read. In other words, that the read would be fully explained by one of the alleles. This means + * that the allele should be something like the full haplotype from which the read might originate. + * + * It further assumes that each error in the read occurs with likelihood of -3 (Q30 confidence per base). So + * a read with a 10% error rate with Q30 bases that's 100 bp long we'd expect to see 10 real Q30 errors + * even against the true haplotype. So for this read to be well modelled by at least one allele we'd expect + * a likelihood to be >= 10 * -3. + * + * @param read the read we want to evaluate + * @param log10Likelihoods a list of the log10 likelihoods of the read against a set of haplotypes. + * @param maxErrorRatePerBase the maximum error rate we'd expect for this read per base, in real space. 
So + * 0.01 means a 1% error rate + * @return true if none of the log10 likelihoods imply that the read truly originated from one of the haplotypes + */ + protected boolean readIsPoorlyModelled(final GATKSAMRecord read, final Collection log10Likelihoods, final double maxErrorRatePerBase) { + final double maxErrorsForRead = Math.ceil(read.getReadLength() * maxErrorRatePerBase); + final double log10QualPerBase = -3.0; + final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; + + for ( final double log10Likelihood : log10Likelihoods ) + if ( log10Likelihood >= log10MaxLikelihoodForTrueAllele ) + return false; + + return true; + } } From 5545c629f5680c3dce0fe414e19858970fc8a1d4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 4 Apr 2013 10:24:12 -0400 Subject: [PATCH 152/226] Rename Utils to GraphUtils to avoid conflicts with the sting.Utils class; fix broken unit test in SharedVertexSequenceSplitterUnitTest --- .../haplotypecaller/graphs/CommonSuffixSplitter.java | 7 +++---- .../haplotypecaller/graphs/{Utils.java => GraphUtils.java} | 4 ++-- .../gatk/walkers/haplotypecaller/graphs/SeqGraph.java | 2 +- .../graphs/SharedVertexSequenceSplitter.java | 4 ++-- .../walkers/haplotypecaller/graphs/SeqGraphUnitTest.java | 7 ++++--- .../graphs/SharedVertexSequenceSplitterUnitTest.java | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/{Utils.java => GraphUtils.java} (99%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java index dabfbb322..371d5b7e3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java @@ 
-48,7 +48,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Requires; -import java.io.File; import java.util.*; /** @@ -177,9 +176,9 @@ public class CommonSuffixSplitter { */ @Requires("!middleVertices.isEmpty()") protected static SeqVertex commonSuffix(final Collection middleVertices) { - final List kmers = Utils.getKmers(middleVertices); - final int min = Utils.minKmerLength(kmers); - final int suffixLen = Utils.compSuffixLen(kmers, min); + final List kmers = GraphUtils.getKmers(middleVertices); + final int min = GraphUtils.minKmerLength(kmers); + final int suffixLen = GraphUtils.compSuffixLen(kmers, min); final byte[] kmer = kmers.get(0); final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length); return new SeqVertex(suffix); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java index 8cb272925..30c5be190 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Utils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/GraphUtils.java @@ -60,8 +60,8 @@ import java.util.List; * Date: 3/25/13 * Time: 9:42 PM */ -final class Utils { - private Utils() {} +final class GraphUtils { + private GraphUtils() {} /** * Compute the maximum shared prefix length of list of bytes. 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index 4cc7aae2a..97969d098 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -70,7 +70,7 @@ public final class SeqGraph extends BaseGraph { * merging inappropriate head or tail nodes, which introduces large insertion / deletion events * as the merge operation creates a link among the non-linked sink / source vertices */ - private final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10; + protected final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10; /** * Construct an empty SeqGraph diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java index ca7faa444..f6ee4c3c3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java @@ -295,8 +295,8 @@ public class SharedVertexSequenceSplitter { min = Math.min(min, v.getSequence().length); } - final int prefixLen = Utils.compPrefixLen(kmers, min); - final int suffixLen = Utils.compSuffixLen(kmers, min - prefixLen); + final int prefixLen = GraphUtils.compPrefixLen(kmers, min); + final int suffixLen = GraphUtils.compSuffixLen(kmers, min - prefixLen); final byte[] kmer = kmers.get(0); final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index ca43ced69..42137e4e4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -195,8 +196,8 @@ public class SeqGraphUnitTest extends BaseTest { final SeqGraph graph = new SeqGraph(); - SeqVertex pre1 = new SeqVertex("ACT"); - SeqVertex pre2 = new SeqVertex("AGT"); + SeqVertex pre1 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "CT"); + SeqVertex pre2 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "GT"); SeqVertex top = new SeqVertex("A"); SeqVertex middle1 = new SeqVertex("GC"); SeqVertex middle2 = new SeqVertex("TC"); @@ -282,7 +283,7 @@ public class SeqGraphUnitTest extends BaseTest { final SeqVertex newMiddle1 = new SeqVertex("G"); final SeqVertex newMiddle2 = new SeqVertex("T"); final SeqVertex newBottom = new SeqVertex("C" + bottom.getSequenceString()); - final SeqVertex newTop = new SeqVertex("A"); + final SeqVertex newTop = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES)); final SeqVertex newTopDown1 = new SeqVertex("G"); final SeqVertex newTopDown2 = new SeqVertex("C"); final SeqVertex newTopBottomMerged = new SeqVertex("TA"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index 0930d497f..2df783b19 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -98,10 +98,10 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { min = Math.min(min, s.length()); } - final int actualPrefixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compPrefixLen(bytes, min); + final int actualPrefixLen = GraphUtils.compPrefixLen(bytes, min); Assert.assertEquals(actualPrefixLen, expectedPrefixLen, "Failed prefix test"); - final int actualSuffixLen = org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Utils.compSuffixLen(bytes, min - actualPrefixLen); + final int actualSuffixLen = GraphUtils.compSuffixLen(bytes, min - actualPrefixLen); Assert.assertEquals(actualSuffixLen, expectedSuffixLen, "Failed suffix test"); } From 9c7a35f73fe5bcff75abbad10af3065bf589e381 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 4 Apr 2013 15:15:10 -0400 Subject: [PATCH 153/226] HaplotypeCaller no longer creates haplotypes that involve cycles in the SeqGraph -- The kbest paths algorithm now takes an explicit set of starting and ending vertices, which is conceptually cleaner and works for either the cycle or no-cycle models. Allowing cycles can be re-enabled with an HC command line switch. 
--- .../haplotypecaller/DeBruijnAssembler.java | 14 ++- .../haplotypecaller/HaplotypeCaller.java | 7 +- .../haplotypecaller/graphs/BaseGraph.java | 24 ++++ .../haplotypecaller/graphs/KBestPaths.java | 119 +++++++++++------- .../walkers/haplotypecaller/graphs/Path.java | 13 ++ .../graphs/KBestPathsUnitTest.java | 75 +++++++---- 6 files changed, 183 insertions(+), 69 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 40a6a79e0..11701a73b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -95,22 +95,25 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private final boolean debug; private final boolean debugGraphTransformations; private final int minKmer; + private final boolean allowCyclesInKmerGraphToGeneratePaths; private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; protected DeBruijnAssembler() { - this(false, -1, 11); + this(false, -1, 11, false); } public DeBruijnAssembler(final boolean debug, final int debugGraphTransformations, - final int minKmer) { + final int minKmer, + final boolean allowCyclesInKmerGraphToGeneratePaths) { super(); this.debug = debug; this.debugGraphTransformations = debugGraphTransformations > 0; this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations; this.minKmer = minKmer; + this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths; } /** @@ -388,7 +391,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } for( final SeqGraph graph : graphs ) { - for ( final Path path : new KBestPaths().getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + final SeqVertex source = graph.getReferenceSourceVertex(); + 
final SeqVertex sink = graph.getReferenceSinkVertex(); + if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph); + + final KBestPaths pathFinder = new KBestPaths(allowCyclesInKmerGraphToGeneratePaths); + for ( final Path path : pathFinder.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH, source, sink) ) { // logger.info("Found path " + path); Haplotype h = new Haplotype( path.getBases() ); if( !returnHaplotypes.contains(h) ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index bce179ee1..80276f7be 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -314,6 +314,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="trimActiveRegions", shortName="trimActiveRegions", doc="If specified, we will trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) protected boolean trimActiveRegions = false; + @Hidden + @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) + protected boolean allowCyclesInKmerGraphToGeneratePaths = false; + + // the UG engines private UnifiedGenotyperEngine UG_engine = null; private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; @@ -424,7 +429,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } // setup the assembler - assemblyEngine = new 
DeBruijnAssembler( DEBUG, debugGraphTransformations, minKmer); + assemblyEngine = new DeBruijnAssembler(DEBUG, debugGraphTransformations, minKmer, allowCyclesInKmerGraphToGeneratePaths); assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 5d591fd5c..7ce57e2e7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -137,6 +137,30 @@ public class BaseGraph extends DefaultDirectedGraph getSources() { + final Set set = new LinkedHashSet(); + for ( final T v : vertexSet() ) + if ( isSource(v) ) + set.add(v); + return set; + } + + /** + * Get the set of sink vertices of this graph + * @return a non-null set + */ + public Set getSinks() { + final Set set = new LinkedHashSet(); + for ( final T v : vertexSet() ) + if ( isSink(v) ) + set.add(v); + return set; + } + /** * Pull out the additional sequence implied by traversing this node in the graph * @param v the vertex from which to pull out the additional base sequence diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java index 1dc712c67..466148588 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java @@ -50,10 +50,7 @@ import com.google.common.collect.MinMaxPriorityQueue; import com.google.java.contract.Ensures; import 
java.io.Serializable; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; +import java.util.*; /** * Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. @@ -63,7 +60,23 @@ import java.util.List; * Date: Mar 23, 2011 */ public class KBestPaths { - public KBestPaths() { } + private final boolean allowCycles; + + /** + * Create a new KBestPaths finder that follows cycles in the graph + */ + public KBestPaths() { + this(true); + } + + /** + * Create a new KBestPaths finder + * + * @param allowCycles should we allow paths that follow cycles in the graph? + */ + public KBestPaths(final boolean allowCycles) { + this.allowCycles = allowCycles; + } protected static class MyInt { public int val = 0; } @@ -78,31 +91,61 @@ public class KBestPaths { } /** - * @see #getKBestPaths(BaseGraph, int) retriving the first 1000 paths + * @see #getKBestPaths(BaseGraph, int) retriving the best 1000 paths */ public List> getKBestPaths( final BaseGraph graph ) { return getKBestPaths(graph, 1000); } /** - * Traverse the graph and pull out the best k paths. - * Paths are scored via their comparator function. 
The default being PathComparatorTotalScore() - * @param graph the graph from which to pull paths - * @param k the number of paths to find - * @return a list with at most k top-scoring paths from the graph + * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) retriving the first 1000 paths + * starting from all source vertices and ending with all sink vertices */ - @Ensures({"result != null", "result.size() <= k"}) public List> getKBestPaths( final BaseGraph graph, final int k ) { + return getKBestPaths(graph, k, graph.getSources(), graph.getSinks()); + } + + /** + * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000 + */ + public List> getKBestPaths( final BaseGraph graph, final Set sources, final Set sinks ) { + return getKBestPaths(graph, 1000, sources, sinks); + } + + /** + * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000 + */ + public List> getKBestPaths( final BaseGraph graph, final T source, final T sink ) { + return getKBestPaths(graph, 1000, source, sink); + } + + /** + * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with singleton source and sink sets + */ + public List> getKBestPaths( final BaseGraph graph, final int k, final T source, final T sink ) { + return getKBestPaths(graph, k, Collections.singleton(source), Collections.singleton(sink)); + } + + /** + * Traverse the graph and pull out the best k paths. + * Paths are scored via their comparator function. 
The default being PathComparatorTotalScore() + * @param graph the graph from which to pull paths + * @param k the number of paths to find + * @param sources a set of vertices we want to start paths with + * @param sinks a set of vertices we want to end paths with + * @return a list with at most k top-scoring paths from the graph + */ + @Ensures({"result != null", "result.size() <= k"}) + public List> getKBestPaths( final BaseGraph graph, final int k, final Set sources, final Set sinks ) { if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); } // a min max queue that will collect the best k paths final MinMaxPriorityQueue> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create(); // run a DFS for best paths - for ( final T v : graph.vertexSet() ) { - if ( graph.inDegreeOf(v) == 0 ) { - findBestPaths(new Path(v, graph), bestPaths, new MyInt()); - } + for ( final T source : sources ) { + final Path startingPath = new Path(source, graph); + findBestPaths(startingPath, sinks, bestPaths, new MyInt()); } // the MinMaxPriorityQueue iterator returns items in an arbitrary order, so we need to sort the final result @@ -111,9 +154,15 @@ public class KBestPaths { return toReturn; } - private void findBestPaths( final Path path, final MinMaxPriorityQueue> bestPaths, final MyInt n ) { - // did we hit the end of a path? - if ( allOutgoingEdgesHaveBeenVisited(path) ) { + /** + * Recursive algorithm to find the K best paths in the graph from the current path to any of the sinks + * @param path the current path progress + * @param sinks a set of nodes that are sinks. Will terminate and add a path if the last vertex of path is in this set + * @param bestPaths a path to collect completed paths. 
+ * @param n used to limit the search by tracking the number of vertices visited across all paths + */ + private void findBestPaths( final Path path, final Set sinks, final Collection> bestPaths, final MyInt n ) { + if ( sinks.contains(path.getLastVertex())) { bestPaths.add(path); } else if( n.val > 10000 ) { // do nothing, just return, as we've done too much work already @@ -122,31 +171,15 @@ public class KBestPaths { final ArrayList edgeArrayList = new ArrayList(path.getOutgoingEdgesOfLastVertex()); Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator()); for ( final BaseEdge edge : edgeArrayList ) { + final T target = path.getGraph().getEdgeTarget(edge); // make sure the edge is not already in the path - if ( path.containsEdge(edge) ) - continue; - - final Path newPath = new Path(path, edge); - n.val++; - findBestPaths(newPath, bestPaths, n); + final boolean alreadyVisited = allowCycles ? path.containsEdge(edge) : path.containsVertex(target); + if ( ! alreadyVisited ) { + final Path newPath = new Path(path, edge); + n.val++; + findBestPaths(newPath, sinks, bestPaths, n); + } } } } - - /** - * Have all of the outgoing edges of the final vertex been visited? - * - * I.e., are all outgoing vertices of the current path in the list of edges of the graph? 
- * - * @param path the path to test - * @return true if all the outgoing edges at the end of this path have already been visited - */ - private boolean allOutgoingEdgesHaveBeenVisited( final Path path ) { - for( final BaseEdge edge : path.getOutgoingEdgesOfLastVertex() ) { - if( !path.containsEdge(edge) ) { // TODO -- investigate allowing numInPath < 2 to allow cycles - return false; - } - } - return true; - } -} +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index 50ca91d41..252ae3449 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -148,6 +148,19 @@ public class Path { return edgesAsSet.contains(edge); } + /** + * Does this path contain the given vertex? + * + * @param v a non-null vertex + * @return true if v occurs within this path, false otherwise + */ + public boolean containsVertex(final T v) { + if ( v == null ) throw new IllegalArgumentException("Vertex cannot be null"); + + // TODO -- warning this is expense. 
Need to do vertex caching + return getVertices().contains(v); + } + /** * Check that two paths have the same edges and total score * @param path the other path we might be the same as diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index d20a0f778..3c6327842 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -55,10 +55,7 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** * Created with IntelliJ IDEA. @@ -70,15 +67,13 @@ public class KBestPathsUnitTest { @DataProvider(name = "BasicPathFindingData") public Object[][] makeBasicPathFindingData() { List tests = new ArrayList(); -// for ( final int nStartNodes : Arrays.asList(1) ) { -// for ( final int nBranchesPerBubble : Arrays.asList(2) ) { -// for ( final int nEndNodes : Arrays.asList(1) ) { -// for ( final boolean addCycle : Arrays.asList(true) ) { - for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) { - for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) { - for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) { - for ( final boolean addCycle : Arrays.asList(true, false) ) { - tests.add(new Object[]{nStartNodes, nBranchesPerBubble, nEndNodes, addCycle}); + for ( final boolean allowCycles : Arrays.asList(false, true)) { + for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) { + for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) { + for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) { + for ( final boolean addCycle : Arrays.asList(true, false) ) { + 
tests.add(new Object[]{nStartNodes, nBranchesPerBubble, nEndNodes, addCycle, allowCycles}); + } } } } @@ -88,9 +83,9 @@ public class KBestPathsUnitTest { } private static int weight = 1; - final List createVertices(final SeqGraph graph, final int n, final SeqVertex source, final SeqVertex target) { + final Set createVertices(final SeqGraph graph, final int n, final SeqVertex source, final SeqVertex target) { final List seqs = Arrays.asList("A", "C", "G", "T"); - final List vertices = new LinkedList(); + final Set vertices = new LinkedHashSet(); for ( int i = 0; i < n; i++ ) { final SeqVertex v = new SeqVertex(seqs.get(i)); graph.addVertex(v); @@ -102,22 +97,22 @@ public class KBestPathsUnitTest { } @Test(dataProvider = "BasicPathFindingData", enabled = true) - public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes, final boolean addCycle) { + public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes, final boolean addCycle, final boolean allowCycles) { SeqGraph graph = new SeqGraph(); final SeqVertex middleTop = new SeqVertex("GTAC"); final SeqVertex middleBottom = new SeqVertex("ACTG"); graph.addVertices(middleTop, middleBottom); - final List starts = createVertices(graph, nStartNodes, null, middleTop); - final List bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom); - final List ends = createVertices(graph, nEndNodes, middleBottom, null); + final Set starts = createVertices(graph, nStartNodes, null, middleTop); + final Set bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom); + final Set ends = createVertices(graph, nEndNodes, middleBottom, null); if ( addCycle ) graph.addEdge(middleBottom, middleBottom); // enumerate all possible paths - final List> paths = new KBestPaths().getKBestPaths(graph); + final List> paths = new KBestPaths(allowCycles).getKBestPaths(graph, starts, ends); - final int expectedNumOfPaths = 
nStartNodes * nBranchesPerBubble * (addCycle ? 2 : 1) * nEndNodes; + final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * (addCycle && allowCycles ? 2 : 1) * nEndNodes; Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths"); int lastScore = Integer.MAX_VALUE; @@ -128,11 +123,47 @@ public class KBestPathsUnitTest { // get the best path, and make sure it's the same as our optimal path overall final Path best = paths.get(0); - final List> justOne = new KBestPaths().getKBestPaths(graph, 1); + final List> justOne = new KBestPaths(allowCycles).getKBestPaths(graph, 1, starts, ends); Assert.assertEquals(justOne.size(), 1); Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); } + @Test + public void testPathFindingComplexCycle() { + SeqGraph graph = new SeqGraph(); + + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("C"); + final SeqVertex v3 = new SeqVertex("G"); + final SeqVertex v4 = new SeqVertex("T"); + final SeqVertex v5 = new SeqVertex("AA"); + graph.addVertices(v1, v2, v3, v4, v5); + graph.addEdges(v1, v2, v3, v4, v5); + graph.addEdges(v3, v3); + graph.addEdges(v4, v2); + + // enumerate all possible paths + final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v5); + + Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); + } + + @Test + public void testPathFindingCycleLastNode() { + SeqGraph graph = new SeqGraph(); + + final SeqVertex v1 = new SeqVertex("A"); + final SeqVertex v2 = new SeqVertex("C"); + final SeqVertex v3 = new SeqVertex("G"); + graph.addVertices(v1, v2, v3); + graph.addEdges(v1, v2, v3, v3); + + // enumerate all possible paths + final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v3); + + Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); + } + @DataProvider(name = 
"BasicBubbleDataProvider") public Object[][] makeBasicBubbleDataProvider() { List tests = new ArrayList(); From 3a19266843788c17a2fa8d7e3bf9fb55d18f277b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 4 Apr 2013 16:07:00 -0400 Subject: [PATCH 154/226] Fix residual merge conflicts --- .../haplotypecaller/graphs/SeqGraph.java | 6 +-- .../graphs/SharedSequenceMerger.java | 2 +- .../sting/utils/haplotype/EventMap.java | 46 +++++++++---------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index 97969d098..8c78d8515 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -350,7 +350,7 @@ public final class SeqGraph extends BaseGraph { protected class MergeDiamonds extends VertexBasedTransformer { @Override protected boolean tryToTransform(final SeqVertex top) { - final List middles = outgoingVerticesOf(top); + final Set middles = outgoingVerticesOf(top); if ( middles.size() <= 1 ) // we can only merge if there's at least two middle nodes return false; @@ -407,7 +407,7 @@ public final class SeqGraph extends BaseGraph { protected class MergeTails extends VertexBasedTransformer { @Override protected boolean tryToTransform(final SeqVertex top) { - final List tails = outgoingVerticesOf(top); + final Set tails = outgoingVerticesOf(top); if ( tails.size() <= 1 ) return false; @@ -495,7 +495,7 @@ public final class SeqGraph extends BaseGraph { protected class MergeHeadlessIncomingSources extends VertexBasedTransformer { @Override boolean tryToTransform(final SeqVertex bottom) { - final List incoming = incomingVerticesOf(bottom); + final Set incoming = incomingVerticesOf(bottom); if ( incoming.size() <= 1 ) return 
false; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java index 28734e505..1c53f2332 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedSequenceMerger.java @@ -75,7 +75,7 @@ public class SharedSequenceMerger { if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex " + v); - final List prevs = graph.incomingVerticesOf(v); + final Set prevs = graph.incomingVerticesOf(v); if ( ! canMerge(graph, v, prevs) ) return false; else { diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java index ab5f23894..752c880b9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/EventMap.java @@ -1,27 +1,27 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ package org.broadinstitute.sting.utils.haplotype; From 5a54a4155a12b7a9a4531b0f635f4074cead7784 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 4 Apr 2013 20:29:17 -0400 Subject: [PATCH 155/226] Change key Haplotype default parameter values -- Extension increased to 200 bp -- Min prune factor defaults to 0 -- LD merging enabled by default for complex variants, only when there are 10+ samples for SNP + SNP merging -- Active region trimming enabled by default --- .../haplotypecaller/HaplotypeCaller.java | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 80276f7be..a7aeadde6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -137,7 +137,7 @@ import java.util.*; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.LOCUS) @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) -@ActiveRegionTraversalParameters(extension=85, maxRegion=300) +@ActiveRegionTraversalParameters(extension=200, maxRegion=300) @ReadFilters({HCMappingQualityFilter.class}) @Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { @@ -200,7 +200,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Advanced @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. 
Paths with <= X supporting kmers are pruned from the graph", required = false) - protected int MIN_PRUNE_FACTOR = 1; + protected int MIN_PRUNE_FACTOR = 0; @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) @@ -284,6 +284,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); + @Advanced + @Argument(fullName="dontMergeVariantsViaLD", shortName="dontMergeVariantsViaLD", doc="If specified, we will not merge consecutive variants into MNPs or complex substitutions based on their LD structure", required = false) + protected boolean dontMergeVariantsViaLD = false; + /** * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. 
*/ @@ -301,18 +305,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) protected int debugGraphTransformations = -1; - // TODO -- not currently useful - @Hidden + @Hidden // TODO -- not currently useful @Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false) protected boolean useLowQualityBasesForAssembly = false; @Hidden - @Argument(fullName="useNewLDMerger", shortName="useNewLDMerger", doc="If specified, we will include low quality bases when doing the assembly", required = false) - protected boolean useNewLDMerger = false; - - @Hidden - @Argument(fullName="trimActiveRegions", shortName="trimActiveRegions", doc="If specified, we will trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) - protected boolean trimActiveRegions = false; + @Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) + protected boolean dontTrimActiveRegions = false; @Hidden @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) @@ -437,7 +436,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - final LDMerger ldMerger = new 
LDMerger(DEBUG, useNewLDMerger ? 10 : 10, useNewLDMerger ? 1 : 10); + final LDMerger ldMerger = new LDMerger(DEBUG, dontMergeVariantsViaLD ? 10000000 : 10, dontMergeVariantsViaLD ? 10000000 : 1); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, ldMerger ); @@ -640,7 +639,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype ); - if ( trimActiveRegions ) { + if ( ! dontTrimActiveRegions ) { return trimActiveRegion(activeRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc); } else { // we don't want to or cannot create a trimmed active region, so go ahead and use the old one From 6d22485a4cd3fa715a66c330b176ede0a42017b4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 5 Apr 2013 13:50:03 -0400 Subject: [PATCH 157/226] Critical bugfix to ReduceRead functionality of the GATKSAMRecord -- The function getReducedCounts() was returning the undecoded reduced read tag, which looks like [10, 5, -1, -5] when the depths were [10, 15, 9, 5]. The only function that actually gave the real counts was getReducedCount(int i) which did the proper decoding. Now GATKSAMRecord decodes the tag into the proper depths vector so that getReduceCounts() returns what one reasonably expects it to, and getReduceCount(i) merely looks up the value at i. Added unit test to ensure this behavior going forward. -- Changed the name of setReducedCounts() to setReducedCountsTag as this function assumes that counts have already been encoded in the tag way. 
--- .../reducereads/SyntheticRead.java | 2 +- .../sting/utils/sam/GATKSAMRecord.java | 61 +++++++++++++++++-- .../utils/sam/GATKSAMRecordUnitTest.java | 8 +++ 3 files changed, 64 insertions(+), 7 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java index b1ac19f50..ae4366768 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -235,7 +235,7 @@ public class SyntheticRead { read.setReadBases(convertReadBases()); read.setMappingQuality((int) Math.ceil(mappingQuality / basesCountsQuals.size())); read.setReadGroup(readGroupRecord); - read.setReducedReadCounts(convertBaseCounts()); + read.setReducedReadCountsTag(convertBaseCounts()); if (hasIndelQualities) { read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 01f39a67b..0e672b3d7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -345,24 +345,50 @@ public class GATKSAMRecord extends BAMRecord { // *** ReduceReads functions ***// /////////////////////////////////////////////////////////////////////////////// + /** + * Get the counts of the bases in this reduced read + * + * NOTE that this is not the value of the REDUCED_READ_CONSENSUS_TAG, which + * is encoded in a special way. This is the actual positive counts of the + * depth at each bases. So for a RR with a tag of: + * + * [10, 5, -1, -5] + * + * this function returns + * + * [10, 15, 9, 5] + * + * as one might expect. 
+ * + * @return a byte[] holding the depth of the bases in this reduced read, or null if this isn't a reduced read + */ public byte[] getReducedReadCounts() { if ( ! retrievedReduceReadCounts ) { - reducedReadCounts = getByteArrayAttribute(REDUCED_READ_CONSENSUS_TAG); + final byte[] tag = getByteArrayAttribute(REDUCED_READ_CONSENSUS_TAG); + if ( tag != null ) reducedReadCounts = decodeReadReadCounts(tag); retrievedReduceReadCounts = true; } return reducedReadCounts; } + /** + * Is this read a reduced read? + * @return true if yes + */ public boolean isReducedRead() { return getReducedReadCounts() != null; } /** - * Set the reduced read counts for this record to counts + * Set the reduced read counts tag for this record to counts + * + * WARNING -- this function assumes that counts is encoded as a difference in value count + * of count[i] - count[0]. It is not a straight counting of the bases in the read. + * * @param counts the count array */ - public void setReducedReadCounts(final byte[] counts) { + public void setReducedReadCountsTag(final byte[] counts) { retrievedReduceReadCounts = false; setAttribute(REDUCED_READ_CONSENSUS_TAG, counts); } @@ -374,9 +400,32 @@ public class GATKSAMRecord extends BAMRecord { * @return the number of bases corresponding to the i'th base of the reduced read */ public final byte getReducedCount(final int i) { - byte firstCount = getReducedReadCounts()[0]; - byte offsetCount = getReducedReadCounts()[i]; - return (i==0) ? firstCount : (byte) Math.min(firstCount + offsetCount, Byte.MAX_VALUE); + return getReducedReadCounts()[i]; + } + + /** + * Actually decode the consensus tag of a reduce read, returning a newly allocated + * set of values countsFromTag to be the real depth of cover at each base of the reduced read. + * + * for example, if the tag contains [10, 5, -1, -5], after running this function the + * byte[] will contain the true counts [10, 15, 9, 5]. + * + * as one might expect. 
+ * + * @param countsFromTag a non-null byte[] containing the tag encoded reduce reads counts + * @return a non-null byte[] containing the true depth values for the vector + */ + private byte[] decodeReadReadCounts(final byte[] countsFromTag) { + final int n = countsFromTag.length; + final byte[] result = new byte[n]; + final byte firstCount = countsFromTag[0]; + result[0] = firstCount; + for ( int i = 1; i < n; i++) { + final byte offsetCount = countsFromTag[i]; + result[i] = (byte) Math.min(firstCount + offsetCount, Byte.MAX_VALUE); + } + + return result; } /////////////////////////////////////////////////////////////////////////////// diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 18a501b51..57a7946ae 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -134,4 +134,12 @@ public class GATKSAMRecordUnitTest extends BaseTest { read.setIsStrandless(true); read.setReadNegativeStrandFlag(true); } + + @Test + public void testGetReducedCountsIsCorrect() { + final byte[] counts = reducedRead.getReducedReadCounts(); + Assert.assertNotSame(counts, reducedRead.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG)); + for ( int i = 0; i < counts.length; i++ ) + Assert.assertEquals(counts[i], reducedRead.getReducedCount(i), "Reduced counts vector not equal to getReducedCount(i) at " + i); + } } From caf15fb7276485f11336e292e1a44dc912c0c70f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 5 Apr 2013 14:11:06 -0400 Subject: [PATCH 158/226] Update MD5s to reflect new HC algorithms and parameter values --- ...lexAndSymbolicVariantsIntegrationTest.java | 6 +++--- .../HaplotypeCallerIntegrationTest.java | 21 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index a891220c5..ff2b3d0b6 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "7b67ac6213b7a6f759057fb9d7148fdc"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "80b9280b1e65952f60ba2fd738d4840f"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "eb41ed6f1d692368a0f67311d139a38a"); + "125e93deeb3b390a14d9b777aa2a220f"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "c4c33c962aca12c51def9b8cde35b7d2"); + "6957fd0e8a5bc66d2572a6ca8626fa7a"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 51c3296ac..5fc8c5622 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -47,12 +47,15 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broad.tribble.TribbleIndexedFeatureReader; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; import org.testng.annotations.Test; import java.io.File; @@ -77,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "f132843e3c8e065a783cc4fdf9ee5df3"); + HCTest(CEUTRIO_BAM, "", "6fa37c449a800bcd59069be03ad2fff2"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "15e0201f5c478310d278d2d03483c152"); + HCTest(NA12878_BAM, "", "6140447b34bd1d08b3ed4d473d2c2f23"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -93,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "48d309aed0cdc40cc983eeb5a8d12f53"); + "cbd119f3d37a9af0b3539c13b8053bd9"); } @Test @@ -109,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "34c7fcfe17a1d835e2dc403df9eb3591"); + 
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "9eeeada2f7145adfe08f538aad704982"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -146,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "eae65d20836d6c6ebca9e25e33566f74"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "16ecd2f282bcb10dc32e7f3fe714a000"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -156,14 +159,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a3d74040a4966bf7a04cbd4924970685")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("40da88ed3722c512264b72db37f18720")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3")); executeTest("HCTestStructuralIndels: ", spec); } @@ -185,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + 
privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("69b83d578c14ed32d08ce4e7ff8a8a18")); + Arrays.asList("e30b974b038293841e6be23c93ce76e1")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -193,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("0cae60d86a3f86854699217a30ece3e3")); + Arrays.asList("a913849c7ebdefb23ef9fa5ec05960fd")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } From 21410690a224f96f1e949e316f8b7976f8fdaa41 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 6 Apr 2013 14:08:26 -0400 Subject: [PATCH 159/226] Address reviewer comments --- .../haplotypecaller/GenotypingEngine.java | 12 +-- .../haplotypecaller/HaplotypeCaller.java | 11 ++- .../haplotypecaller/graphs/BaseVertex.java | 5 +- .../walkers/haplotypecaller/graphs/Path.java | 2 +- .../sting/utils/haplotype/LDMerger.java | 14 ++-- .../MergeVariantsAcrossHaplotypes.java | 79 +++++++++++++++++++ .../genotyper/PerReadAlleleLikelihoodMap.java | 2 +- 7 files changed, 103 insertions(+), 22 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index abd502c2b..5fe98649f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -59,7 +59,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.EventMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.haplotype.LDMerger; +import org.broadinstitute.sting.utils.haplotype.MergeVariantsAcrossHaplotypes; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; @@ -74,16 +74,16 @@ public class GenotypingEngine { private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final VariantAnnotatorEngine annotationEngine; - private final LDMerger ldMerger; + private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, - final LDMerger ldMerger) { + final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger) { this.DEBUG = DEBUG; this.annotationEngine = annotationEngine; this.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = USE_FILTERED_READ_MAP_FOR_ANNOTATIONS; noCall.add(Allele.NO_CALL); - this.ldMerger = ldMerger; + this.crossHaplotypeEventMerger = crossHaplotypeEventMerger; } /** @@ -247,8 +247,8 @@ public class GenotypingEngine { cleanUpSymbolicUnassembledEvents( haplotypes ); if ( !in_GGA_mode ) { - // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure - final boolean mergedAnything = ldMerger.mergeConsecutiveEventsBasedOnLD( haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc ); + // run the event 
merger if we're not in GGA mode + final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc); if ( mergedAnything ) cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index a7aeadde6..c52892373 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -77,10 +77,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.EventMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.haplotype.HaplotypeBaseComparator; -import org.broadinstitute.sting.utils.haplotype.LDMerger; +import org.broadinstitute.sting.utils.haplotype.*; import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; @@ -436,9 +433,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - final LDMerger ldMerger = new LDMerger(DEBUG, dontMergeVariantsViaLD ? 10000000 : 10, dontMergeVariantsViaLD ? 10000000 : 1); + final MergeVariantsAcrossHaplotypes variantMerger = dontMergeVariantsViaLD + ? 
new MergeVariantsAcrossHaplotypes() + : new LDMerger(DEBUG, 10, 1); - genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, ldMerger ); + genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger ); if ( bamWriter != null ) haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java index 65643a2cc..b075a69a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java @@ -58,7 +58,8 @@ import java.util.Arrays; */ public class BaseVertex { final byte[] sequence; - int cachedHashCode = -1; + private final static int UNASSIGNED_HASHCODE = -1; + int cachedHashCode = UNASSIGNED_HASHCODE; /** * Create a new sequence vertex with sequence @@ -129,7 +130,7 @@ public class BaseVertex { */ @Override public int hashCode() { - if ( cachedHashCode == -1 ) { + if ( cachedHashCode == UNASSIGNED_HASHCODE ) { cachedHashCode = Arrays.hashCode(sequence); } return cachedHashCode; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index 252ae3449..d91ec0e37 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -157,7 +157,7 @@ public class Path { public boolean containsVertex(final T v) { if ( v == null ) throw new IllegalArgumentException("Vertex cannot be null"); - // TODO -- warning this is 
expense. Need to do vertex caching + // TODO -- warning this is expensive. Need to do vertex caching return getVertices().contains(v); } diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java b/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java index ea00a1901..bbedd1b1a 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotype/LDMerger.java @@ -63,7 +63,7 @@ import java.util.*; * Date: 3/28/13 * Time: 6:17 PM */ -public class LDMerger { +public class LDMerger extends MergeVariantsAcrossHaplotypes { private final static Logger logger = Logger.getLogger(LDMerger.class); private final boolean DEBUG; @@ -71,6 +71,7 @@ public class LDMerger { private final int minSamplesToMergeOtherEvents; public LDMerger(boolean DEBUG, int minSamplesToMergeSNPs, int minSamplesToMergeOtherEvents) { + super(); this.DEBUG = DEBUG; this.minSamplesToMergeSNPs = minSamplesToMergeSNPs; this.minSamplesToMergeOtherEvents = minSamplesToMergeOtherEvents; @@ -98,11 +99,12 @@ public class LDMerger { * @param ref the reference bases * @param refLoc the span of the reference bases */ - public boolean mergeConsecutiveEventsBasedOnLD( final List haplotypes, - final Map haplotypeReadMap, - final TreeSet startPosKeySet, - final byte[] ref, - final GenomeLoc refLoc ) { + @Override + public boolean merge( final List haplotypes, + final Map haplotypeReadMap, + final TreeSet startPosKeySet, + final byte[] ref, + final GenomeLoc refLoc ) { if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null"); if ( haplotypeReadMap == null ) throw new IllegalArgumentException("haplotypeReadMap cannot be null"); if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null"); diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java 
b/protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java new file mode 100644 index 000000000..fc47807e0 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotype/MergeVariantsAcrossHaplotypes.java @@ -0,0 +1,79 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.haplotype; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; + +import java.util.List; +import java.util.Map; +import java.util.TreeSet; + +/** + * Baseclass for code that wants to merge variants together in the haplotype caller + * + * This root class is basically a no-op, and can be used to not do any merging + */ +public class MergeVariantsAcrossHaplotypes { + /** + * Merge variants across the haplotypes, updating the haplotype event maps and startPos set as appropriate + * + * @param haplotypes a list of haplotypes whose events we want to merge + * @param haplotypeReadMap map from sample name -> read likelihoods for each haplotype + * @param startPosKeySet a set of starting positions of all events among the haplotypes + * @param ref the reference bases + * @param refLoc the span of the reference bases + * @return true if anything was merged + */ + public boolean merge( final List haplotypes, + final Map haplotypeReadMap, + final TreeSet startPosKeySet, + final byte[] ref, + final GenomeLoc refLoc ) { + return false; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 201e3b9b4..47be30871 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -278,7 +278,7 @@ public class PerReadAlleleLikelihoodMap { } /** - * Is this read poorly modelled by any of the alleles in this map? + * Is this read poorly modelled by all of the alleles in this map? * * A read is poorly modeled when it's likelihood is below what would be expected for a read * originating from one of the alleles given the maxErrorRatePerBase of the reads in general. 
From 317dc4c323ff011418127df3183c9957f6f01bf6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 7 Apr 2013 12:20:44 -0400 Subject: [PATCH 161/226] Add size() method to Downsampler interface -- This method provides client with the current number of elements, without having to retreive the underlying list. Added unit tests for LevelingDownsampler and ReservoirDownsampler as these are the only two complex ones. All of the others are trivially obviously correct. --- .../sting/gatk/downsampling/Downsampler.java | 11 +++++++++++ .../gatk/downsampling/FractionalDownsampler.java | 5 +++++ .../sting/gatk/downsampling/LevelingDownsampler.java | 9 +++++++++ .../gatk/downsampling/PassThroughDownsampler.java | 5 +++++ .../sting/gatk/downsampling/ReservoirDownsampler.java | 5 +++++ .../downsampling/SimplePositionalDownsampler.java | 5 +++++ .../downsampling/LevelingDownsamplerUnitTest.java | 2 ++ .../downsampling/ReservoirDownsamplerUnitTest.java | 1 + 8 files changed, 43 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java index bfac08d35..23b16cff2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -94,6 +94,17 @@ public interface Downsampler { */ public T peekPending(); + /** + * Get the current number of items in this downsampler + * + * This should be the best estimate of the total number of elements that will come out of the downsampler + * were consumeFinalizedItems() to be called immediately after this call. In other words it should + * be number of finalized items + estimate of number of pending items that will ultimately be included as well. 
+ * + * @return a positive integer + */ + public int size(); + /** * Returns the number of items discarded (so far) during the downsampling process * diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java index 266148178..1cede9c33 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java @@ -109,6 +109,11 @@ public class FractionalDownsampler implements ReadsDownsamp return numDiscardedItems; } + @Override + public int size() { + return selectedReads.size(); + } + public void signalEndOfInput() { // NO-OP } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java index a8a808333..4ff729537 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -128,6 +128,15 @@ public class LevelingDownsampler, E> implements Downsampler return numDiscardedItems; } + @Override + public int size() { + int s = 0; + for ( final List l : groups ) { + s += l.size(); + } + return s; + } + public void signalEndOfInput() { levelGroups(); groupsAreFinalized = true; diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java index b06d5f5b4..3aaed6c73 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -89,6 +89,11 @@ public class PassThroughDownsampler implements ReadsDownsam return 0; } + @Override + 
public int size() { + return selectedReads.size(); + } + public void signalEndOfInput() { // NO-OP } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java index 4331fd723..0e6bbfcb6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -156,6 +156,11 @@ public class ReservoirDownsampler implements ReadsDownsampl return numDiscardedItems; } + @Override + public int size() { + return reservoir.size(); + } + public void signalEndOfInput() { // NO-OP } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java index 3da18b2bb..7c6c043c2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java @@ -112,6 +112,11 @@ public class SimplePositionalDownsampler implements ReadsDo return numDiscardedItems; } + @Override + public int size() { + return finalizedReads.size() + reservoir.size(); + } + public void signalEndOfInput() { finalizeReservoir(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java index 3a12c7ce7..972e51dcd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -139,6 +139,7 @@ public class LevelingDownsamplerUnitTest extends BaseTest { Assert.assertTrue(downsampler.peekFinalized() == null && 
downsampler.peekPending() == null); } + final int sizeFromDownsampler = downsampler.size(); List> downsampledStacks = downsampler.consumeFinalizedItems(); Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); @@ -151,6 +152,7 @@ public class LevelingDownsamplerUnitTest extends BaseTest { totalRemainingItems += stack.size(); } + Assert.assertEquals(sizeFromDownsampler, totalRemainingItems); int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java index 74a17189e..022eb02d2 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -115,6 +115,7 @@ public class ReservoirDownsamplerUnitTest extends BaseTest { Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); } + Assert.assertEquals(downsampler.size(), test.expectedNumReadsAfterDownsampling); List downsampledReads = downsampler.consumeFinalizedItems(); Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); From 1b36db8940dcfae5a85cfb501cac4f671ef4f28a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 7 Apr 2013 15:25:52 -0400 Subject: [PATCH 162/226] Make ActiveRegionTraversal robust to excessive coverage -- Add a maximum per sample and overall maximum number of reads held in memory by the ART at any one time. 
Does this in a new TAROrderedReadCache data structure that uses a reservior downsampler to limit the total number of reads to a constant amount. This constant is set to be by default 3000 reads * nSamples to a global maximum of 1M reads, all controlled via the ActiveRegionTraversalParameters annotation. -- Added an integration test and associated excessively covered BAM excessiveCoverage.1.121484835.bam (private/testdata) that checks that the system is operating correctly. -- #resolves GSA-921 --- .../sting/gatk/GenomeAnalysisEngine.java | 9 ++ .../gatk/traversals/TAROrderedReadCache.java | 126 ++++++++++++++++++ .../traversals/TraverseActiveRegions.java | 23 +++- .../ActiveRegionTraversalParameters.java | 16 +++ .../TAROrderedReadCacheUnitTest.java | 107 +++++++++++++++ .../TraverseActiveRegionsUnitTest.java | 3 +- 6 files changed, 276 insertions(+), 8 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 2d8b9cd9a..fed33c1cb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -725,6 +725,15 @@ public class GenomeAnalysisEngine { rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); } + /** + * Purely for testing purposes. 
Do not use unless you absolutely positively know what you are doing (or + * need to absolutely positively kill everyone in the room) + * @param dataSource + */ + public void setReadsDataSource(final SAMDataSource dataSource) { + this.readsDataSource = dataSource; + } + /** * Entry-point function to initialize the samples database from input data and pedigree arguments */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java new file mode 100644 index 000000000..80da8f8eb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCache.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.downsampling.Downsampler; +import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; +import org.broadinstitute.sting.utils.sam.AlignmentStartComparator; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Subsystem to track a list of all reads currently live in the TraverseActiveRegions system, + * while limiting the total number of reads to a maximum capacity. + * + * User: depristo + * Date: 4/7/13 + * Time: 11:23 AM + */ +public class TAROrderedReadCache { + final int maxCapacity; + final Downsampler downsampler; + + /** + * Create a new empty ReadCache + * @param maxCapacity the max capacity of the read cache. + */ + public TAROrderedReadCache(int maxCapacity) { + if ( maxCapacity < 0 ) throw new IllegalArgumentException("maxCapacity must be >= 0 but got " + maxCapacity); + this.maxCapacity = maxCapacity; + this.downsampler = new ReservoirDownsampler(maxCapacity); + } + + /** + * What's the maximum number of reads we'll store in the cache? + * @return a positive integer + */ + public int getMaxCapacity() { + return maxCapacity; + } + + /** + * Add a single read to this cache. Assumed to be in sorted order w.r.t. the previously added reads + * @param read a read to add + */ + public void add(final GATKSAMRecord read) { + if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); + downsampler.submit(read); + } + + /** + * Add a collection of reads to this cache. Assumed to be in sorted order w.r.t. the previously added reads and each other + * @param reads a collection of reads to add + */ + public void addAll(final List reads) { + if ( reads == null ) throw new IllegalArgumentException("Reads cannot be null"); + downsampler.submit(reads); + } + + /** + * How many reads are currently in the cache? 
+ * @return a positive integer + */ + public int size() { + return downsampler.size(); + } + + /** + * How many reads were discarded since the last call to popCurrentReads + * @return + */ + public int getNumDiscarded() { + return downsampler.getNumberOfDiscardedItems(); + } + + /** + * Removes all reads currently in the cache, and returns them in sorted order (w.r.t. alignmentStart) + * + * Flushes this cache, so after this call the cache will contain no reads and all downsampling stats will + * be reset. + * + * @return a list of GATKSAMRecords in this cache + */ + public List popCurrentReads() { + final List maybeUnordered = downsampler.consumeFinalizedItems(); + + final List ordered; + if ( downsampler.getNumberOfDiscardedItems() == 0 ) { + // haven't discarded anything, so the reads are ordered properly + ordered = maybeUnordered; + } else { + // we need to sort these damn things: O(n log n) + ordered = new ArrayList(maybeUnordered); + Collections.sort(ordered, new AlignmentStartComparator()); + } + + // reset the downsampler stats so getNumberOfDiscardedItems is 0 + downsampler.reset(); + return ordered; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 908755a24..1daaaf1da 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.activeregion.*; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; @@ 
-78,7 +79,8 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); - private LinkedList myReads = new LinkedList(); + private TAROrderedReadCache myReads = null; + private GenomeLoc spanOfLastReadSeen = null; private ActivityProfile activityProfile = null; int maxReadsInMemory = 0; @@ -117,6 +119,10 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine walker) { - final Iterator liveReads = myReads.iterator(); - while ( liveReads.hasNext() ) { + final List stillLive = new LinkedList(); + for ( final GATKSAMRecord read : myReads.popCurrentReads() ) { boolean killed = false; - final GATKSAMRecord read = liveReads.next(); final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); if( activeRegion.getLocation().overlapsP( readLoc ) ) { activeRegion.add(read); if ( ! walker.wantsNonPrimaryReads() ) { - liveReads.remove(); killed = true; } } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { activeRegion.add( read ); } + // if the read hasn't already been killed, check if it cannot occur in any more active regions, and maybe kill it if ( ! killed && readCannotOccurInAnyMoreActiveRegions(read, activeRegion) ) { - liveReads.remove(); + killed = true; } + + // keep track of all of the still live active regions + if ( ! killed ) stillLive.add(read); } + myReads.addAll(stillLive); if ( logger.isDebugEnabled() ) { logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive() ? 
"active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReadSpanLoc()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java index cdb45db7b..5560946ea 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionTraversalParameters.java @@ -78,4 +78,20 @@ public @interface ActiveRegionTraversalParameters { * @return the breadth of the band pass gaussian kernel we want for our traversal */ public double bandPassSigma() default BandPassActivityProfile.DEFAULT_SIGMA; + + /** + * What is the maximum number of reads we're willing to hold in memory per sample + * during the traversal? This limits our exposure to unusually large amounts + * of coverage in the engine. + * @return the maximum number of reads we're willing to hold in memory + */ + public int maxReadsToHoldInMemoryPerSample() default 3000; + + /** + * No matter what the per sample value says, we will never hold more than this + * number of reads in memory at any time. Provides an upper bound on the total number + * of reads in the case where we have a lot of samples. 
+ * @return the maximum number of reads to hold in memory + */ + public int maxReadsToHoldTotal() default 1000000; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java new file mode 100644 index 000000000..f3e1ce44b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class TAROrderedReadCacheUnitTest extends BaseTest { + // example fasta index file, can be deleted if you don't use the reference + private IndexedFastaSequenceFile seq; + + @BeforeClass + public void setup() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + } + + @DataProvider(name = "ReadCacheTestData") + public Object[][] makeReadCacheTestData() { + List tests = new ArrayList(); + + for ( final int nReadsPerLocus : Arrays.asList(0, 1, 10, 100) ) { + for ( final int nLoci : Arrays.asList(1, 10, 100) ) { + for ( final int max : Arrays.asList(10, 50, 1000) ) { + for ( final boolean addAllAtOnce : Arrays.asList(true, false) ) { + tests.add(new Object[]{nReadsPerLocus, nLoci, max, addAllAtOnce}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadCacheTestData") + public void testReadCache(final int nReadsPerLocus, final int nLoci, final int max, final boolean addAllAtOnce) { + final TAROrderedReadCache cache = new TAROrderedReadCache(max); + + Assert.assertEquals(cache.getMaxCapacity(), max); + Assert.assertEquals(cache.getNumDiscarded(), 0); + Assert.assertEquals(cache.size(), 0); + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(seq, nReadsPerLocus, nLoci); + final List reads = 
bamBuilder.makeReads(); + + if ( addAllAtOnce ) { + cache.addAll(reads); + } else { + for ( final GATKSAMRecord read : reads ) { + cache.add(read); + } + } + + final int nTotalReads = reads.size(); + final int nExpectedToKeep = Math.min(nTotalReads, max); + final int nExpectedToDiscard = nTotalReads - nExpectedToKeep; + Assert.assertEquals(cache.getNumDiscarded(), nExpectedToDiscard, "wrong number of reads discarded"); + Assert.assertEquals(cache.size(), nExpectedToKeep, "wrong number of reads kept"); + + final List cacheReads = cache.popCurrentReads(); + Assert.assertEquals(cache.size(), 0, "Should be no reads left"); + Assert.assertEquals(cache.getNumDiscarded(), 0, "should have reset stats"); + Assert.assertEquals(cacheReads.size(), nExpectedToKeep, "should have 1 read for every read we expected to keep"); + + int lastStart = -1; + for ( final GATKSAMRecord read : cacheReads ) { + Assert.assertTrue(lastStart <= read.getAlignmentStart(), "Reads should be sorted but weren't. Found read with start " + read.getAlignmentStart() + " while last was " + lastStart); + lastStart = read.getAlignmentStart(); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 0384260fa..b6106d4bc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -470,7 +470,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, File bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); - traverseActiveRegions.initialize(engine, walker); Collection samFiles = new ArrayList(); SAMReaderID readerID = new 
SAMReaderID(bamFile, new Tags()); @@ -486,8 +485,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { new ArrayList(), false, (byte)30, false, true); + engine.setReadsDataSource(dataSource); final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); + traverseActiveRegions.initialize(engine, walker); List providers = new ArrayList(); for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) { From 3960733c88bf1965a3b1a20eba630588e84ea318 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 9 Apr 2013 08:19:52 -0400 Subject: [PATCH 163/226] Fix PrintReads out of space issue Problem: -------- Print Reads was running out of disk space when using the -BQSR option even for small bam files Solution: --------- Configure setupWriter to expect pre sorted reads --- .../broadinstitute/sting/gatk/walkers/readutils/PrintReads.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java index add567b36..a28523369 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/PrintReads.java @@ -190,7 +190,7 @@ public class PrintReads extends ReadWalker impleme final boolean preSorted = true; if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { - Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME); + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME); } } From 33ecec535d5a1f86ba399d611aca922f07f8fc5e Mon Sep 17 
00:00:00 2001 From: Mark DePristo Date: Tue, 9 Apr 2013 09:52:56 -0400 Subject: [PATCH 164/226] Turn off the LD merging code by default -- It's just too hard to interpret the called variation when we merge variants via LD. -- Can now be turned on with -mergeVariantsViaLD -- Update MD5s --- .../gatk/walkers/haplotypecaller/HaplotypeCaller.java | 8 +++----- ...peCallerComplexAndSymbolicVariantsIntegrationTest.java | 2 +- .../haplotypecaller/HaplotypeCallerIntegrationTest.java | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index c52892373..8a3cacdcd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -282,8 +282,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); @Advanced - @Argument(fullName="dontMergeVariantsViaLD", shortName="dontMergeVariantsViaLD", doc="If specified, we will include low quality bases when doing the assembly", required = false) - protected boolean dontMergeVariantsViaLD = false; + @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) + protected boolean mergeVariantsViaLD = false; /** * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. 
@@ -433,9 +433,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - final MergeVariantsAcrossHaplotypes variantMerger = dontMergeVariantsViaLD - ? new MergeVariantsAcrossHaplotypes() - : new LDMerger(DEBUG, 10, 1); + final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger ); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index ff2b3d0b6..d6fb3b70a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "80b9280b1e65952f60ba2fd738d4840f"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6b673efb6f12b5deebb3e63fe94c48ed"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 5fc8c5622..7b7f4d9cc 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("e30b974b038293841e6be23c93ce76e1")); + Arrays.asList("020b1a4feb82f050894f6066dc07cc4a")); executeTest("HC calling on a ReducedRead BAM", spec); } From 51954ae3e590f89b5d8812dd58fb2f867481ad0f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 9 Apr 2013 15:18:42 -0400 Subject: [PATCH 165/226] HaplotypeCaller doesn't support EXACT_GENERAL_PLOIDY model -- HC now throws a UserException if this model is provided. Documented this option as not being supported in the HC in the docs for EXACT_GENERAL_PLOIDY --- .../sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java | 2 +- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index d4bb3cab3..6dffa8a6d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -79,7 +79,7 @@ public class AFCalcFactory { /** original biallelic exact model, for testing only */ EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2), - /** implementation that supports any sample ploidy */ + /** implementation that supports any sample ploidy. 
Currently not available for the HaplotypeCaller */ EXACT_GENERAL_PLOIDY("GeneralPloidyExactAFCalc", -1, -1); /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 8a3cacdcd..55490a1cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -67,6 +67,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalcul import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; @@ -366,6 +367,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem public void initialize() { super.initialize(); + if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) + throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel); + // get all of the unique sample names Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); samplesList.addAll( samples ); From b115e5c582bb2ff9a32c1195071c09ca1ef401e9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 8 Apr 2013 14:00:51 -0400 Subject: [PATCH 166/226] Critical bugfix for CommonSuffixSplitter to avoid infinite loops -- The previous version would enter into an infinite loop in the case where we have a graph that looks like: X -> A -> B Y -> A -> B So that the incoming vertices of B all 
have the same sequence. This would cause us to remodel the graph endless by extracting the common sequence A and rebuilding exactly the same graph. Fixed and unit tested -- Additionally add a max to the number of simplification cycles that are run (100), which will throw an error and write out the graph for future debugging. So the GATK will always error out, rather than just go on forever -- After 5 rounds of simplification we start keeping a copy of the previous graph, and then check if the current graph is actually different from the previous graph. Equals here means that all vertices have equivalents in both graphs, as do all edges. If the two graphs are equal we stop simplifying. It can be a bit expensive but it only happens when we end up cycling due to the structure of the graph. -- Added a unittest that goes into an infinite loop (found empirically in running the CEU trio) and confirmed that the new approach aborts out correctly -- #resolves GSA-924 -- See https://jira.broadinstitute.org/browse/GSA-924 for more details -- Update MD5s due to change in assembly graph construction --- .../graphs/CommonSuffixSplitter.java | 33 ++++++-- .../haplotypecaller/graphs/SeqGraph.java | 79 +++++++++++++------ ...lexAndSymbolicVariantsIntegrationTest.java | 4 +- .../HaplotypeCallerIntegrationTest.java | 2 +- .../graphs/CommonSuffixSplitterUnitTest.java | 36 ++++++++- .../graphs/SeqGraphUnitTest.java | 18 +++++ 6 files changed, 134 insertions(+), 38 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java index 371d5b7e3..e37fbb281 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java @@ -90,21 +90,23 @@ public class 
CommonSuffixSplitter { if ( v == null ) throw new IllegalArgumentException("v cannot be null"); if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex v " + v); - final Collection toMerge = graph.incomingVerticesOf(v); - if ( toMerge.size() < 2 ) + final Collection toSplit = graph.incomingVerticesOf(v); + if ( toSplit.size() < 2 ) // Can only split at least 2 vertices return false; - else if ( ! safeToSplit(graph, v, toMerge) ) { + else if ( ! safeToSplit(graph, v, toSplit) ) { return false; } else { - final SeqVertex suffixVTemplate = commonSuffix(toMerge); + final SeqVertex suffixVTemplate = commonSuffix(toSplit); if ( suffixVTemplate.isEmpty() ) { return false; + } else if ( allVerticesAreTheCommonSuffix(suffixVTemplate, toSplit) ) { + return false; } else { final List edgesToRemove = new LinkedList(); // graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." + counter + ".dot"), 0); - for ( final SeqVertex mid : toMerge ) { + for ( final SeqVertex mid : toSplit ) { // create my own copy of the suffix final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence()); graph.addVertex(suffixV); @@ -130,7 +132,7 @@ public class CommonSuffixSplitter { } } - graph.removeAllVertices(toMerge); + graph.removeAllVertices(toSplit); graph.removeAllEdges(edgesToRemove); // graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0); @@ -141,6 +143,25 @@ public class CommonSuffixSplitter { // private static int counter = 0; + /** + * Would all vertices that we'd split just result in the common suffix? + * + * That is, suppose we have prefix nodes ABC and ABC. After splitting all of the vertices would + * just be ABC again, and we'd enter into an infinite loop. 
+ * + * @param commonSuffix the common suffix of all vertices in toSplits + * @param toSplits the collection of vertices we want to split + * @return true if all of the vertices are equal to the common suffix + */ + private boolean allVerticesAreTheCommonSuffix(final SeqVertex commonSuffix, final Collection toSplits) { + for ( final SeqVertex toSplit : toSplits ) { + if ( toSplit.length() != commonSuffix.length() ) + return false; + } + + return true; + } + /** * Can we safely split up the vertices in toMerge? * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index 8c78d8515..bb4b26257 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -72,6 +72,12 @@ public final class SeqGraph extends BaseGraph { */ protected final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10; + /** + * How many cycles of the graph simplifications algorithms will we run before + * thinking something has gone wrong and throw an exception? + */ + private final static int MAX_REASONABLE_SIMPLIFICATION_CYCLES = 100; + /** * Construct an empty SeqGraph */ @@ -101,29 +107,56 @@ public final class SeqGraph extends BaseGraph { } protected void simplifyGraph(final int maxCycles) { - boolean didSomeWork; - int i = 0; - // start off with one round of zipping of chains for performance reasons zipLinearChains(); - do { - //logger.info("simplifyGraph iteration " + i); - // iterate until we haven't don't anything useful - didSomeWork = false; - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." 
+ i + ".1.dot"), 0); - didSomeWork |= new MergeDiamonds().transformUntilComplete(); - didSomeWork |= new MergeTails().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".2.diamonds_and_tails.dot"), 0); - didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".3.split_suffix.dot"), 0); - didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); - if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + i + ".4.merge_suffix.dot"), 0); + SeqGraph prevGraph = null; + for( int i = 0; i < maxCycles; i++ ) { + if ( i > MAX_REASONABLE_SIMPLIFICATION_CYCLES ) { + logger.warn("Infinite loop detected in simpliciation routines. Writing current graph to debugMeMark.dot"); + printGraph(new File("debugMeMark.dot"), 0); + throw new IllegalStateException("Infinite loop detected in simplification routines for kmer graph " + getKmerSize()); + } - didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); - didSomeWork |= zipLinearChains(); - i++; - } while (didSomeWork && i < maxCycles); + final boolean didSomeWork = simplifyGraphOnce(i); + if ( ! didSomeWork ) + // no simplification algorithm could run, so stop + break; + + // we get five cycles before we start looking for changes in the graph + // by cloning ourselves and then checking for any changes + if ( i > 5 ) { + // the previous graph and this graph have the same structure, so the simplification + // algorithms are looping endless between states. 
Just break and consider ourselves done + if ( prevGraph != null && graphEquals(prevGraph, this) ) + break; + + prevGraph = (SeqGraph)clone(); + } + } + } + + /** + * Run one full cycle of the graph simplification algorithms + * @return true if any algorithms said they did some simplification + */ + private boolean simplifyGraphOnce(final int iteration) { + //logger.info("simplifyGraph iteration " + i); + // iterate until we haven't don't anything useful + boolean didSomeWork = false; + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0); + didSomeWork |= new MergeDiamonds().transformUntilComplete(); + didSomeWork |= new MergeTails().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0); + + didSomeWork |= new SplitCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0); + didSomeWork |= new MergeCommonSuffices().transformUntilComplete(); + if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0); + + didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete(); + didSomeWork |= zipLinearChains(); + return didSomeWork; } /** @@ -431,15 +464,13 @@ public final class SeqGraph extends BaseGraph { * * Performs the transformation: * - * { x + S_i + y -> Z } + * { x + S_i -> y -> Z } * * goes to: * - * { x -> S_i -> y -> Z } + * { x -> S_i -> y + Z } * * for all nodes that match this configuration. 
- * - * Differs from the diamond transform in that no top node is required */ protected class MergeCommonSuffices extends VertexBasedTransformer { @Override @@ -449,8 +480,6 @@ public final class SeqGraph extends BaseGraph { } /** - * Merge headless configurations: - * * Performs the transformation: * * { x + S_i + y -> Z } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index d6fb3b70a..6d85421c4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -63,8 +63,8 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } @Test - public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6b673efb6f12b5deebb3e63fe94c48ed"); + public void testHaplotypeCallerMultiSampleComplex1() { + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "490ecf6619740c01c81a463392ef23cf"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 7b7f4d9cc..573cc83fd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -166,7 +166,7 @@ public class 
HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6fd839641ee038048626fbd1154f173")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java index 8006cb18d..1ed20e5f4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java @@ -102,8 +102,8 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { public void testSplitNoCycles() { final SeqGraph original = new SeqGraph(); final SeqVertex v1 = new SeqVertex("A"); - final SeqVertex v2 = new SeqVertex("C"); - final SeqVertex v3 = new SeqVertex("C"); + final SeqVertex v2 = new SeqVertex("AC"); + final SeqVertex v3 = new SeqVertex("TC"); final SeqVertex v4 = new SeqVertex("G"); original.addVertices(v1, v2, v3, v4); @@ -116,7 +116,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { Assert.assertFalse(new CommonSuffixSplitter().split(original, v4), "Cannot split graph with a cycle of the bottom list"); } - @Test(timeOut = 10000) + @Test(timeOut = 10000, enabled = !DEBUG) public void testSplitComplexCycle() { final SeqGraph original = new SeqGraph(); final SeqVertex r1 = new SeqVertex("ACTG"); @@ -130,7 +130,7 @@ public class 
CommonSuffixSplitterUnitTest extends BaseTest { original.addEdges(r1, cat1, c1, cat2, c1); original.addEdges(r2, c2, cat2); - original.printGraph(new File("testSplitComplexCycle.dot"), 0); + //original.printGraph(new File("testSplitComplexCycle.dot"), 0); for ( final SeqVertex v : Arrays.asList(cat2) ) { // original.vertexSet() ) { final SeqGraph graph = (SeqGraph)original.clone(); @@ -139,4 +139,32 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { Assert.assertFalse(success, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v); } } + + @Test(timeOut = 10000) + public void testSplitInfiniteCycleFailure() { + final SeqGraph original = new SeqGraph(); + final SeqVertex v1 = new SeqVertex("GC"); + final SeqVertex v2 = new SeqVertex("X"); + final SeqVertex v3 = new SeqVertex("N"); + final SeqVertex v4 = new SeqVertex("C"); + + original.addVertices(v1, v2, v3, v4); + original.addEdge(v1, v2, new BaseEdge(false, 12)); + original.addEdge(v2, v3, new BaseEdge(false, 23)); + original.addEdge(v3, v4, new BaseEdge(false, 34)); + original.addEdge(v4, v2, new BaseEdge(false, 42)); + + original.printGraph(new File("testSplitInfiniteCycleFailure.dot"), 0); + + final SeqGraph graph = (SeqGraph)original.clone(); + final boolean success = new CommonSuffixSplitter().split(graph, v2); + Assert.assertTrue(success); + + for ( final SeqVertex v : graph.vertexSet() ) { + graph.printGraph(new File("testSplitInfiniteCycleFailure.first_split.dot"), 0); + final boolean success2 = new CommonSuffixSplitter().split((SeqGraph)graph.clone(), v); + if ( success2 ) graph.printGraph(new File("testSplitInfiniteCycleFailure.fail.dot"), 0); + Assert.assertFalse(success2, "Shouldn't be able to split any vertices but CommonSuffixSplitter says it could for " + v); + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index 42137e4e4..bd2e3cc2c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -521,4 +521,22 @@ public class SeqGraphUnitTest extends BaseTest { throw e; } } + + @Test(timeOut = 10000) + public void testInfiniteCycleFromEmpiricalRuns() { + final SeqVertex v1 = new SeqVertex("CCCT"); + final SeqVertex v2 = new SeqVertex("CATCCTCCCTTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCC"); + final SeqVertex v3 = new SeqVertex("CTAGACTTCTCCTCCTCCTCC"); + final SeqVertex v4 = new SeqVertex("ACCATC"); + final SeqVertex v5 = new SeqVertex("CCTCCACCATCCTCCCCTCTAGGCTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATC"); + final SeqVertex v6 = new SeqVertex("CTCCCCT"); + + final SeqGraph graph = new SeqGraph(); + graph.addVertices(v1, v2, v3, v4, v5, v6); + graph.addEdges(v1, v3, v4, v6, v3); + graph.addEdges(v2, v4); + graph.addEdges(v5, v6); + + graph.simplifyGraph(); + } } From 850be5e9da3131eb056758d4d380924d1d000fbe Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 10 Apr 2013 11:23:29 -0400 Subject: [PATCH 168/226] Bug fix in SWPairwiseAlignment. -- When the alignments are sufficiently apart from each other all the scores in the sw matrix could be negative which screwed up the max score calculation since it started at zero. 
--- .../utils/SWPairwiseAlignmentUnitTest.java | 32 +++++++++---------- .../sting/utils/SWPairwiseAlignment.java | 5 ++- .../HaplotypeBAMWriterUnitTest.java | 6 ---- 3 files changed, 18 insertions(+), 25 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java index 6d3c310b7..c55b4147d 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java @@ -72,22 +72,22 @@ public class SWPairwiseAlignmentUnitTest extends BaseTest { Assert.assertEquals(sw.getCigar().toString(), expectedCigar); } - // TODO - // TODO - // TODO this example demonstrates some kind of failure mode of SW that results in the read not being aligned - // TODO to the reference at all. It has something to do with the specific parameters provided to the - // TODO SW code. With the default parameters the result is the one expected. 
With the specified parameters - // TODO the code fails - // TODO - // TODO - @Test(enabled = false) - public void testOddNoAlignment() { - final String reference = "AAAGACTACTG"; - final String read = "AACGGACACTG"; - final int expectedStart = 0; - final String expectedCigar = "11M"; - final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes(), 5.0, -10.0, -22.0, -1.2); - sw.printAlignment(reference.getBytes(), read.getBytes()); + @DataProvider(name = "OddNoAlignment") + public Object[][] makeOddNoAlignment() { + List tests = new ArrayList(); + + final String ref1 = "AAAGACTACTG"; + final String read1 = "AACGGACACTG"; + tests.add(new Object[]{ref1, read1, 5.0, -10.0, -22.0, -1.2, 1, "2M2I3M1D4M"}); + tests.add(new Object[]{ref1, read1, 20.0, -5.0, -30.0, -2.2, 0, "11M"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "OddNoAlignment", enabled = true) + public void testOddNoAlignment(final String reference, final String read, final double match, final double mismatch, final double gap, final double gap_extend, + final int expectedStart, final String expectedCigar) { + final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes(), match, mismatch, gap, gap_extend); Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart); Assert.assertEquals(sw.getCigar().toString(), expectedCigar); } diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java index e501cf40a..6c8beb32d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java @@ -205,10 +205,9 @@ public final class SWPairwiseAlignment { private void calculateCigar(int n, int m, double [] sw, int [] btrack) { // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order - 
//PrimitivePair.Int p = new PrimitivePair.Int(); int p1 = 0, p2 = 0; - double maxscore = 0.0; + double maxscore = Double.NEGATIVE_INFINITY; // sw scores are allowed to be negative int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) // look for largest score. we use >= combined with the traversal direction @@ -259,7 +258,7 @@ public final class SWPairwiseAlignment { // move to next best location in the sw matrix: switch( new_state ) { - case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in th esw matrix + case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in the sw matrix case ISTATE: data_offset -= step_length; p2 -= step_length; break; // move left case DSTATE: data_offset -= (m+1)*step_length; p1 -= step_length; break; // move up } diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java index 89d87a3c3..db16582b8 100644 --- a/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java @@ -127,12 +127,6 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { } } - // test that reads without a good alignment to hap get excluded - { - final GATKSAMRecord read = makeRead("NNNNN"); - tests.add(new Object[]{read, allM, 10, -1, null}); - } - // example case of bad alignment because SW doesn't necessarily left-align indels { final String hap = 
"ACTGTGGGTTCCTCTTATTTTATTTCTACATCAATGTTCATATTTAACTTATTATTTTATCTTATTTTTAAATTTCTTTTATGTTGAGCCTTGATGAAAGCCATAGGTTCTCTCATATAATTGTATGTGTATGTATGTATATGTACATAATATATACATATATGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTGTATTACATAATATATACATATATGTATATATTATGTATATGTACATAATATATACATATATG"; From bf42be44fcfd618b34fa410b9331dcdb6134029b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 8 Apr 2013 07:17:53 -0400 Subject: [PATCH 169/226] Fast DeBruijnGraph creation using the kmer counter -- The previous creation algorithm used the following algorithm: for each kmer1 -> kmer2 in each read add kmers 1 and 2 to the graph add edge kmer1 -> kmer2 in the graph, if it's not present (does check) update edge count by 1 if kmer1 -> kmer2 already existed in the graph -- This algorithm had O(reads * kmers / read * (getEdge cost + addEdge cost)). This is actually pretty expensive because get and add edges is expensive in jgrapht. -- The new approach uses the following algorithm: for each kmer1 -> kmer2 in each read add kmers 1 and 2 to a kmer counter, that counts kmer1+kmer2 in a fast hashmap for each kmer pair 1 and 2 in the hash counter add edge kmer1 -> kmer2 in the graph, if it's not present (does check) with multiplicity count from map update edge count by count from map if kmer1 -> kmer2 already existed in the graph -- This algorithm ensures that we add very much fewer edges -- Additionally, created a fast kmer class that lets us create kmers from larger byte[]s of bases without cutting up the byte[] itself. 
-- Overall runtimes are greatly reduced using this algorithm --- .../haplotypecaller/DeBruijnAssembler.java | 33 +-- .../DeBruijnGraphBuilder.java} | 84 ++++--- .../walkers/haplotypecaller/KMerCounter.java | 91 +++++--- .../gatk/walkers/haplotypecaller/Kmer.java | 207 ++++++++++++++++++ .../haplotypecaller/graphs/DeBruijnGraph.java | 19 -- .../haplotypecaller/KmerCounterUnitTest.java | 84 +++++++ .../walkers/haplotypecaller/KmerUnitTest.java | 133 +++++++++++ 7 files changed, 555 insertions(+), 96 deletions(-) rename protected/java/{test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java => src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java} (75%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 11701a73b..b19b1f8d7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -252,14 +252,15 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { @Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"}) protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype ) { final DeBruijnGraph graph = new DeBruijnGraph(kmerLength); + final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph); // First pull kmers from the reference haplotype and add them to the graph - if ( !
addReferenceKmersToGraph(graph, refHaplotype.getBases()) ) + if ( ! addReferenceKmersToGraph(builder, refHaplotype.getBases()) ) // something went wrong, so abort right now with a null graph return null; // now go through the graph already seeded with the reference sequence and add the read kmers to it - if ( ! addReadKmersToGraph(graph, reads) ) + if ( ! addReadKmersToGraph(builder, reads) ) // some problem was detected adding the reads to the graph, return null to indicate we failed return null; @@ -270,12 +271,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { /** * Add the high-quality kmers from the reads to the graph * - * @param graph a graph to add the read kmers to + * @param builder a debruijn graph builder to add the read kmers to * @param reads a non-null list of reads whose kmers we want to add to the graph * @return true if we successfully added the read kmers to the graph without corrupting it in some way */ - protected boolean addReadKmersToGraph(final DeBruijnGraph graph, final List reads) { - final int kmerLength = graph.getKmerSize(); + protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List reads) { + final int kmerLength = builder.getKmerSize(); // Next pull kmers out of every read and throw them on the graph for( final GATKSAMRecord read : reads ) { @@ -285,6 +286,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { if( sequence.length > kmerLength + KMER_OVERLAP ) { final int kmersInSequence = sequence.length - kmerLength + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { + // TODO -- this is quite expense as it does O(kmerLength^2 work) per read // if the qualities of all the bases in the kmers are high enough boolean badKmer = false; for( int jjj = iii; jjj < iii + kmerLength + 1; jjj++) { @@ -304,12 +306,14 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + kmerLength)); } - 
graph.addKmerPairFromSeqToGraph(sequence, iii, false, countNumber); + builder.addKmerPairFromSeqToGraph(sequence, iii, countNumber); } } } } + builder.flushKmersToGraph(false); + // always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way return true; } @@ -317,17 +321,17 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { /** * Add the kmers from the reference sequence to the DeBruijnGraph * - * @param graph the graph to add the reference kmers to. Must be empty + * @param builder the graph to add the reference kmers to. Must be empty * @param refSequence the reference sequence from which we'll get our kmers * @return true if we succeeded in creating a good graph from the reference sequence, false otherwise */ - protected boolean addReferenceKmersToGraph(final DeBruijnGraph graph, final byte[] refSequence) { - if ( graph == null ) throw new IllegalArgumentException("graph cannot be null"); - if ( graph.vertexSet().size() != 0 ) throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + graph.vertexSet().size() + " vertices in it already: " + graph); + protected boolean addReferenceKmersToGraph(final DeBruijnGraphBuilder builder, final byte[] refSequence) { + if ( builder == null ) throw new IllegalArgumentException("graph cannot be null"); + if ( builder.getGraph().vertexSet().size() != 0 ) + throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + builder.getGraph().vertexSet().size() + " vertices in it already: " + builder.getGraph()); if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null"); - - final int kmerLength = graph.getKmerSize(); + final int kmerLength = builder.getKmerSize(); if( refSequence.length < kmerLength + KMER_OVERLAP ) { // not enough reference sequence to build a kmer graph of this length, return null return false; @@ 
-335,11 +339,12 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final int kmersInSequence = refSequence.length - kmerLength + 1; for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - graph.addKmerPairFromSeqToGraph(refSequence, iii, true, 1); + builder.addKmerPairFromSeqToGraph(refSequence, iii, 1); } + builder.flushKmersToGraph(true); // we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices - if ( graph.vertexSet().size() != kmersInSequence ) { + if ( builder.getGraph().vertexSet().size() != kmersInSequence ) { if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping"); return false; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java similarity index 75% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java index 56197047b..418174e64 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterUnitTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java @@ -46,39 +46,65 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; -public class KMerCounterUnitTest extends BaseTest { - @Test - public void testMyData() { - final KMerCounter counter = new KMerCounter(3); +/** + * User: depristo + * Date: 4/7/13 + * Time: 4:14 PM + */ +public class DeBruijnGraphBuilder { + private final static Logger logger = 
Logger.getLogger(DeBruijnGraphBuilder.class); - Assert.assertNotNull(counter.toString()); + private final int kmerSize; + private final DeBruijnGraph graph; + private final KmerCounter counter; - counter.addKmers( - "ATG", "ATG", "ATG", "ATG", - "ACC", "ACC", "ACC", - "AAA", "AAA", - "CTG", - "NNA", - "CCC" - ); - - testCounting(counter, "ATG", 4); - testCounting(counter, "ACC", 3); - testCounting(counter, "AAA", 2); - testCounting(counter, "CTG", 1); - testCounting(counter, "NNA", 1); - testCounting(counter, "CCC", 1); - testCounting(counter, "NNN", 0); - testCounting(counter, "NNC", 0); - - Assert.assertNotNull(counter.toString()); + public DeBruijnGraphBuilder(final DeBruijnGraph graph) { + if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null"); + this.kmerSize = graph.getKmerSize(); + this.graph = graph; + this.counter = new KmerCounter(kmerSize + 1); } - private void testCounting(final KMerCounter counter, final String in, final int expectedCount) { - Assert.assertEquals(counter.getKmerCount(in.getBytes()), expectedCount); + public DeBruijnGraph getGraph() { + return graph; + } + + public int getKmerSize() { + return kmerSize; + } + + /** + * Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this + * graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). 
The + * edge between them is added with isRef and multiplicity + * + * @param sequence a sequence of bases from which we want to extract a pair of kmers + * @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize() + * @param multiplicity what's the multiplicity of the edge between these two kmers + */ + public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final int multiplicity ) { + if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); + if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start); + if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length); + final Kmer kmerPair = new Kmer(sequence, start, getKmerSize() + 1); + addKmerPair(kmerPair, multiplicity); + } + + + public void addKmerPair(final Kmer kmerPair, final int multiplicity) { + if ( kmerPair.length() != kmerSize + 1 ) throw new IllegalArgumentException("kmer pair must be of length kmerSize + 1 = " + kmerSize + 1 + " but got " + kmerPair.length()); + counter.addKmer(kmerPair, multiplicity); + } + + public void flushKmersToGraph(final boolean addRefEdges) { + for ( final KmerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) { + final byte[] first = countedKmer.getKmer().subKmer(0, kmerSize).bases(); + final byte[] second = countedKmer.getKmer().subKmer(1, kmerSize).bases(); + graph.addKmersToGraph(first, second, addRefEdges, countedKmer.getCount()); + } + counter.clear(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java index 1f0903753..38d829a87 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java @@ -46,9 +46,9 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import org.apache.log4j.Logger; - -import java.util.*; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; /** * generic utility class that counts kmers @@ -59,13 +59,13 @@ import java.util.*; * Date: 3/8/13 * Time: 1:16 PM */ -public class KMerCounter { - private final static Logger logger = Logger.getLogger(KMerCounter.class); +public class KmerCounter { + //private final static Logger logger = Logger.getLogger(KmerCounter.class); /** * A map of for each kmer to its num occurrences in addKmers */ - private final Map countsByKMer = new HashMap(); + private final Map countsByKMer = new HashMap(); private final int kmerLength; /** @@ -73,70 +73,72 @@ public class KMerCounter { * * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1 */ - public KMerCounter(final int kmerLength) { + public KmerCounter(final int kmerLength) { if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength); this.kmerLength = kmerLength; } - /** - * For testing purposes - * - * @param kmers - */ - protected void addKmers(final String ... kmers) { - for ( final String kmer : kmers ) - addKmer(kmer, 1); - } - /** * Get the count of kmer in this kmer counter * @param kmer a non-null counter to get * @return a positive integer */ - public int getKmerCount(final byte[] kmer) { + public int getKmerCount(final Kmer kmer) { if ( kmer == null ) throw new IllegalArgumentException("kmer cannot be null"); - final CountedKmer counted = countsByKMer.get(new String(kmer)); + final CountedKmer counted = countsByKMer.get(kmer); return counted == null ? 
0 : counted.count; } + public Collection getCountedKmers() { + return countsByKMer.values(); + } + + public void clear() { + countsByKMer.clear(); + } + /** * Add a kmer that occurred kmerCount times * - * @param rawKmer a kmer + * @param kmer a kmer * @param kmerCount the number of occurrences */ - public void addKmer(final byte[] rawKmer, final int kmerCount) { - addKmer(new String(rawKmer), kmerCount); - } - - protected void addKmer(final String rawKmer, final int kmerCount) { - if ( rawKmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + rawKmer + " expected size " + kmerLength); + public void addKmer(final Kmer kmer, final int kmerCount) { + if ( kmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + kmer + " expected size " + kmerLength); if ( kmerCount < 0 ) throw new IllegalArgumentException("bad kmerCount " + kmerCount); - CountedKmer countFromMap = countsByKMer.get(rawKmer); + CountedKmer countFromMap = countsByKMer.get(kmer); if ( countFromMap == null ) { - countFromMap = new CountedKmer(rawKmer); - countsByKMer.put(rawKmer, countFromMap); + countFromMap = new CountedKmer(kmer); + countsByKMer.put(kmer, countFromMap); } countFromMap.count += kmerCount; } @Override public String toString() { - final StringBuilder b = new StringBuilder("KMerCounter{"); + final StringBuilder b = new StringBuilder("KmerCounter{"); b.append("counting ").append(countsByKMer.size()).append(" distinct kmers"); b.append("\n}"); return b.toString(); } - private static class CountedKmer implements Comparable { - final String kmer; - int count; + protected static class CountedKmer implements Comparable { + final Kmer kmer; + int count = 0; - private CountedKmer(String kmer) { + private CountedKmer(final Kmer kmer) { this.kmer = kmer; } + public Kmer getKmer() { + return kmer; + } + + public int getCount() { + return count; + } + @Override public String toString() { return "CountedKmer{" + @@ -150,4 +152,25 @@ public 
class KMerCounter { return o.count - count; } } + + // ------------------------------------------------------------------------------------- + // Protected methods for testing purposes only + // ------------------------------------------------------------------------------------- + + /** + * For testing purposes only + */ + protected void addKmer(final String rawKmer, final int kmerCount) { + addKmer(new Kmer(rawKmer), kmerCount); + } + + /** + * For testing purposes + * + * @param kmers + */ + protected void addKmers(final String ... kmers) { + for ( final String kmer : kmers ) + addKmer(kmer, 1); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java new file mode 100644 index 000000000..9b0e1ac0a --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java @@ -0,0 +1,207 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import java.util.Arrays; + +/** + * Fast wrapper for byte[] kmers + * + * This objects has several important features that make it better than using a raw byte[] for a kmer: + * + * -- Can create kmer from a range of a larger byte[], allowing us to avoid Array.copyOfRange + * -- Fast equals and hashcode methods + * -- can get actual byte[] of the kmer, even if it's from a larger byte[], and this operation + * only does the work of that operation once, updating its internal state + * + * User: depristo + * Date: 4/8/13 + * Time: 7:54 AM + */ +public class Kmer { + // this values may be updated in the course of interacting with this kmer + private byte[] bases; + protected int start; + + // two constants + final protected int length; + final protected int hash; + + /** + * Create a new kmer using all bases in kmer + * @param kmer a non-null byte[] + */ + public Kmer(byte[] kmer) { + this(kmer, 0, kmer.length); + } + + /** + * Create a new kmer based on the string kmer + * + * This is not a good method to use for performance + * + * @param kmer the bases as a string + */ + public Kmer(final String kmer) { + this(kmer.getBytes()); + } + + /** + * Create a new kmer backed by the bases in bases, spanning start -> start + length + * + * Under no circumstances can bases be modified anywhere in the client code. This does not make a copy + * of bases for performance reasons + * + * @param bases an array of bases + * @param start the start of the kmer in bases, must be >= 0 and < bases.length + * @param length the length of the kmer. 
Must be >= 0 and start + length < bases.length + */ + public Kmer(final byte[] bases, final int start, final int length) { + if ( bases == null ) throw new IllegalArgumentException("bases cannot be null"); + if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start); + if ( length < 0 ) throw new IllegalArgumentException("length must be >= 0 but got " + length); + if ( (start + length) > bases.length ) throw new IllegalArgumentException("start + length " + (start + length) + " must be <= bases.length " + bases.length + " but got " + start + " with length " + length); + this.bases = bases; + this.start = start; + this.length = length; + this.hash = myHashCode(bases, start, length); + } + + /** + * Create a new kmer that's a shallow copy of kmer + * @param kmer the kmer to shallow copy + */ + public Kmer(final Kmer kmer) { + this.bases = kmer.bases; + this.start = kmer.start; + this.length = kmer.length; + this.hash = kmer.hash; + } + + /** + * Create a derived shallow kmer that starts at newStart and has newLength bases + * @param newStart the new start of kmer, where 0 means that start of the kmer, 1 means skip the first base + * @param newLength the new length + * @return a new kmer based on the data in this kmer. Does not make a copy, so shares most of the data + */ + public Kmer subKmer(final int newStart, final int newLength) { + return new Kmer(bases, start + newStart, newLength); + } + + /** + * Get the bases of this kmer. May create a copy of the bases, depending on how this kmer was constructed. + * + * Note that this function is efficient in that if it needs to copy the bases this only occurs once. + * + * @return a non-null byte[] containing length() bases of this kmer, regardless of how this kmer was created + */ + public byte[] bases() { + if ( start != 0 || bases.length != length ) { + // update operation. 
Rip out the exact byte[] and update start so we don't ever do this again + bases = Arrays.copyOfRange(bases, start, start + length); + start = 0; + } + + return bases; + } + + /** + * The length of this kmer + * @return an integer >= 0 + */ + public int length() { + return length; + } + + @Override + public String toString() { + return "Kmer{" + new String(bases()) + "}"; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final Kmer kmer = (Kmer) o; + + // very fast test. If hash aren't equal you are done, otherwise compare the bases + if ( hash != kmer.hash ) return false; + if ( length != kmer.length ) return false; + + for ( int i = 0; i < length; i++ ) + if ( bases[start + i] != kmer.bases[kmer.start + i] ) + return false; + + return true; + } + + @Override + public int hashCode() { + return hash; + } + + /** + * Helper method that computes the hashcode for this kmer based only the bases in + * a[], starting at start and running length bases + * + * @param a a non-null bases array + * @param start where to start in bases + * @param length the length of the bases + * @return a hashcode value appropriate for a[start] -> a[start + length] + */ + private static int myHashCode(final byte a[], final int start, final int length) { + if (a == null) + return 0; + + int result = 1; + for (int i = 0; i < length; i++) + result = 31 * result + a[start + i]; + + return result; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java index c11841dac..13135ddce 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java @@ -107,25 +107,6 @@ public final class 
DeBruijnGraph extends BaseGraph { addOrUpdateEdge(v1, v2, toAdd); } - /** - * Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this - * graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). The - * edge between them is added with isRef and multiplicity - * - * @param sequence a sequence of bases from which we want to extract a pair of kmers - * @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize() - * @param isRef should the edge between the two kmers be a reference edge? - * @param multiplicity what's the multiplicity of the edge between these two kmers - */ - public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final boolean isRef, final int multiplicity ) { - if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); - if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start); - if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length); - final byte[] kmer1 = Arrays.copyOfRange(sequence, start, start + getKmerSize()); - final byte[] kmer2 = Arrays.copyOfRange(sequence, start + 1, start + 1 + getKmerSize()); - addKmersToGraph(kmer1, kmer2, isRef, multiplicity); - } - /** * Convert this kmer graph to a simple sequence graph. 
* diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java new file mode 100644 index 000000000..9b3c8d481 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java @@ -0,0 +1,84 @@ +/* + * By downloading the PROGRAM you agree to the following terms of use: + * + * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY + * + * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). + * + * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and + * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. + * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: + * + * 1. DEFINITIONS + * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. + * + * 2. LICENSE + * 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+ * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. + * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. + * 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. + * + * 3. OWNERSHIP OF INTELLECTUAL PROPERTY + * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. + * Copyright 2012 Broad Institute, Inc. 
+ * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. + * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. + * + * 4. INDEMNIFICATION + * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. + * + * 5. NO REPRESENTATIONS OR WARRANTIES + * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+ * IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. + * + * 6. ASSIGNMENT + * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. + * + * 7. MISCELLANEOUS + * 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. + * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. + * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. + * 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. + * 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. + * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. + * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class KmerCounterUnitTest extends BaseTest { + @Test + public void testMyData() { + final KmerCounter counter = new KmerCounter(3); + + Assert.assertNotNull(counter.toString()); + + counter.addKmers( + "ATG", "ATG", "ATG", "ATG", + "ACC", "ACC", "ACC", + "AAA", "AAA", + "CTG", + "NNA", + "CCC" + ); + + testCounting(counter, "ATG", 4); + testCounting(counter, "ACC", 3); + testCounting(counter, "AAA", 2); + testCounting(counter, "CTG", 1); + testCounting(counter, "NNA", 1); + testCounting(counter, "CCC", 1); + testCounting(counter, "NNN", 0); + testCounting(counter, "NNC", 0); + + Assert.assertNotNull(counter.toString()); + } + + private void testCounting(final KmerCounter counter, final String in, final int expectedCount) { + Assert.assertEquals(counter.getKmerCount(new Kmer(in)), expectedCount); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java new file mode 100644 index 000000000..989c38628 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerUnitTest.java @@ -0,0 +1,133 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class KmerUnitTest extends BaseTest { + @DataProvider(name = "KMerCreationData") + public Object[][] makeKMerCreationData() { + List tests = new ArrayList(); + + final String bases = "ACGTAACCGGTTAAACCCGGGTTT"; + for ( int start = 0; start < bases.length(); start++ ) { + for ( int length = 1; start + length < bases.length(); length++ ) { + final String myBases = bases.substring(start, start+length); + tests.add(new Object[]{bases.getBytes(), start, length, myBases}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "KMerCreationData") + public void testFullConstructor(final byte[] allBases, final int start, final int length, final String expected) { + testKmerCreation(new Kmer(allBases, start, length), start, length, expected); + } + + @Test(dataProvider = "KMerCreationData") + public void testCopyConstructor(final byte[] allBases, final int start, final int length, final String expected) { + testKmerCreation(new Kmer(new Kmer(allBases, 
start, length)), start, length, expected); + } + + @Test(dataProvider = "KMerCreationData") + public void testByteConstructor(final byte[] allBases, final int start, final int length, final String expected) { + testKmerCreation(new Kmer(Arrays.copyOfRange(allBases, start, start + length)), 0, length, expected); + } + + @Test(dataProvider = "KMerCreationData") + public void testStringConstructor(final byte[] allBases, final int start, final int length, final String expected) { + testKmerCreation(new Kmer(new String(Arrays.copyOfRange(allBases, start, start + length))), 0, length, expected); + } + + private void testKmerCreation(final Kmer kmer, final int start, final int length, final String expected) { + Assert.assertEquals(kmer.start, start); + Assert.assertEquals(kmer.length(), length); + Assert.assertEquals(new String(kmer.bases()), expected); + + // check that the caching is working by calling again + Assert.assertEquals(kmer.start, 0); + Assert.assertEquals(kmer.length(), length); + Assert.assertEquals(new String(kmer.bases()), expected); + } + + @Test + public void testEquals() { + final byte[] bases = "ACGTACGT".getBytes(); + final Kmer eq1 = new Kmer(bases, 0, 3); + final Kmer eq2 = new Kmer(bases, 4, 3); + final Kmer eq3 = new Kmer(new Kmer(bases, 4, 3)); + final Kmer eq4 = new Kmer(new Kmer(bases, 4, 3).bases()); + final Kmer neq = new Kmer(bases, 1, 3); + +// for ( final Kmer eq : Arrays.asList(eq1, eq2) ) { // TODO -- deal with me + for ( final Kmer eq : Arrays.asList(eq1, eq2, eq3, eq4) ) { + Assert.assertEquals(eq1, eq, "Should have been equal but wasn't: " + eq1.hash + " vs " + eq.hash); // , "should be equals " + eq1 + " with " + eq); + Assert.assertEquals(eq1.hashCode(), eq.hashCode()); + Assert.assertNotEquals(eq, neq, "incorrectly equals " + eq + " with " + neq); + } + } + + @Test + public void testSubkmer() { + final String bases = "ACGT"; + final Kmer one = new Kmer(bases.getBytes()); + + for ( int start = 0; start < bases.length(); start++ ) { 
+ for ( int length = 0; start + length < bases.length(); length++ ) { + Assert.assertEquals(new String(one.subKmer(start,length).bases()), bases.substring(start, start+length)); + } + } + } +} From fb86887bf2e51e1784a9c8285c02c31f5373ff64 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 8 Apr 2013 17:19:08 -0400 Subject: [PATCH 170/226] Fast algorithm for determining which kmers are good in a read -- old algorithm was O(kmerSize * readLen) for each read. New algorithm is O(readLen) -- Added real unit tests for the addKmersFromReads to the graph. Using a builder is great because we can create a MockBuilder that captures all of the calls, and then verify that all of the added kmers are the ones we'd expect. --- .../haplotypecaller/DeBruijnAssembler.java | 33 ++++---- .../haplotypecaller/DeBruijnGraphBuilder.java | 56 ++++++++++++-- .../walkers/haplotypecaller/KMerCounter.java | 15 +++- .../DeBruijnAssemblerUnitTest.java | 76 +++++++++++++++++++ ...t.java => KMerCounterCaseFixUnitTest.java} | 6 +- 5 files changed, 153 insertions(+), 33 deletions(-) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/{KmerCounterUnitTest.java => KMerCounterCaseFixUnitTest.java} (98%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index b19b1f8d7..8b89bb1bb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -55,11 +55,11 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.UserException; -import 
org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -283,30 +283,27 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced - if( sequence.length > kmerLength + KMER_OVERLAP ) { - final int kmersInSequence = sequence.length - kmerLength + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - // TODO -- this is quite expense as it does O(kmerLength^2 work) per read - // if the qualities of all the bases in the kmers are high enough - boolean badKmer = false; - for( int jjj = iii; jjj < iii + kmerLength + 1; jjj++) { - if( qualities[jjj] < minBaseQualityToUseInAssembly ) { - badKmer = true; - break; - } - } - if( !badKmer ) { + if ( sequence.length > kmerLength + KMER_OVERLAP ) { + int lastGood = -1; // the index of the last good base we've seen + for( int end = 0; end < sequence.length; end++ ) { + if ( qualities[end] < minBaseQualityToUseInAssembly ) { + lastGood = -1; // reset the last good base + } else if ( lastGood == -1 ) { + lastGood = end; // we're at a good base, the last good one is us + } else if ( end - kmerLength >= lastGood ) { + // end - kmerLength (the start) is after the lastGood base, so that kmer is good + final int start = end - kmerLength; // how many observations of this kmer have we seen? 
A normal read counts for 1, but // a reduced read might imply a higher multiplicity for our the edge int countNumber = 1; - if( read.isReducedRead() ) { + if ( read.isReducedRead() ) { // compute mean number of reduced read counts in current kmer span // precise rounding can make a difference with low consensus counts // TODO -- optimization: should extend arrayMax function to take start stop values - countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + kmerLength)); + countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, start, end)); } - builder.addKmerPairFromSeqToGraph(sequence, iii, countNumber); + builder.addKmerPairFromSeqToGraph(sequence, start, countNumber); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java index 418174e64..0f66082c6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java @@ -46,32 +46,56 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; /** + * Fast approach to building a DeBruijnGraph + * + * Follows the model: + * + * for each X that has bases for the final graph: + * addKmer pair (single kmer with kmer size + 1 spanning the pair) + * + * flushKmersToGraph + * * User: depristo * Date: 4/7/13 * Time: 4:14 PM */ public class DeBruijnGraphBuilder { - private final static Logger logger = Logger.getLogger(DeBruijnGraphBuilder.class); - + /** The size of the kmer graph we want to build */ private final int kmerSize; - private final DeBruijnGraph graph; - private final KmerCounter counter; + /** The graph we're going to add kmers to */ + private 
final DeBruijnGraph graph; + + /** keeps counts of all kmer pairs added since the last flush */ + private final KMerCounter counter; + + /** + * Create a new builder that will write out kmers to graph + * + * @param graph a non-null graph that can contain already added kmers + */ public DeBruijnGraphBuilder(final DeBruijnGraph graph) { if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null"); this.kmerSize = graph.getKmerSize(); this.graph = graph; - this.counter = new KmerCounter(kmerSize + 1); + this.counter = new KMerCounter(kmerSize + 1); } + /** + * The graph we're building + * @return a non-null graph + */ public DeBruijnGraph getGraph() { return graph; } + /** + * The kmer size of our graph + * @return positive integer + */ public int getKmerSize() { return kmerSize; } @@ -93,14 +117,30 @@ public class DeBruijnGraphBuilder { addKmerPair(kmerPair, multiplicity); } - + /** + * Add a single kmer pair to this builder + * @param kmerPair a kmer pair is a single kmer that has kmerSize + 1 bp, where 0 -> kmersize and 1 -> kmersize + 1 + * will have an edge added to this + * @param multiplicity the desired multiplicity of this edge + */ public void addKmerPair(final Kmer kmerPair, final int multiplicity) { if ( kmerPair.length() != kmerSize + 1 ) throw new IllegalArgumentException("kmer pair must be of length kmerSize + 1 = " + kmerSize + 1 + " but got " + kmerPair.length()); counter.addKmer(kmerPair, multiplicity); } + /** + * Flushes the currently added kmers to the graph + * + * After this function is called the builder is reset to an empty state + * + * This flushing is expensive, so many kmers should be added to the builder before flushing. The most + * efficient workflow is to add all of the kmers of a particular class (all ref bases, or all read bases) + * then and do one flush when completed + * + * @param addRefEdges should the kmers present in the builder be added to the graph with isRef = true for the edges? 
+ */ public void flushKmersToGraph(final boolean addRefEdges) { - for ( final KmerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) { + for ( final KMerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) { final byte[] first = countedKmer.getKmer().subKmer(0, kmerSize).bases(); final byte[] second = countedKmer.getKmer().subKmer(1, kmerSize).bases(); graph.addKmersToGraph(first, second, addRefEdges, countedKmer.getCount()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java index 38d829a87..a7194f85f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounter.java @@ -59,8 +59,8 @@ import java.util.Map; * Date: 3/8/13 * Time: 1:16 PM */ -public class KmerCounter { - //private final static Logger logger = Logger.getLogger(KmerCounter.class); +public class KMerCounter { + //private final static Logger logger = Logger.getLogger(KMerCounter.class); /** * A map of for each kmer to its num occurrences in addKmers @@ -73,7 +73,7 @@ public class KmerCounter { * * @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1 */ - public KmerCounter(final int kmerLength) { + public KMerCounter(final int kmerLength) { if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength); this.kmerLength = kmerLength; } @@ -89,10 +89,17 @@ public class KmerCounter { return counted == null ? 
0 : counted.count; } + /** + * Get an unordered collection of the counted kmers in this counter + * @return a non-null collection + */ public Collection getCountedKmers() { return countsByKMer.values(); } + /** + * Remove all current counts, resetting the counter to an empty state + */ public void clear() { countsByKMer.clear(); } @@ -117,7 +124,7 @@ public class KmerCounter { @Override public String toString() { - final StringBuilder b = new StringBuilder("KmerCounter{"); + final StringBuilder b = new StringBuilder("KMerCounter{"); b.append("counting ").append(countsByKMer.size()).append(" distinct kmers"); b.append("\n}"); return b.toString(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java index 59d13dee4..e6dea4d11 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java @@ -55,13 +55,16 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; @@ -122,4 +125,77 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { } } + private static 
class MockBuilder extends DeBruijnGraphBuilder { + public final List addedPairs = new LinkedList(); + + private MockBuilder(final int kmerSize) { + super(new DeBruijnGraph(kmerSize)); + } + + @Override + public void addKmerPair(Kmer kmerPair, int multiplicity) { + logger.info("addKmerPair" + kmerPair); + addedPairs.add(kmerPair); + } + + @Override + public void flushKmersToGraph(boolean addRefEdges) { + // do nothing + } + } + + @DataProvider(name = "AddReadKmersToGraph") + public Object[][] makeAddReadKmersToGraphData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final String bases = "ACGTAACCGGTTAAACCCGGGTTT"; + final int readLen = bases.length(); + final List allBadStarts = new ArrayList(readLen); + for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i); + + for ( final int kmerSize : Arrays.asList(3, 4, 5) ) { + for ( final int nBadQuals : Arrays.asList(0, 1, 2) ) { + for ( final List badStarts : Utils.makePermutations(allBadStarts, nBadQuals, false) ) { + tests.add(new Object[]{bases, kmerSize, badStarts}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AddReadKmersToGraph") + public void testAddReadKmersToGraph(final String bases, final int kmerSize, final List badQualsSites) { + final int readLen = bases.length(); + final DeBruijnAssembler assembler = new DeBruijnAssembler(); + final MockBuilder builder = new MockBuilder(kmerSize); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + final byte[] quals = Utils.dupBytes((byte)20, bases.length()); + for ( final int badSite : badQualsSites ) quals[badSite] = 0; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLen); + read.setReadBases(bases.getBytes()); + read.setBaseQualities(quals); + + final Set expectedBases = new HashSet(); + final Set expectedStarts = new LinkedHashSet(); + for ( int 
i = 0; i < readLen; i++) { + boolean good = true; + for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing + good &= i + j < readLen && quals[i+j] >= assembler.getMinBaseQualityToUseInAssembly(); + } + if ( good ) { + expectedStarts.add(i); + expectedBases.add(bases.substring(i, i + kmerSize + 1)); + } + } + + assembler.addReadKmersToGraph(builder, Arrays.asList(read)); + Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size()); + for ( final Kmer addedKmer : builder.addedPairs ) { + Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases); + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java similarity index 98% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java index 9b3c8d481..c049121a3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerCounterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KMerCounterCaseFixUnitTest.java @@ -50,10 +50,10 @@ import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.Test; -public class KmerCounterUnitTest extends BaseTest { +public class KMerCounterCaseFixUnitTest extends BaseTest { @Test public void testMyData() { - final KmerCounter counter = new KmerCounter(3); + final KMerCounter counter = new KMerCounter(3); Assert.assertNotNull(counter.toString()); @@ -78,7 +78,7 @@ public class KmerCounterUnitTest extends BaseTest { Assert.assertNotNull(counter.toString()); } - private void testCounting(final KmerCounter counter, final String in, final 
int expectedCount) { + private void testCounting(final KMerCounter counter, final String in, final int expectedCount) { Assert.assertEquals(counter.getKmerCount(new Kmer(in)), expectedCount); } } From a507381a33cbb4117d8958dd682295ac4a536201 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 11 Apr 2013 15:05:19 -0400 Subject: [PATCH 171/226] Updating BQSR RecalibrationEngine to work correctly with empty BQSR tables. -- Previously would crash when a scatter/gather interval contained no usable data. -- Added unit test to cover this case. --- .../sting/gatk/walkers/bqsr/BQSRGatherer.java | 4 +++- .../walkers/bqsr/RecalibrationEngine.java | 6 ++++-- .../recalibration/RecalibrationReport.java | 7 +++++++ .../recalibration/RecalibrationTables.java | 10 +++++++++ .../walkers/bqsr/BQSRGathererUnitTest.java | 21 +++++++++++++++++++ 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index 82d08da41..ad97dc008 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -65,7 +65,7 @@ import java.util.List; public class BQSRGatherer extends Gatherer { - private static final String EMPTY_INPUT_LIST = "list of inputs files is empty"; + private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file"; private static final String MISSING_OUTPUT_FILE = "missing output file name"; @Override @@ -80,6 +80,8 @@ public class BQSRGatherer extends Gatherer { RecalibrationReport generalReport = null; for (File input : inputs) { final RecalibrationReport inputReport = new RecalibrationReport(input); + if( inputReport.isEmpty() ) { continue; } + if (generalReport == null) generalReport = inputReport; else diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 5e6e2a8d9..9f33234cf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -178,7 +178,7 @@ public class RecalibrationEngine { final NestedIntegerArray byQualTable = finalRecalibrationTables.getQualityScoreTable(); // iterate over all values in the qual table - for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { + for ( final NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { final int rgKey = leaf.keys[0]; final int eventIndex = leaf.keys[2]; final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex); @@ -206,7 +206,9 @@ public class RecalibrationEngine { */ @Requires("! finalized") private RecalibrationTables mergeThreadLocalRecalibrationTables() { - if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); + if ( recalibrationTablesList.isEmpty() ) { + recalibrationTablesList.add( new RecalibrationTables(covariates, numReadGroups, maybeLogStream) ); + } RecalibrationTables merged = null; for ( final RecalibrationTables table : recalibrationTablesList ) { diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index a3fec6a22..ea45c2abf 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -372,4 +372,11 @@ public class RecalibrationReport { public Covariate[] getCovariates() { return requestedCovariates; } + + /** + * @return true if the report has no data + */ + public boolean 
isEmpty() { + return recalibrationTables.isEmpty(); + } } diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index 15b6c8571..7d1a9f956 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -124,6 +124,16 @@ public final class RecalibrationTables { return tables.size(); } + /** + * @return true if all the tables contain no RecalDatums + */ + public boolean isEmpty() { + for( final NestedIntegerArray table : tables ) { + if( !table.getAllValues().isEmpty() ) { return false; } + } + return true; + } + /** * Allocate a new quality score table, based on requested parameters * in this set of tables, without any data in it. The return result diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java index f82f24439..658b8527d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -69,6 +69,7 @@ public class BQSRGathererUnitTest extends BaseTest { private static File recal3 = new File(privateTestDir + "HiSeq.1mb.1RG.sg3.table"); private static File recal4 = new File(privateTestDir + "HiSeq.1mb.1RG.sg4.table"); private static File recal5 = new File(privateTestDir + "HiSeq.1mb.1RG.sg5.table"); + private static File recalEmpty = new File(privateTestDir + "HiSeq.1mb.1RG.empty.table"); private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table"); @@ -110,6 +111,26 @@ public class BQSRGathererUnitTest extends BaseTest { testReports(originalReport, calculatedReport); } + @Test(enabled = true) 
+ public void testGatherBQSRWithEmptyFile() { + BQSRGatherer gatherer = new BQSRGatherer(); + List recalFiles = new LinkedList (); + final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); + + recalFiles.add(recal1); + recalFiles.add(recal2); + recalFiles.add(recal3); + recalFiles.add(recal4); + recalFiles.add(recal5); + recalFiles.add(recalEmpty); + gatherer.gather(recalFiles, output); + + GATKReport originalReport = new GATKReport(recal_original); + GATKReport calculatedReport = new GATKReport(output); + + testReports(originalReport, calculatedReport); + } + private void testReports(final GATKReport originalReport, final GATKReport calculatedReport) { // test the Arguments table From 74196ff7dbb8e93b3acd7751a3e2d306352b7e3e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 11 Apr 2013 15:35:09 -0400 Subject: [PATCH 172/226] Trivial BQSR bug fixes and improvement -- Ensure that BQSR works properly for an Ion Torrent BAM. (Added integration test and bam) -- Improve the error message when a unknown platform is found (integration test added) --- .../covariates/CycleCovariate.java | 4 +++- .../gatk/walkers/bqsr/BQSRIntegrationTest.java | 17 ++++++++++++++++- .../broadinstitute/sting/utils/NGSPlatform.java | 16 ++++++++++++++++ .../sting/utils/sam/GATKSAMReadGroupRecord.java | 5 +++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java index bcb42f7ef..f585299f4 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java @@ -237,7 +237,9 @@ public class CycleCovariate implements StandardCovariate { // Unknown platforms else { - throw new UserException("The platform (" + 
read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid"); + throw new UserException("The platform (" + read.getReadGroup().getPlatform() + + ") associated with read group " + read.getReadGroup() + + " is not a recognized platform. Allowable options are " + NGSPlatform.knownPlatformsString()); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 2149091af..907046704 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -82,7 +82,7 @@ public class BQSRIntegrationTest extends WalkerTest { " -I " + bam + " -L " + interval + args + - " -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) + + " -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : (reference.equals(b37KGReference) ? 
b37dbSNP129 : hg18dbSNP132)) + " --allow_potentially_misencoded_quality_scores" + // TODO -- remove me when we get new SOLiD bams " -o %s" + " -sortAllCols"; @@ -115,6 +115,8 @@ public class BQSRIntegrationTest extends WalkerTest { {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "c1c3cda8caceed619d3d439c3990cd26")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "c9953f020a65c1603a6d71aeeb1b95f3")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "5bfff0c699345cca12a9b33acf95588f")}, + // make sure we work with ION torrent bam + {new BQSRTest(b37KGReference, privateTestDir + "iontorrent.bam", "20:10,000,000-10,200,000", "", "7375c7b692e76b651c278a9fb478fa1c")}, }; } @@ -257,4 +259,17 @@ public class BQSRIntegrationTest extends WalkerTest { UserException.class); executeTest("testPRFailWithLowMaxCycle", spec); } + + @Test + public void testPRFailWithBadPL() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + " -T BaseRecalibrator" + + " -R " + b37KGReference + + " -I " + privateTestDir + "badPLForBQSR.bam" + + " -L 1:10,000,000-10,200,000" + + " -o %s", + 1, + UserException.class); + executeTest("testPRFailWithBadPL", spec); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java index f08564644..029dfad31 100644 --- a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java +++ b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java @@ -28,6 +28,9 @@ package org.broadinstitute.sting.utils; import 
org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.LinkedList; +import java.util.List; + /** * A canonical, master list of the standard NGS platforms. These values * can be obtained (efficiently) from a GATKSAMRecord object with the @@ -117,4 +120,17 @@ public enum NGSPlatform { public static boolean isKnown(final String platform) { return fromReadGroupPL(platform) != UNKNOWN; } + + /** + * Get a human-readable list of platform names + * @return the list of platform names + */ + public static String knownPlatformsString() { + final List names = new LinkedList(); + for ( final NGSPlatform pl : values() ) { + for ( final String name : pl.BAM_PL_NAMES ) + names.add(name); + } + return Utils.join(",", names); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java index fcebbec9b..ec9d7d219 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java @@ -72,6 +72,11 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { return mNGSPlatform; } + @Override + public String toString() { + return "GATKSAMReadGroupRecord @RG:" + getReadGroupId(); + } + /////////////////////////////////////////////////////////////////////////////// // *** The following methods are overloaded to cache the appropriate data ***// /////////////////////////////////////////////////////////////////////////////// From 50cdffc61f904acc0c485f62f7fc47f62edb1b0e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 10 Apr 2013 10:54:25 -0400 Subject: [PATCH 173/226] Slightly improved Smith-Waterman parameter values for HaplotypeCaller Path comparisons Key improvement --------------- -- The haplotype caller was producing unstable calls when comparing the following two 
haplotypes: ref: ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA alt: TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA in which the alt and ref haplotypes differ in having an indel at both the start and end of the bubble. The previous parameter values used in the Path algorithm were set so that such haplotype comparisons would result in either the above alignment or the following alignment depending on exactly how many GA units were present in the bubble. ref: ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA alt: TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA The number of elements could vary depending on how the graph was built, and resulted in real differences in the calls between BWA mem and BWA-SW calls. I added a few unit tests for this case, and found a set of SW parameter values with lower gap-extension penalties that significantly favor the first alignment, which is the right thing to do, as we really don't mind large indels in the haplotypes relative to having lots of mismatches. -- Expanded the unit tests in both SW and KBestPaths to look at complex events like this, and to check as well somewhat systematically that we are finding many types of expected mutational events. -- Verified that this change doesn't alter our calls on 20:10,000,000-11,000,000 at all General code cleanup -------------------- -- Move Smith-Waterman to its own package in utils -- Refactored out SWParameters class in SWPairwiseAlignment, and made constructors take either a named parameter set or a Parameter object directly. Deprecated old call to inline constants. This makes it easier to group all of the SW parameters into a single object for callers -- Update users of SW code to use new Parameter class -- Also moved haplotype bam writers to protected so they can use the Path SW parameter, which is protected -- Removed the storage of the SW scoring matrix in SWPairwiseAligner by default.
Only the SWPairwiseAlignmentMain test program needs this, so added a gross protected static variable that enables its storage --- .../haplotypecaller/DeBruijnAssembler.java | 13 +- .../haplotypecaller/HaplotypeResolver.java | 2 +- .../walkers/haplotypecaller/graphs/Path.java | 21 +- .../gatk/walkers/indels/IndelRealigner.java | 12 +- .../AllHaplotypeBAMWriter.java | 98 ++ .../CalledHaplotypeBAMWriter.java | 108 +++ .../HaplotypeBAMWriter.java | 70 +- .../GenotypingEngineUnitTest.java | 2 +- .../graphs/KBestPathsUnitTest.java | 134 ++- .../HaplotypeBAMWriterUnitTest.java | 67 +- .../SWPairwiseAlignmentUnitTest.java | 28 +- .../AllHaplotypeBAMWriter.java | 77 -- .../CalledHaplotypeBAMWriter.java | 87 -- .../sting/utils/smithwaterman/Parameters.java | 60 ++ .../SWPairwiseAlignment.java | 882 +++++++++--------- .../SWPairwiseAlignmentMain.java | 4 +- .../utils/smithwaterman/SWParameterSet.java | 51 + 17 files changed, 1048 insertions(+), 668 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java create mode 100644 protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java rename {public => protected}/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java (61%) rename {public => protected}/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java (59%) rename protected/java/test/org/broadinstitute/sting/utils/{ => smithwaterman}/SWPairwiseAlignmentUnitTest.java (88%) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java rename public/java/src/org/broadinstitute/sting/utils/{ => smithwaterman}/SWPairwiseAlignment.java (88%) rename 
public/java/src/org/broadinstitute/sting/utils/{ => smithwaterman}/SWPairwiseAlignmentMain.java (98%) create mode 100644 public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 8b89bb1bb..5ce65e13f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -56,13 +56,14 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -86,12 +87,6 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25; private static final int GRAPH_KMER_STEP = 6; - // Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode - private static final double SW_MATCH = 5.0; // 1.0; - private static final double SW_MISMATCH = -10.0; //-1.0/3.0; - private static final double SW_GAP = -22.0; 
//-1.0-1.0/3.0; - private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0; - private final boolean debug; private final boolean debugGraphTransformations; private final int minKmer; @@ -587,7 +582,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final List haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { if( haplotype == null ) { return false; } - final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS ); haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() ); if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments @@ -616,7 +611,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } final Haplotype h = new Haplotype( newHaplotypeBases ); - final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS ); h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); if ( haplotype.isArtificialHaplotype() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index 134863b8b..01ab421b3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -59,7 +59,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.variant.vcf.VCFHeaderLine; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index d91ec0e37..9d2d680c9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -52,7 +52,8 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.Parameters; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.util.*; @@ -85,10 +86,8 @@ public class Path { // used in the bubble state machine to apply Smith-Waterman to the bubble sequence // these values were chosen via optimization against the NA12878 knowledge base - private static final double SW_MATCH = 20.0; - private static final double SW_MISMATCH = -15.0; - private static final double SW_GAP = -26.0; - private static final double SW_GAP_EXTEND = -1.1; + public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -20.0, -26.0, -0.1); + private 
static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); /** @@ -107,6 +106,16 @@ public class Path { this.graph = graph; } + /** + * Convenience constructor for testing that creates a path through vertices in graph + */ + protected static Path makePath(final List vertices, final BaseGraph graph) { + Path path = new Path(vertices.get(0), graph); + for ( int i = 1; i < vertices.size(); i++ ) + path = new Path(path, graph.getEdge(path.lastVertex, vertices.get(i))); + return path; + } + /** * Create a new Path extending p with edge * @@ -362,7 +371,7 @@ public class Path { padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding ); final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding ); - swConsensus = new SWPairwiseAlignment( reference, alternate, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); + swConsensus = new SWPairwiseAlignment( reference, alternate, NEW_SW_PARAMETERS ); if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) { goodAlignment = true; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 7d8243c98..c0848663e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -62,7 +62,8 @@ import org.broadinstitute.sting.gatk.walkers.BAQMode; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import 
org.broadinstitute.sting.utils.smithwaterman.Parameters; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; @@ -328,10 +329,7 @@ public class IndelRealigner extends ReadWalker { // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned private static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75; - private static final double SW_MATCH = 30.0; // 1.0; - private static final double SW_MISMATCH = -10.0; //-1.0/3.0; - private static final double SW_GAP = -10.0; //-1.0-1.0/3.0; - private static final double SW_GAP_EXTEND = -2.0; //-1.0/.0; + private final static Parameters swParameters = new Parameters(30.0, -10.0, -10.0, -2.0); // reference base padding size // TODO -- make this a command-line argument if the need arises @@ -999,7 +997,7 @@ public class IndelRealigner extends ReadWalker { private void createAndAddAlternateConsensus(final byte[] read, final Set altConsensesToPopulate, final byte[] reference) { // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, swParameters); Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read); if ( c != null ) altConsensesToPopulate.add(c); @@ -1016,7 +1014,7 @@ public class IndelRealigner extends ReadWalker { } // do a pairwise alignment against the reference SWalignmentRuns++; - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), swParameters); Consensus c = 
createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read.getReadBases()); if ( c != null ) { altConsensesToPopulate.add(c); diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java new file mode 100644 index 000000000..54061c781 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java @@ -0,0 +1,98 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * A haplotype bam writer that writes out all haplotypes as reads and then + * the alignment of reach read to its best match among the best haplotypes. + * + * Primarily useful for people working on the HaplotypeCaller method itself + * + * User: depristo + * Date: 2/22/13 + * Time: 1:50 PM + */ +class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { + public AllHaplotypeBAMWriter(final SAMFileWriter bamWriter) { + super(bamWriter); + } + + /** + * {@inheritDoc} + */ + @Override + public void writeReadsAlignedToHaplotypes(final List haplotypes, + final GenomeLoc paddedReferenceLoc, + final List bestHaplotypes, + final Set calledHaplotypes, + final Map stratifiedReadMap) { + writeHaplotypesAsReads(haplotypes, new HashSet(bestHaplotypes), paddedReferenceLoc); + + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); + for ( final Haplotype haplotype : haplotypes ) + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + + // next, output the interesting reads for each sample aligned against the appropriate haplotype + for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + 
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart()); + } + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java new file mode 100644 index 000000000..d63cf65fc --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -0,0 +1,108 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.SAMFileWriter; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * Writes a BAM containing just the reads in stratifiedReadMap aligned to their + * most likely haplotype among all of the called haplotypes. + * + * Primarily useful for users of the HaplotypeCaller who want to better understand the + * support of their calls w.r.t. the reads. + * + * User: depristo + * Date: 2/22/13 + * Time: 1:50 PM + */ +class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { + public CalledHaplotypeBAMWriter(final SAMFileWriter bamWriter) { + super(bamWriter); + } + + /** + * {@inheritDoc} + */ + @Override + public void writeReadsAlignedToHaplotypes(final List haplotypes, + final GenomeLoc paddedReferenceLoc, + final List bestHaplotypes, + final Set calledHaplotypes, + final Map stratifiedReadMap) { + if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes + return; + + writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc); + + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); + for ( final Haplotype haplotype : calledHaplotypes ) { + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + } + + // the set of all alleles that were actually called + final Set allelesOfCalledHaplotypes = alleleToHaplotypeMap.keySet(); + + // next, output the interesting reads for each sample aligned against one of the called haplotypes + for ( final 
PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + if ( entry.getKey().getMappingQuality() > 0 ) { + final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart()); + } + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java similarity index 61% rename from public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java rename to protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index c80287bca..2eea664d9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -1,35 +1,57 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ package org.broadinstitute.sting.utils.haplotypeBAMWriter; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.AlignmentUtils; @@ -191,7 +213,7 @@ public abstract class HaplotypeBAMWriter { try { // compute the smith-waterman alignment of read -> haplotype - final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), 5.0, -10.0, -22.0, -1.2); + final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), Path.NEW_SW_PARAMETERS); //swPairwiseAlignment.printAlignment(haplotype.getBases(), originalRead.getReadBases()); if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 ) // sw can fail (reasons not clear) so if it happens just don't write the read diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java index 6a66d9845..8633a1d9d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -57,9 +57,9 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import 
org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index 3c6327842..90fdf1fa4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -49,6 +49,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.TextCigarCodec; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.testng.Assert; @@ -63,7 +65,9 @@ import java.util.*; * Date: 1/31/13 */ -public class KBestPathsUnitTest { +public class KBestPathsUnitTest extends BaseTest { + private final static boolean DEBUG = false; + @DataProvider(name = "BasicPathFindingData") public Object[][] makeBasicPathFindingData() { List tests = new ArrayList(); @@ -96,7 +100,7 @@ public class KBestPathsUnitTest { return vertices; } - @Test(dataProvider = "BasicPathFindingData", enabled = true) + @Test(dataProvider = "BasicPathFindingData", enabled = !DEBUG) public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes, final boolean addCycle, final boolean allowCycles) { 
SeqGraph graph = new SeqGraph(); @@ -128,7 +132,7 @@ public class KBestPathsUnitTest { Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); } - @Test + @Test(enabled = !DEBUG) public void testPathFindingComplexCycle() { SeqGraph graph = new SeqGraph(); @@ -148,7 +152,7 @@ public class KBestPathsUnitTest { Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); } - @Test + @Test(enabled = !DEBUG) public void testPathFindingCycleLastNode() { SeqGraph graph = new SeqGraph(); @@ -175,7 +179,7 @@ public class KBestPathsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "BasicBubbleDataProvider", enabled = true) + @Test(dataProvider = "BasicBubbleDataProvider", enabled = !DEBUG) public void testBasicBubbleData(final int refBubbleLength, final int altBubbleLength) { // Construct the assembly graph SeqGraph graph = new SeqGraph(3); @@ -232,7 +236,7 @@ public class KBestPathsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "GetBasesData", enabled = true) + @Test(dataProvider = "GetBasesData", enabled = !DEBUG) public void testGetBases(final List frags) { // Construct the assembly graph SeqGraph graph = new SeqGraph(3); @@ -268,7 +272,7 @@ public class KBestPathsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "TripleBubbleDataProvider", enabled = true) + @Test(dataProvider = "TripleBubbleDataProvider", enabled = !DEBUG) public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) { // Construct the assembly graph SeqGraph graph = new SeqGraph(); @@ -371,4 +375,120 @@ public class KBestPathsUnitTest { Assert.assertEquals(path.calculateCigar().toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar string mismatch"); } + + @Test(enabled = !DEBUG) + public void 
testIntraNodeInsertionDeletion() { + // Construct the assembly graph + SeqGraph graph = new SeqGraph(); + final SeqVertex top = new SeqVertex("T"); + final SeqVertex bot = new SeqVertex("T"); + final SeqVertex alt = new SeqVertex("AAACCCCC"); + final SeqVertex ref = new SeqVertex("CCCCCGGG"); + + graph.addVertices(top, bot, alt, ref); + graph.addEdges(new BaseEdge(true, 1), top, ref, bot); + graph.addEdges(new BaseEdge(false, 1), top, alt, bot); + + final KBestPaths pathFinder = new KBestPaths(); + final List> paths = pathFinder.getKBestPaths(graph, top, bot); + + Assert.assertEquals(paths.size(), 2); + + final Path refPath = paths.get(0); + final Path altPath = paths.get(1); + + Assert.assertEquals(refPath.calculateCigar().toString(), "10M"); + Assert.assertEquals(altPath.calculateCigar().toString(), "1M3I5M3D1M"); + } + + @Test(enabled = !DEBUG) + public void testHardSWPath() { + // Construct the assembly graph + SeqGraph graph = new SeqGraph(); + final SeqVertex top = new SeqVertex( "NNN"); + final SeqVertex bot = new SeqVertex( "NNN"); + final SeqVertex alt = new SeqVertex( "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); + final SeqVertex ref = new SeqVertex( "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); + graph.addVertices(top, bot, alt, ref); + graph.addEdges(new BaseEdge(true, 1), top, ref, bot); + graph.addEdges(new BaseEdge(false, 1), top, alt, bot); + + final KBestPaths pathFinder = new KBestPaths(); + final List> paths = pathFinder.getKBestPaths(graph, top, bot); + + Assert.assertEquals(paths.size(), 2); + + final Path refPath = paths.get(0); + final Path altPath = paths.get(1); + + logger.warn("RefPath : " + refPath + " cigar " + refPath.calculateCigar()); + logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar()); + + Assert.assertEquals(refPath.calculateCigar().toString(), "51M"); + Assert.assertEquals(altPath.calculateCigar().toString(), "3M14D2M20I32M"); + } + + // 
----------------------------------------------------------------- + // + // Systematic tests to ensure that we get the correct SW result for + // a variety of variants in the ref vs alt bubble + // + // ----------------------------------------------------------------- + + @DataProvider(name = "SystematicRefAltSWTestData") + public Object[][] makeSystematicRefAltSWTestData() { + List tests = new ArrayList(); + + final List> allDiffs = Arrays.asList( + Arrays.asList("G", "C", "1M"), + Arrays.asList("G", "", "1D"), + Arrays.asList("", "C", "1I"), + Arrays.asList("AAA", "CGT", "3D3I"), + Arrays.asList("TAT", "CAC", "3M"), + Arrays.asList("AAAAA", "", "5D"), + Arrays.asList("", "AAAAA", "5I"), + Arrays.asList("AAAAACC", "CCGGGGGG", "5D2M6I") + ); + + for ( final String prefix : Arrays.asList("", "X", "XXXXXXXXXXXXX")) { + for ( final String end : Arrays.asList("", "X", "XXXXXXXXXXXXX")) { + for ( final List diffs : allDiffs ) + tests.add(new Object[]{prefix, end, diffs.get(0), diffs.get(1), diffs.get(2)}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SystematicRefAltSWTestData", enabled = true) + public void testRefAltSW(final String prefix, final String end, final String refMid, final String altMid, final String midCigar) { + // Construct the assembly graph + SeqGraph graph = new SeqGraph(); + + SeqVertex top = new SeqVertex(""); + SeqVertex ref = new SeqVertex(prefix + refMid + end); + SeqVertex alt = new SeqVertex(prefix + altMid + end); + SeqVertex bot = new SeqVertex(""); + + graph.addVertices(top, ref, alt, bot); + graph.addEdges(new BaseEdge(true, 1), top, ref, bot); + graph.addEdges(new BaseEdge(false, 1), top, alt, bot); + + // Construct the test path + Path path = Path.makePath(Arrays.asList(top, alt, bot), graph); + + Cigar expected = new Cigar(); + if ( ! 
prefix.equals("") ) expected.add(new CigarElement(prefix.length(), CigarOperator.M)); + for ( final CigarElement elt : TextCigarCodec.getSingleton().decode(midCigar).getCigarElements() ) expected.add(elt); + if ( ! end.equals("") ) expected.add(new CigarElement(end.length(), CigarOperator.M)); + expected = AlignmentUtils.consolidateCigar(expected); + + final Cigar pathCigar = path.calculateCigar(); + + logger.warn("diffs: " + ref + " vs. " + alt + " cigar " + midCigar); + logger.warn("Path " + path + " with cigar " + pathCigar); + logger.warn("Expected cigar " + expected); + + Assert.assertEquals(pathCigar, expected, "Cigar mismatch"); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java similarity index 59% rename from public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java index db16582b8..91a2988aa 100644 --- a/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java @@ -1,26 +1,47 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions 
of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ package org.broadinstitute.sting.utils.haplotypeBAMWriter; @@ -28,7 +49,7 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; import net.sf.samtools.*; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; diff --git a/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java similarity index 88% rename from protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java index c55b4147d..c8fc458e8 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentUnitTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.smithwaterman; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -91,4 +91,30 @@ public class SWPairwiseAlignmentUnitTest extends BaseTest { Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart); Assert.assertEquals(sw.getCigar().toString(), expectedCigar); } + + @Test(enabled = true) + public void testIndelsAtStartAndEnd() { + final String match = "CCCCC"; + final String reference = "AAA" + match; + final String read = match + "GGG"; + final int expectedStart = 3; + final String expectedCigar = "5M3S"; + final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes()); + sw.printAlignment(reference.getBytes(), read.getBytes()); + Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart); + Assert.assertEquals(sw.getCigar().toString(), expectedCigar); + } + + @Test(enabled = true) + public void testDegenerateAlignmentWithIndelsAtBothEnds() { + logger.warn("testDegenerateAlignmentWithIndelsAtBothEnds"); + final String ref = "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"; + final String alt = "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"; + final int expectedStart = 14; + final String expectedCigar = "31M20S"; + final SWPairwiseAlignment sw = new SWPairwiseAlignment(ref.getBytes(), alt.getBytes(), SWParameterSet.STANDARD_NGS); + sw.printAlignment(ref.getBytes(), alt.getBytes()); + Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart); + Assert.assertEquals(sw.getCigar().toString(), expectedCigar); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java deleted file mode 100644 index 9936bd9ab..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java +++ /dev/null @@ -1,77 +0,0 @@ -/* -* Copyright (c) 
2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.haplotypeBAMWriter; - -import net.sf.samtools.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.util.*; - -/** - * A haplotype bam writer that writes out all haplotypes as reads and then - * the alignment of reach read to its best match among the best haplotypes. 
- * - * Primarily useful for people working on the HaplotypeCaller method itself - * - * User: depristo - * Date: 2/22/13 - * Time: 1:50 PM - */ -class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { - public AllHaplotypeBAMWriter(final SAMFileWriter bamWriter) { - super(bamWriter); - } - - /** - * {@inheritDoc} - */ - @Override - public void writeReadsAlignedToHaplotypes(final List haplotypes, - final GenomeLoc paddedReferenceLoc, - final List bestHaplotypes, - final Set calledHaplotypes, - final Map stratifiedReadMap) { - writeHaplotypesAsReads(haplotypes, new HashSet(bestHaplotypes), paddedReferenceLoc); - - // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently - final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); - for ( final Haplotype haplotype : haplotypes ) - alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); - - // next, output the interesting reads for each sample aligned against the appropriate haplotype - for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { - for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart()); - } - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java deleted file mode 100644 index 08b4fff7c..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ /dev/null @@ -1,87 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy 
of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.haplotypeBAMWriter; - -import net.sf.samtools.SAMFileWriter; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; - -import java.util.*; - -/** - * Writes a BAM containing just the reads in stratifiedReadMap aligned to their - * most likely haplotype among all of the called haplotypes. - * - * Primarily useful for users of the HaplotypeCaller who want to better understand the - * support of their calls w.r.t. the reads. 
- * - * User: depristo - * Date: 2/22/13 - * Time: 1:50 PM - */ -class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { - public CalledHaplotypeBAMWriter(final SAMFileWriter bamWriter) { - super(bamWriter); - } - - /** - * {@inheritDoc} - */ - @Override - public void writeReadsAlignedToHaplotypes(final List haplotypes, - final GenomeLoc paddedReferenceLoc, - final List bestHaplotypes, - final Set calledHaplotypes, - final Map stratifiedReadMap) { - if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes - return; - - writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc); - - // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently - final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); - for ( final Haplotype haplotype : calledHaplotypes ) { - alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); - } - - // the set of all alleles that were actually called - final Set allelesOfCalledHaplotypes = alleleToHaplotypeMap.keySet(); - - // next, output the interesting reads for each sample aligned against one of the called haplotypes - for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { - for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - if ( entry.getKey().getMappingQuality() > 0 ) { - final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart()); - } - } - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java new file mode 100644 index 000000000..d4364afdf --- /dev/null +++ 
b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/Parameters.java @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.smithwaterman; + +/** + * Holds the core Smith-Waterman alignment parameters of + * + * match value, and mismatch, gap open and gap extension penalties + * + * User: depristo + * Date: 4/11/13 + * Time: 12:03 PM + */ +public final class Parameters { + public final double w_match; + public final double w_mismatch; + public final double w_open; + public final double w_extend; + + /** + * Create a new set of SW parameters + * @param w_match the match score + * @param w_mismatch the mismatch penalty + * @param w_open the gap open penalty + * @param w_extend the gap extension penalty + */ + public Parameters(double w_match, double w_mismatch, double w_open, double w_extend) { + if ( w_mismatch > 0 ) throw new IllegalArgumentException("w_mismatch must be <= 0 but got " + w_mismatch); + if ( w_open> 0 ) throw new IllegalArgumentException("w_open must be <= 0 but got " + w_open); + if ( w_extend> 0 ) throw new IllegalArgumentException("w_extend must be <= 0 but got " + w_extend); + + this.w_match = w_match; + this.w_mismatch = w_mismatch; + this.w_open = w_open; + this.w_extend = w_extend; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java similarity index 88% rename from public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java rename to public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java index 6c8beb32d..890faa82a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java @@ -1,424 +1,458 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* 
restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils; - -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; - -import java.util.*; - -/** - * Pairwise discrete smith-waterman alignment - * - * ************************************************************************ - * **** IMPORTANT NOTE: **** - * **** This class assumes that all bytes come from UPPERCASED chars! 
**** - * ************************************************************************ - * - * User: asivache - * Date: Mar 23, 2009 - * Time: 1:54:54 PM - */ -public final class SWPairwiseAlignment { - private int alignment_offset; // offset of s2 w/respect to s1 - private Cigar alignmentCigar; - - private final double w_match; - private final double w_mismatch; - private final double w_open; - private final double w_extend; - - private static final int MSTATE = 0; - private static final int ISTATE = 1; - private static final int DSTATE = 2; - private static final int CLIP = 3; - - protected static boolean cutoff = false; - private static boolean DO_SOFTCLIP = true; - - double[] SW; - - public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) { - w_match = match; - w_mismatch = mismatch; - w_open = open; - w_extend = extend; - align(seq1,seq2); - } - - public SWPairwiseAlignment(byte[] seq1, byte[] seq2) { - this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3) - } - - public Cigar getCigar() { return alignmentCigar ; } - - public int getAlignmentStart2wrt1() { return alignment_offset; } - - public void align(final byte[] a, final byte[] b) { - final int n = a.length; - final int m = b.length; - double [] sw = new double[(n+1)*(m+1)]; - SW = sw; - int [] btrack = new int[(n+1)*(m+1)]; - - calculateMatrix(a, b, sw, btrack); - calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) - } - - - private void calculateMatrix(final byte[] a, final byte[] b, double [] sw, int [] btrack ) { - final int n = a.length+1; - final int m = b.length+1; - - //final double MATRIX_MIN_CUTOFF=-1e100; // never let matrix elements drop below this cutoff - final double MATRIX_MIN_CUTOFF; // never let matrix elements drop below this cutoff - if ( cutoff ) MATRIX_MIN_CUTOFF = 0.0; - else MATRIX_MIN_CUTOFF = -1e100; - - double [] best_gap_v = new 
double[m+1]; - Arrays.fill(best_gap_v,-1.0e40); - int [] gap_size_v = new int[m+1]; - double [] best_gap_h = new double[n+1]; - Arrays.fill(best_gap_h,-1.0e40); - int [] gap_size_h = new int[n+1]; - - // build smith-waterman matrix and keep backtrack info: - for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop - byte a_base = a[i-1]; // letter in a at the current pos - - final int row_offset = row_offset_1 + m; - - // On the entrance into the loop, row_offset_1 is the (linear) offset - // of the first element of row (i-1) and row_offset is the linear offset of the - // start of row i - - for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) { - - // data_offset_1 is linearized offset of element [i-1][j-1] - - final byte b_base = b[j-1]; // letter in b at the current pos - - // in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base); - double step_diag = sw[data_offset_1] + wd(a_base,b_base); - - // optimized "traversal" of all the matrix cells above the current one (i.e. traversing - // all 'step down' events that would end in the current cell. The optimized code - // does exactly the same thing as the commented out loop below. IMPORTANT: - // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! - - // if a gap (length 1) was just opened above, this is the cost of arriving to the current cell: - double prev_gap = sw[data_offset_1+1]+w_open; - - best_gap_v[j] += w_extend; // for the gaps that were already opened earlier, extending them by 1 costs w_extend - - if ( prev_gap > best_gap_v[j] ) { - // opening a gap just before the current cell results in better score than extending by one - // the best previously opened gap. 
This will hold for ALL cells below: since any gap - // once opened always costs w_extend to extend by another base, we will always get a better score - // by arriving to any cell below from the gap we just opened (prev_gap) rather than from the previous best gap - best_gap_v[j] = prev_gap; - gap_size_v[j] = 1; // remember that the best step-down gap from above has length 1 (we just opened it) - } else { - // previous best gap is still the best, even after extension by another base, so we just record that extension: - gap_size_v[j]++; - } - - final double step_down = best_gap_v[j] ; - final int kd = gap_size_v[j]; - - // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing - // all 'step right' events that would end in the current cell. The optimized code - // does exactly the same thing as the commented out loop below. IMPORTANT: - // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! - - final int data_offset = row_offset + j; // linearized offset of element [i][j] - prev_gap = sw[data_offset-1]+w_open; // what would it cost us to open length 1 gap just to the left from current cell - best_gap_h[i] += w_extend; // previous best gap would cost us that much if extended by another base - - if ( prev_gap > best_gap_h[i] ) { - // newly opened gap is better (score-wise) than any previous gap with the same row index i; since - // gap penalty is linear with k, this new gap location is going to remain better than any previous ones - best_gap_h[i] = prev_gap; - gap_size_h[i] = 1; - } else { - gap_size_h[i]++; - } - - final double step_right = best_gap_h[i]; - final int ki = gap_size_h[i]; - - if ( step_down > step_right ) { - if ( step_down > step_diag ) { - sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_down); - btrack[data_offset] = kd ; // positive=vertical - } else { - sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } else { - // step_down <= step_right 
- if ( step_right > step_diag ) { - sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_right); - btrack[data_offset] = -ki; // negative = horizontal - } else { - sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } - } - - // IMPORTANT, IMPORTANT, IMPORTANT: - // note that we update this (secondary) outer loop variable here, - // so that we DO NOT need to update it - // in the for() statement itself. - row_offset_1 = row_offset; - } - } - - - private void calculateCigar(int n, int m, double [] sw, int [] btrack) { - // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order - int p1 = 0, p2 = 0; - - double maxscore = Double.NEGATIVE_INFINITY; // sw scores are allowed to be negative - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) - - // look for largest score. we use >= combined with the traversal direction - // to ensure that if two scores are equal, the one closer to diagonal gets picked - for ( int i = 1, data_offset = m+1+m ; i < n+1 ; i++, data_offset += (m+1) ) { - // data_offset is the offset of [i][m] - if ( sw[data_offset] >= maxscore ) { - p1 = i; p2 = m ; maxscore = sw[data_offset]; - } - } - - for ( int j = 1, data_offset = n*(m+1)+1 ; j < m+1 ; j++, data_offset++ ) { - // data_offset is the offset of [n][j] - if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) { - p1 = n; - p2 = j ; - maxscore = sw[data_offset]; - segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment - } - } - - List lce = new ArrayList(5); - - if ( segment_length > 0 && DO_SOFTCLIP ) { - lce.add(makeElement(CLIP, segment_length)); - segment_length = 0; - } - - // we will be placing all insertions and deletions into sequence b, so the states are named w/regard - // to that sequence - - int state = MSTATE; - - int data_offset = p1*(m+1)+p2; // 
offset of element [p1][p2] - do { - int btr = btrack[data_offset]; - - int new_state; - int step_length = 1; - - if ( btr > 0 ) { - new_state = DSTATE; - step_length = btr; - } else if ( btr < 0 ) { - new_state = ISTATE; - step_length = (-btr); - } else new_state = MSTATE; // and step_length =1, already set above - - // move to next best location in the sw matrix: - switch( new_state ) { - case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in the sw matrix - case ISTATE: data_offset -= step_length; p2 -= step_length; break; // move left - case DSTATE: data_offset -= (m+1)*step_length; p1 -= step_length; break; // move up - } - - // now let's see if the state actually changed: - if ( new_state == state ) segment_length+=step_length; - else { - // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). - lce.add(makeElement(state, segment_length)); - segment_length = step_length; - state = new_state; - } -// next condition is equivalent to while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2: - } while ( p1 > 0 && p2 > 0 ); - - // post-process the last segment we are still keeping; - // NOTE: if reads "overhangs" the ref on the left (i.e. if p2>0) we are counting - // those extra bases sticking out of the ref into the first cigar element if DO_SOFTCLIP is false; - // otherwise they will be softclipped. For instance, - // if read length is 5 and alignment starts at offset -2 (i.e. read starts before the ref, and only - // last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M if - // DO_SOFTCLIP is false or 2S3M if DO_SOFTCLIP is true. - // The consumers need to check for the alignment offset and deal with it properly. 
- if (DO_SOFTCLIP ) { - lce.add(makeElement(state, segment_length)); - if ( p2> 0 ) lce.add(makeElement(CLIP, p2)); - alignment_offset = p1 ; - } else { - lce.add(makeElement(state, segment_length + p2)); - alignment_offset = p1 - p2; - } - - Collections.reverse(lce); - alignmentCigar = AlignmentUtils.consolidateCigar(new Cigar(lce)); - } - - private CigarElement makeElement(int state, int segment_length) { - CigarOperator o = null; - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; - case CLIP: o = CigarOperator.S; break; - } - return new CigarElement(segment_length,o); - } - - private double wd(byte x, byte y) { - return (x == y ? w_match : w_mismatch); - } - - public void printAlignment(byte[] ref, byte[] read) { - printAlignment(ref,read,100); - } - - public void printAlignment(byte[] ref, byte[] read, int width) { - StringBuilder bread = new StringBuilder(); - StringBuilder bref = new StringBuilder(); - StringBuilder match = new StringBuilder(); - - int i = 0; - int j = 0; - - final int offset = getAlignmentStart2wrt1(); - - Cigar cigar = getCigar(); - - if ( ! DO_SOFTCLIP ) { - - // we need to go through all the hassle below only if we do not do softclipping; - // otherwise offset is never negative - if ( offset < 0 ) { - for ( ; j < (-offset) ; j++ ) { - bread.append((char)read[j]); - bref.append(' '); - match.append(' '); - } - // at negative offsets, our cigar's first element carries overhanging bases - // that we have just printed above. Tweak the first element to - // exclude those bases. Here we create a new list of cigar elements, so the original - // list/original cigar are unchanged (they are unmodifiable anyway!) 
- - List tweaked = new ArrayList(); - tweaked.addAll(cigar.getCigarElements()); - tweaked.set(0,new CigarElement(cigar.getCigarElement(0).getLength()+offset, - cigar.getCigarElement(0).getOperator())); - cigar = new Cigar(tweaked); - } - } - - if ( offset > 0 ) { // note: the way this implementation works, cigar will ever start from S *only* if read starts before the ref, i.e. offset = 0 - for ( ; i < getAlignmentStart2wrt1() ; i++ ) { - bref.append((char)ref[i]); - bread.append(' '); - match.append(' '); - } - } - - for ( CigarElement e : cigar.getCigarElements() ) { - switch (e.getOperator()) { - case M : - for ( int z = 0 ; z < e.getLength() ; z++, i++, j++ ) { - bref.append((i= s.length() ) { - System.out.println(); - return; - } - int end = Math.min(start+width,s.length()); - System.out.println(s.substring(start,end)); - } -} +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.smithwaterman; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; + +import java.util.*; + +/** + * Pairwise discrete smith-waterman alignment + * + * ************************************************************************ + * **** IMPORTANT NOTE: **** + * **** This class assumes that all bytes come from UPPERCASED chars! **** + * ************************************************************************ + * + * User: asivache + * Date: Mar 23, 2009 + * Time: 1:54:54 PM + */ +public final class SWPairwiseAlignment { + private int alignment_offset; // offset of s2 w/respect to s1 + private Cigar alignmentCigar; + + private final Parameters parameters; + + private static final int MSTATE = 0; + private static final int ISTATE = 1; + private static final int DSTATE = 2; + private static final int CLIP = 3; + + protected static boolean cutoff = false; + private static boolean DO_SOFTCLIP = true; + + /** + * The SW scoring matrix, stored for debugging purposes if keepScoringMatrix is true + */ + protected double[] SW = null; + + /** + * Only for testing purposes in the SWPairwiseAlignmentMain function + * set to true to keep SW scoring matrix after align call + */ + protected static boolean keepScoringMatrix = false; + + /** + * Create a new SW pairwise aligner. 
+ * + * @deprecated in favor of constructors using the Parameter or ParameterSet class + */ + @Deprecated + public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) { + this(seq1, seq2, new Parameters(match, mismatch, open, extend)); + } + + /** + * Create a new SW pairwise aligner + * + * After creating the object the two sequences are aligned with an internal call to align(seq1, seq2) + * + * @param seq1 the first sequence we want to align + * @param seq2 the second sequence we want to align + * @param parameters the SW parameters to use + */ + public SWPairwiseAlignment(byte[] seq1, byte[] seq2, Parameters parameters) { + this.parameters = parameters; + align(seq1,seq2); + } + + /** + * Create a new SW pairwise aligner + * + * After creating the object the two sequences are aligned with an internal call to align(seq1, seq2) + * + * @param seq1 the first sequence we want to align + * @param seq2 the second sequence we want to align + * @param namedParameters the named parameter set to get our parameters from + */ + public SWPairwiseAlignment(byte[] seq1, byte[] seq2, SWParameterSet namedParameters) { + this(seq1, seq2, namedParameters.parameters); + } + + public SWPairwiseAlignment(byte[] seq1, byte[] seq2) { + this(seq1,seq2,SWParameterSet.ORIGINAL_DEFAULT); + } + + public Cigar getCigar() { return alignmentCigar ; } + + public int getAlignmentStart2wrt1() { return alignment_offset; } + + public void align(final byte[] a, final byte[] b) { + final int n = a.length; + final int m = b.length; + double [] sw = new double[(n+1)*(m+1)]; + if ( keepScoringMatrix ) SW = sw; + int [] btrack = new int[(n+1)*(m+1)]; + + calculateMatrix(a, b, sw, btrack); + calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) + } + + + private void calculateMatrix(final byte[] a, final byte[] b, double [] sw, int [] btrack ) { + final int n = a.length+1; + final int m = b.length+1; + 
+ //final double MATRIX_MIN_CUTOFF=-1e100; // never let matrix elements drop below this cutoff + final double MATRIX_MIN_CUTOFF; // never let matrix elements drop below this cutoff + if ( cutoff ) MATRIX_MIN_CUTOFF = 0.0; + else MATRIX_MIN_CUTOFF = -1e100; + + double [] best_gap_v = new double[m+1]; + Arrays.fill(best_gap_v,-1.0e40); + int [] gap_size_v = new int[m+1]; + double [] best_gap_h = new double[n+1]; + Arrays.fill(best_gap_h,-1.0e40); + int [] gap_size_h = new int[n+1]; + + // build smith-waterman matrix and keep backtrack info: + for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop + byte a_base = a[i-1]; // letter in a at the current pos + + final int row_offset = row_offset_1 + m; + + // On the entrance into the loop, row_offset_1 is the (linear) offset + // of the first element of row (i-1) and row_offset is the linear offset of the + // start of row i + + for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) { + + // data_offset_1 is linearized offset of element [i-1][j-1] + + final byte b_base = b[j-1]; // letter in b at the current pos + + // in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base); + double step_diag = sw[data_offset_1] + wd(a_base,b_base); + + // optimized "traversal" of all the matrix cells above the current one (i.e. traversing + // all 'step down' events that would end in the current cell. The optimized code + // does exactly the same thing as the commented out loop below. IMPORTANT: + // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! 
+ + // if a gap (length 1) was just opened above, this is the cost of arriving to the current cell: + double prev_gap = sw[data_offset_1+1]+parameters.w_open; + + best_gap_v[j] += parameters.w_extend; // for the gaps that were already opened earlier, extending them by 1 costs w_extend + + if ( prev_gap > best_gap_v[j] ) { + // opening a gap just before the current cell results in better score than extending by one + // the best previously opened gap. This will hold for ALL cells below: since any gap + // once opened always costs w_extend to extend by another base, we will always get a better score + // by arriving to any cell below from the gap we just opened (prev_gap) rather than from the previous best gap + best_gap_v[j] = prev_gap; + gap_size_v[j] = 1; // remember that the best step-down gap from above has length 1 (we just opened it) + } else { + // previous best gap is still the best, even after extension by another base, so we just record that extension: + gap_size_v[j]++; + } + + final double step_down = best_gap_v[j] ; + final int kd = gap_size_v[j]; + + // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing + // all 'step right' events that would end in the current cell. The optimized code + // does exactly the same thing as the commented out loop below. IMPORTANT: + // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! 
+ + final int data_offset = row_offset + j; // linearized offset of element [i][j] + prev_gap = sw[data_offset-1]+parameters.w_open; // what would it cost us to open length 1 gap just to the left from current cell + best_gap_h[i] += parameters.w_extend; // previous best gap would cost us that much if extended by another base + + if ( prev_gap > best_gap_h[i] ) { + // newly opened gap is better (score-wise) than any previous gap with the same row index i; since + // gap penalty is linear with k, this new gap location is going to remain better than any previous ones + best_gap_h[i] = prev_gap; + gap_size_h[i] = 1; + } else { + gap_size_h[i]++; + } + + final double step_right = best_gap_h[i]; + final int ki = gap_size_h[i]; + + if ( step_down > step_right ) { + if ( step_down > step_diag ) { + sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_down); + btrack[data_offset] = kd ; // positive=vertical + } else { + sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag); + btrack[data_offset] = 0; // 0 = diagonal + } + } else { + // step_down <= step_right + if ( step_right > step_diag ) { + sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_right); + btrack[data_offset] = -ki; // negative = horizontal + } else { + sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_diag); + btrack[data_offset] = 0; // 0 = diagonal + } + } + } + + // IMPORTANT, IMPORTANT, IMPORTANT: + // note that we update this (secondary) outer loop variable here, + // so that we DO NOT need to update it + // in the for() statement itself. + row_offset_1 = row_offset; + } + } + + + private void calculateCigar(int n, int m, double [] sw, int [] btrack) { + // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order + int p1 = 0, p2 = 0; + + double maxscore = Double.NEGATIVE_INFINITY; // sw scores are allowed to be negative + int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) + + // look for largest score. 
we use >= combined with the traversal direction + // to ensure that if two scores are equal, the one closer to diagonal gets picked + for ( int i = 1, data_offset = m+1+m ; i < n+1 ; i++, data_offset += (m+1) ) { + // data_offset is the offset of [i][m] + if ( sw[data_offset] >= maxscore ) { + p1 = i; p2 = m ; maxscore = sw[data_offset]; + } + } + + for ( int j = 1, data_offset = n*(m+1)+1 ; j < m+1 ; j++, data_offset++ ) { + // data_offset is the offset of [n][j] + if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) { + p1 = n; + p2 = j ; + maxscore = sw[data_offset]; + segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment + } + } + + List lce = new ArrayList(5); + + if ( segment_length > 0 && DO_SOFTCLIP ) { + lce.add(makeElement(CLIP, segment_length)); + segment_length = 0; + } + + // we will be placing all insertions and deletions into sequence b, so the states are named w/regard + // to that sequence + + int state = MSTATE; + + int data_offset = p1*(m+1)+p2; // offset of element [p1][p2] + do { + int btr = btrack[data_offset]; + + int new_state; + int step_length = 1; + + if ( btr > 0 ) { + new_state = DSTATE; + step_length = btr; + } else if ( btr < 0 ) { + new_state = ISTATE; + step_length = (-btr); + } else new_state = MSTATE; // and step_length =1, already set above + + // move to next best location in the sw matrix: + switch( new_state ) { + case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in the sw matrix + case ISTATE: data_offset -= step_length; p2 -= step_length; break; // move left + case DSTATE: data_offset -= (m+1)*step_length; p1 -= step_length; break; // move up + } + + // now let's see if the state actually changed: + if ( new_state == state ) segment_length+=step_length; + else { + // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). 
+ lce.add(makeElement(state, segment_length)); + segment_length = step_length; + state = new_state; + } +// next condition is equivalent to while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2: + } while ( p1 > 0 && p2 > 0 ); + + // post-process the last segment we are still keeping; + // NOTE: if reads "overhangs" the ref on the left (i.e. if p2>0) we are counting + // those extra bases sticking out of the ref into the first cigar element if DO_SOFTCLIP is false; + // otherwise they will be softclipped. For instance, + // if read length is 5 and alignment starts at offset -2 (i.e. read starts before the ref, and only + // last 3 bases of the read overlap with/align to the ref), the cigar will be still 5M if + // DO_SOFTCLIP is false or 2S3M if DO_SOFTCLIP is true. + // The consumers need to check for the alignment offset and deal with it properly. + if (DO_SOFTCLIP ) { + lce.add(makeElement(state, segment_length)); + if ( p2> 0 ) lce.add(makeElement(CLIP, p2)); + alignment_offset = p1 ; + } else { + lce.add(makeElement(state, segment_length + p2)); + alignment_offset = p1 - p2; + } + + Collections.reverse(lce); + alignmentCigar = AlignmentUtils.consolidateCigar(new Cigar(lce)); + } + + private CigarElement makeElement(int state, int segment_length) { + CigarOperator o = null; + switch(state) { + case MSTATE: o = CigarOperator.M; break; + case ISTATE: o = CigarOperator.I; break; + case DSTATE: o = CigarOperator.D; break; + case CLIP: o = CigarOperator.S; break; + } + return new CigarElement(segment_length,o); + } + + private double wd(byte x, byte y) { + return (x == y ? 
parameters.w_match : parameters.w_mismatch); + } + + public void printAlignment(byte[] ref, byte[] read) { + printAlignment(ref,read,100); + } + + public void printAlignment(byte[] ref, byte[] read, int width) { + StringBuilder bread = new StringBuilder(); + StringBuilder bref = new StringBuilder(); + StringBuilder match = new StringBuilder(); + + int i = 0; + int j = 0; + + final int offset = getAlignmentStart2wrt1(); + + Cigar cigar = getCigar(); + + if ( ! DO_SOFTCLIP ) { + + // we need to go through all the hassle below only if we do not do softclipping; + // otherwise offset is never negative + if ( offset < 0 ) { + for ( ; j < (-offset) ; j++ ) { + bread.append((char)read[j]); + bref.append(' '); + match.append(' '); + } + // at negative offsets, our cigar's first element carries overhanging bases + // that we have just printed above. Tweak the first element to + // exclude those bases. Here we create a new list of cigar elements, so the original + // list/original cigar are unchanged (they are unmodifiable anyway!) + + List tweaked = new ArrayList(); + tweaked.addAll(cigar.getCigarElements()); + tweaked.set(0,new CigarElement(cigar.getCigarElement(0).getLength()+offset, + cigar.getCigarElement(0).getOperator())); + cigar = new Cigar(tweaked); + } + } + + if ( offset > 0 ) { // note: the way this implementation works, cigar will ever start from S *only* if read starts before the ref, i.e. 
offset = 0 + for ( ; i < getAlignmentStart2wrt1() ; i++ ) { + bref.append((char)ref[i]); + bread.append(' '); + match.append(' '); + } + } + + for ( CigarElement e : cigar.getCigarElements() ) { + switch (e.getOperator()) { + case M : + for ( int z = 0 ; z < e.getLength() ; z++, i++, j++ ) { + bref.append((i= s.length() ) { + System.out.println(); + return; + } + int end = Math.min(start+width,s.length()); + System.out.println(s.substring(start,end)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java rename to public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java index a49d7e5e6..8c832fa75 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignmentMain.java @@ -23,8 +23,9 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.smithwaterman; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import java.util.ArrayList; @@ -100,6 +101,7 @@ public class SWPairwiseAlignmentMain { w_extend = (ext == null ? 
-2.0 : ext.doubleValue()); + SWPairwiseAlignment.keepScoringMatrix = true; SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend); System.out.println("start="+a.getAlignmentStart2wrt1()+", cigar="+a.getCigar()+ diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java new file mode 100644 index 000000000..100780023 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWParameterSet.java @@ -0,0 +1,51 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.smithwaterman; + +/** + * Handy named collection of common Smith-waterman parameter sets + * + * User: depristo + * Date: 4/11/13 + * Time: 12:02 PM + */ +public enum SWParameterSet { + // match=1, mismatch = -1/3, gap=-(1+k/3) + ORIGINAL_DEFAULT(new Parameters(1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0)), + + /** + * A standard set of values for NGS alignments + */ + STANDARD_NGS(new Parameters(5.0, -10.0, -22.0, -1.2)); + + protected Parameters parameters; + + SWParameterSet(final Parameters parameters) { + if ( parameters == null ) throw new IllegalArgumentException("parameters cannot be null"); + + this.parameters = parameters; + } +} From 0e627bce93d475959bc8810d17d23d4745dd36da Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 12 Apr 2013 12:42:47 -0400 Subject: [PATCH 174/226] Slight update to Path SW parameters. -- Decreasing the match value means that we no longer think that ACTG vs. ATCG is best modeled by 1M1D1M1I1M, since we don't get so much value for the middle C match that we can pay two gap open penalties to get it. 
--- .../sting/gatk/walkers/haplotypecaller/graphs/Path.java | 2 +- .../walkers/haplotypecaller/graphs/KBestPathsUnitTest.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index 9d2d680c9..f232a4ce0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -86,7 +86,7 @@ public class Path { // used in the bubble state machine to apply Smith-Waterman to the bubble sequence // these values were chosen via optimization against the NA12878 knowledge base - public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -20.0, -26.0, -0.1); + public static final Parameters NEW_SW_PARAMETERS = new Parameters(10, -20.0, -26.0, -0.1); private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index 90fdf1fa4..302866b55 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -445,6 +445,7 @@ public class KBestPathsUnitTest extends BaseTest { Arrays.asList("", "C", "1I"), Arrays.asList("AAA", "CGT", "3D3I"), Arrays.asList("TAT", "CAC", "3M"), + Arrays.asList("GCTG", "GTCG", "4M"), Arrays.asList("AAAAA", "", "5D"), Arrays.asList("", "AAAAA", "5I"), Arrays.asList("AAAAACC", "CCGGGGGG", "5D2M6I") @@ -460,7 +461,7 @@ public class KBestPathsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - 
@Test(dataProvider = "SystematicRefAltSWTestData", enabled = true) + @Test(dataProvider = "SystematicRefAltSWTestData", enabled = !DEBUG) public void testRefAltSW(final String prefix, final String end, final String refMid, final String altMid, final String midCigar) { // Construct the assembly graph SeqGraph graph = new SeqGraph(); From 403f9de12245696d52f9e687321dcb8f9f83b73b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 12 Apr 2013 10:42:03 -0400 Subject: [PATCH 175/226] Fix another caching issue with the PairHMM The Problem ---------- Some read x haplotype pairs were getting very low likelihood when caching is on. Turning it off seemed to give the right result. Solution -------- The HaplotypeCaller only initializes the PairHMM once and then feed it with a set of reads and haplotypes. The PairHMM always caches the matrix when the previous haplotype length is the same as the current one. This is not true when the read has changed. This commit adds another condition to zero the haplotype start index when the read changes. 
Summarized Changes ------------------ * Added the recacheReadValue check to flush the matrix (hapStartIndex = 0) * Updated related MD5's Bamboo link: http://gsabamboo.broadinstitute.org/browse/GSAUNSTABLE-PARALLEL9 --- ...rComplexAndSymbolicVariantsIntegrationTest.java | 6 +++--- .../HaplotypeCallerIntegrationTest.java | 14 +++++++------- .../sting/utils/pairhmm/PairHMM.java | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 6d85421c4..292760e89 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "490ecf6619740c01c81a463392ef23cf"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "7a035437f145b714cb844666b0736925"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "125e93deeb3b390a14d9b777aa2a220f"); + "aacfcc50c9aa5cfbec8ae8026d937ecd"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - 
"6957fd0e8a5bc66d2572a6ca8626fa7a"); + "eae75a3dc5c2e0fbdf016dbbafe425e2"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 573cc83fd..9cd225df3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "6fa37c449a800bcd59069be03ad2fff2"); + HCTest(CEUTRIO_BAM, "", "c8598545d1c76b470a7784e6b5c2ad4a"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "6140447b34bd1d08b3ed4d473d2c2f23"); + HCTest(NA12878_BAM, "", "0b2ca4482e92b9606be904cc25ba0988"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "cbd119f3d37a9af0b3539c13b8053bd9"); + "d00a604abe02586f803b1bb9d63af0f7"); } @Test @@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "9eeeada2f7145adfe08f538aad704982"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "1eab0eb7a184d981b021a249c3bd0401"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends 
WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "16ecd2f282bcb10dc32e7f3fe714a000"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "6ab938dede6838c983f84225d4103852"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6fd839641ee038048626fbd1154f173")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3")); executeTest("HCTestStructuralIndels: ", spec); } @@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("020b1a4feb82f050894f6066dc07cc4a")); + Arrays.asList("8a62597f2c005f373efbe398ab51a2f1")); executeTest("HC calling on a ReducedRead BAM", spec); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index 33cd191f6..6b57a1354 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -134,7 +134,7 @@ public abstract class PairHMM { paddedReadLength = readBases.length + 1; paddedHaplotypeLength = haplotypeBases.length + 1; - final int hapStartIndex = (previousHaplotypeBases == 
null || haplotypeBases.length != previousHaplotypeBases.length ) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, previousHaplotypeBases); + final int hapStartIndex = (previousHaplotypeBases == null || haplotypeBases.length != previousHaplotypeBases.length || recacheReadValues) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, previousHaplotypeBases); double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues); From f11c8d22d47218c7d0dfbdfb5c19cbdd336a5df4 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 13 Apr 2013 08:21:48 -0400 Subject: [PATCH 178/226] Updating java 7 md5's to java 6 md5's --- ...typeCallerComplexAndSymbolicVariantsIntegrationTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 292760e89..f09711094 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "7a035437f145b714cb844666b0736925"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d9c176fe6de26bb8b289d55a840d7b8b"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class 
HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "aacfcc50c9aa5cfbec8ae8026d937ecd"); + "125e93deeb3b390a14d9b777aa2a220f"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "eae75a3dc5c2e0fbdf016dbbafe425e2"); + "6957fd0e8a5bc66d2572a6ca8626fa7a"); } } From a063e79597fd82ae15264ec07ae48bcdbc285593 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 13 Apr 2013 09:07:13 -0400 Subject: [PATCH 179/226] Updating the exampleGRP.grp test file It had been generated with an old version of BQSRv2 and wasn't compatible with exampleBAM anymore. --- public/testdata/exampleGRP.grp | 2612 ++++++++++++++------------------ 1 file changed, 1106 insertions(+), 1506 deletions(-) diff --git a/public/testdata/exampleGRP.grp b/public/testdata/exampleGRP.grp index 2ec55ec57..61a10ac4a 100644 --- a/public/testdata/exampleGRP.grp +++ b/public/testdata/exampleGRP.grp @@ -1,1518 +1,1118 @@ #:GATKReport.v1.1:5 -#:GATKTable:2:14::; +#:GATKTable:2:18:%s:%s:; #:GATKTable:Arguments:Recalibration argument collection values used in this run -Argument Value -covariate null -default_platform null -deletions_context_size 8 -force_platform null -insertions_context_size 8 -insertions_default_quality 45 -low_quality_tail 2 -mismatches_context_size 2 -mismatches_default_quality -1 -no_standard_covs false -quantizing_levels 16 -run_without_dbsnp false -solid_nocall_strategy THROW_EXCEPTION -solid_recal_mode SET_Q_ZERO +Argument Value +binary_tag_name null +covariate ReadGroupCovariate,QualityScoreCovariate,ContextCovariate,CycleCovariate +default_platform null +deletions_default_quality 45 +force_platform null +indels_context_size 3 +insertions_default_quality 45 +low_quality_tail 2 +maximum_cycle_value 500 +mismatches_context_size 2 
+mismatches_default_quality -1 +no_standard_covs false +plot_pdf_file null +quantizing_levels 16 +recalibration_report null +run_without_dbsnp false +solid_nocall_strategy THROW_EXCEPTION +solid_recal_mode SET_Q_ZERO -#:GATKTable:3:94:::; +#:GATKTable:3:94:%s:%s:%s:; #:GATKTable:Quantized:Quality quantization map QualityScore Count QuantizedScore -0 20 3 -1 0 3 -2 6 3 -3 1041 3 -4 8 3 -5 190 3 -6 102 3 -7 28 7 -8 795 8 -9 0 93 -10 0 93 -11 0 93 -12 0 93 -13 0 93 -14 0 93 -15 0 93 -16 0 93 -17 0 93 -18 0 93 -19 0 93 -20 0 93 -21 0 93 -22 0 93 -23 0 93 -24 0 93 -25 0 93 -26 0 93 -27 0 93 -28 0 93 -29 0 93 -30 0 93 -31 0 93 -32 0 93 -33 0 93 -34 0 93 -35 0 93 -36 0 93 -37 0 93 -38 0 93 -39 0 93 -40 0 93 -41 0 93 -42 0 93 -43 0 93 -44 0 93 -45 0 93 -46 0 93 -47 0 93 -48 0 93 -49 0 93 -50 0 93 -51 0 93 -52 0 93 -53 0 93 -54 0 93 -55 0 93 -56 0 93 -57 0 93 -58 0 93 -59 0 93 -60 0 93 -61 0 93 -62 0 93 -63 0 93 -64 0 93 -65 0 93 -66 0 93 -67 0 93 -68 0 93 -69 0 93 -70 0 93 -71 0 93 -72 0 93 -73 0 93 -74 0 93 -75 0 93 -76 0 93 -77 0 93 -78 0 93 -79 0 93 -80 0 93 -81 0 93 -82 0 82 -83 0 83 -84 0 84 -85 0 85 -86 0 86 -87 0 87 -88 0 88 -89 0 89 -90 0 90 -91 0 91 -92 0 92 -93 0 93 + 0 0 8 + 1 0 8 + 2 0 8 + 3 0 8 + 4 0 8 + 5 0 8 + 6 11 8 + 7 0 8 + 8 7 8 + 9 4 8 + 10 1 8 + 11 2 8 + 12 4 19 + 13 3 19 + 14 1 19 + 15 5 19 + 16 10 19 + 17 6 19 + 18 7 19 + 19 15 19 + 20 5 19 + 21 17 19 + 22 9 19 + 23 15 23 + 24 20 24 + 25 15 13 + 26 6 13 + 27 22 27 + 28 15 28 + 29 20 29 + 30 20 30 + 31 25 31 + 32 32 32 + 33 35 33 + 34 36 34 + 35 0 93 + 36 0 93 + 37 0 93 + 38 0 93 + 39 0 93 + 40 0 93 + 41 0 93 + 42 0 93 + 43 0 93 + 44 0 93 + 45 736 45 + 46 0 93 + 47 0 93 + 48 0 93 + 49 0 93 + 50 0 93 + 51 0 93 + 52 0 93 + 53 0 93 + 54 0 93 + 55 0 93 + 56 0 93 + 57 0 93 + 58 0 93 + 59 0 93 + 60 0 93 + 61 0 93 + 62 0 93 + 63 0 93 + 64 0 93 + 65 0 93 + 66 0 93 + 67 0 93 + 68 0 93 + 69 0 93 + 70 0 93 + 71 0 93 + 72 0 93 + 73 0 93 + 74 0 93 + 75 0 93 + 76 0 93 + 77 0 93 + 78 0 93 + 79 0 93 + 80 0 93 + 81 0 
93 + 82 0 93 + 83 0 93 + 84 0 93 + 85 0 93 + 86 0 93 + 87 0 93 + 88 0 93 + 89 0 93 + 90 0 93 + 91 0 93 + 92 0 93 + 93 0 93 -#:GATKTable:6:3:%s:%s:%.4f:%.4f:%d:%d:; +#:GATKTable:6:3:%s:%s:%.4f:%.4f:%d:%.2f:; #:GATKTable:RecalTable0: -ReadGroup EventType EmpiricalQuality EstimatedQReported Observations Errors -exampleBAM.bam.bam D 25.8092 45.0000 380 0 -exampleBAM.bam.bam M 14.0483 15.4820 380 14 -exampleBAM.bam.bam I 25.8092 45.0000 380 0 +ReadGroup EventType EmpiricalQuality EstimatedQReported Observations Errors +exampleBAM.bam M 17.0000 17.4959 368 11.00 +exampleBAM.bam I 45.0000 45.0000 368 0.00 +exampleBAM.bam D 45.0000 45.0000 368 0.00 -#:GATKTable:6:32:%s:%s:%s:%.4f:%d:%d:; +#:GATKTable:6:30:%s:%s:%s:%.4f:%d:%.2f:; #:GATKTable:RecalTable1: -ReadGroup QualityScore EventType EmpiricalQuality Observations Errors -exampleBAM.bam.bam 32 M 15.1851 32 0 -exampleBAM.bam.bam 19 M 9.0309 15 1 -exampleBAM.bam.bam 33 M 15.5630 35 0 -exampleBAM.bam.bam 18 M 6.0206 7 1 -exampleBAM.bam.bam 34 M 15.6820 36 0 -exampleBAM.bam.bam 17 M 5.4407 6 1 -exampleBAM.bam.bam 16 M 7.4036 10 1 -exampleBAM.bam.bam 23 M 12.0412 15 0 -exampleBAM.bam.bam 6 M 4.7712 11 3 -exampleBAM.bam.bam 45 I 25.8092 380 0 -exampleBAM.bam.bam 22 M 10.0000 9 0 -exampleBAM.bam.bam 4 M 4.7712 5 1 -exampleBAM.bam.bam 21 M 12.5527 17 0 -exampleBAM.bam.bam 5 M 4.2597 7 2 -exampleBAM.bam.bam 20 M 4.7712 5 1 -exampleBAM.bam.bam 27 M 13.6173 22 0 -exampleBAM.bam.bam 10 M 3.0103 1 0 -exampleBAM.bam.bam 26 M 8.4510 6 0 -exampleBAM.bam.bam 11 M 1.7609 2 1 -exampleBAM.bam.bam 8 M 6.0206 7 1 -exampleBAM.bam.bam 25 M 12.0412 15 0 -exampleBAM.bam.bam 9 M 6.9897 4 0 -exampleBAM.bam.bam 24 M 10.2119 20 1 -exampleBAM.bam.bam 31 M 14.1497 25 0 -exampleBAM.bam.bam 14 M 3.0103 1 0 -exampleBAM.bam.bam 30 M 13.2222 20 0 -exampleBAM.bam.bam 15 M 7.7815 5 0 -exampleBAM.bam.bam 12 M 6.9897 4 0 -exampleBAM.bam.bam 29 M 13.2222 20 0 -exampleBAM.bam.bam 45 D 25.8092 380 0 -exampleBAM.bam.bam 13 M 6.0206 3 0 -exampleBAM.bam.bam 28 M 
12.0412 15 0 +ReadGroup QualityScore EventType EmpiricalQuality Observations Errors +exampleBAM.bam 6 M 6.0000 11 3.00 +exampleBAM.bam 8 M 8.0000 7 1.00 +exampleBAM.bam 9 M 9.0000 4 0.00 +exampleBAM.bam 10 M 10.0000 1 0.00 +exampleBAM.bam 11 M 11.0000 2 1.00 +exampleBAM.bam 12 M 12.0000 4 0.00 +exampleBAM.bam 13 M 13.0000 3 0.00 +exampleBAM.bam 14 M 14.0000 1 0.00 +exampleBAM.bam 15 M 15.0000 5 0.00 +exampleBAM.bam 16 M 16.0000 10 1.00 +exampleBAM.bam 17 M 17.0000 6 1.00 +exampleBAM.bam 18 M 18.0000 7 1.00 +exampleBAM.bam 19 M 19.0000 15 1.00 +exampleBAM.bam 20 M 20.0000 5 1.00 +exampleBAM.bam 21 M 21.0000 17 0.00 +exampleBAM.bam 22 M 22.0000 9 0.00 +exampleBAM.bam 23 M 23.0000 15 0.00 +exampleBAM.bam 24 M 24.0000 20 1.00 +exampleBAM.bam 25 M 25.0000 15 0.00 +exampleBAM.bam 26 M 26.0000 6 0.00 +exampleBAM.bam 27 M 27.0000 22 0.00 +exampleBAM.bam 28 M 28.0000 15 0.00 +exampleBAM.bam 29 M 29.0000 20 0.00 +exampleBAM.bam 30 M 30.0000 20 0.00 +exampleBAM.bam 31 M 31.0000 25 0.00 +exampleBAM.bam 32 M 32.0000 32 0.00 +exampleBAM.bam 33 M 33.0000 35 0.00 +exampleBAM.bam 34 M 34.0000 36 0.00 +exampleBAM.bam 45 I 45.0000 368 0.00 +exampleBAM.bam 45 D 45.0000 368 0.00 -#:GATKTable:8:1354:%s:%s:%s:%s:%s:%.4f:%d:%d:; +#:GATKTable:8:952:%s:%s:%s:%s:%s:%.4f:%d:%.2f:; #:GATKTable:RecalTable2: -ReadGroup QualityScore CovariateValue CovariateName EventType EmpiricalQuality Observations Errors -exampleBAM.bam.bam 45 TGAAAGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTATTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGCCTCGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTGTGTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTGTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTAAGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTATTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 23 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 27 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 ATTCTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTAATCTC Context I 3.0103 1 0 -exampleBAM.bam.bam 34 GC Context 
M 4.7712 2 0 -exampleBAM.bam.bam 8 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 TAGAGTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 9 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTCGGG Context I 6.0206 3 0 -exampleBAM.bam.bam 45 AGTTTCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CATTTCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 16 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 5 76 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGATAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 53 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 57 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 25 52 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGCAGCC Context D 3.0103 1 0 -exampleBAM.bam.bam 33 CT Context M 8.4510 6 0 -exampleBAM.bam.bam 45 AAGTGACA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGTGACAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGAGTTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTCTTTGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTGAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 12 25 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 41 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 21 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 26 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ACCTGGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACAGCAA Context D 3.0103 1 0 -exampleBAM.bam.bam 20 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AGGTGGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCAAAATC Context I 3.0103 1 0 -exampleBAM.bam.bam 27 TA Context M 6.9897 4 0 -exampleBAM.bam.bam 27 18 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AAAATCTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 22 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 26 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 33 76 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTATTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTCAATGT Context I 3.0103 1 0 -exampleBAM.bam.bam 21 73 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 17 4 Cycle M 
3.0103 1 0 -exampleBAM.bam.bam 8 17 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 ATCGTGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGATCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GATCGTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 52 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 56 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 9 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 23 CT Context M 4.7712 2 0 -exampleBAM.bam.bam 31 26 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 ATGTGAAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTACTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACACAGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 26 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGGTTTGG Context D 4.7712 2 0 -exampleBAM.bam.bam 33 8 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 34 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATTCTTAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GAGCCTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 20 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTAGGG Context D 4.7712 2 0 -exampleBAM.bam.bam 33 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTGCAAAG Context I 3.0103 1 0 -exampleBAM.bam.bam 6 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 32 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 29 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 13 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 21 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTAATGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TATTATTG Context D 3.0103 1 0 -exampleBAM.bam.bam 24 52 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTCAGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GACATGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATCATGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 21 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 25 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 34 47 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 25 Cycle M 3.0103 1 0 
-exampleBAM.bam.bam 19 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 GG Context M 3.9794 4 1 -exampleBAM.bam.bam 9 16 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCAGTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCACATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAAGTGAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGACATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 55 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 59 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CATGATCG Context I 3.0103 1 0 -exampleBAM.bam.bam 16 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 32 43 Cycle M 6.0206 3 0 -exampleBAM.bam.bam 19 33 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GTATTTGC Context D 3.0103 1 0 -exampleBAM.bam.bam 26 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTTAAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 33 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 11 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 18 1 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ACCCAGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAAGACAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTTTGC Context D 3.0103 1 0 -exampleBAM.bam.bam 27 16 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 32 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 21 44 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTACTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGCTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 16 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 25 21 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 22 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CAGGCCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 20 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 24 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 30 26 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGTATTT Context D 3.0103 1 0 -exampleBAM.bam.bam 24 53 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 19 70 
Cycle M -0.0000 1 1 -exampleBAM.bam.bam 25 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AGGCCACC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 54 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 58 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 ACTTTCAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAAGTGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTGATAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AATGTGAA Context I 3.0103 1 0 -exampleBAM.bam.bam 9 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 19 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 28 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CGGGTTTG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 TCTTTGTA Context I 3.0103 1 0 -exampleBAM.bam.bam 33 10 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GTTCGGGT Context I 6.0206 3 0 -exampleBAM.bam.bam 27 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 27 17 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CAGCAAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGCAGCCT Context I 3.0103 1 0 -exampleBAM.bam.bam 20 GT Context M -0.0000 1 1 -exampleBAM.bam.bam 45 TGGAGCCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTGGCC Context I 3.0103 1 0 -exampleBAM.bam.bam 28 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 40 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TGTGTCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCAATAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCTCCAGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 49 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 61 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CCTCGTCC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGCACCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 22 44 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 AGGTTATC Context I 3.0103 1 0 -exampleBAM.bam.bam 34 41 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGTTC Context I 3.0103 1 0 
-exampleBAM.bam.bam 45 TTCTGTGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGTTGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 24 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTCACA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCGGGTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 33 73 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 9 52 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 19 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 31 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 25 TA Context M 6.0206 3 0 -exampleBAM.bam.bam 34 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 28 25 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAGATTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTGGGG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 GGCTGGGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GATTAGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 5 GG Context M 3.0103 3 1 -exampleBAM.bam.bam 32 15 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 22 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCAGGC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGCCAGGC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTCTTTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGAACTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 26 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTCTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGATAACC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTTTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGCTTTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 5 46 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 29 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATCCATTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 48 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 60 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GATCCAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATGAGTC Context D 3.0103 1 0 -exampleBAM.bam.bam 24 TT Context 
M 3.0103 3 1 -exampleBAM.bam.bam 45 TCTTTATA Context I 3.0103 1 0 -exampleBAM.bam.bam 6 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 23 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 34 40 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 18 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 30 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CAAAATCT Context I 3.0103 1 0 -exampleBAM.bam.bam 22 15 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGGTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCATGGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCTAATCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 33 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 CA Context M 6.9897 4 0 -exampleBAM.bam.bam 45 CCCAGATC Context D 3.0103 1 0 -exampleBAM.bam.bam 18 36 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 16 70 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGTATTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 33 46 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTAGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 19 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 32 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 32 14 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 12 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTGGCCT Context I 3.0103 1 0 -exampleBAM.bam.bam 4 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 27 53 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 23 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTATTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 5 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATAAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 51 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 63 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CACCCAGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CGTGAGTG Context 
D 3.0103 1 0 -exampleBAM.bam.bam 45 GCTTTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATGGTGGC Context D 3.0103 1 0 -exampleBAM.bam.bam 34 CT Context M 4.7712 2 0 -exampleBAM.bam.bam 4 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCGGGTTT Context I 4.7712 2 0 -exampleBAM.bam.bam 24 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCATGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CACATGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 17 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 29 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 ATCAATAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ACCATGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 32 GT Context M 8.4510 6 0 -exampleBAM.bam.bam 19 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 45 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCATTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GATAACCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AACTGGGA Context I 3.0103 1 0 -exampleBAM.bam.bam 4 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 33 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TCAGGCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGCACTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCACTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTCCAGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 6 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 23 15 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GATATAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTAGAGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 50 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 62 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GCCACCAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGTTCGG Context D 6.0206 3 0 -exampleBAM.bam.bam 24 TC Context M 6.0206 3 0 -exampleBAM.bam.bam 25 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 16 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 28 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 ACATGGTA 
Context I 3.0103 1 0 -exampleBAM.bam.bam 16 34 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 AATCTCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTCACT Context I 3.0103 1 0 -exampleBAM.bam.bam 22 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 ATATCAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAATGTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTCAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 24 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGGGGTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGCAATCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGGTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTAATGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 30 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 GG Context M 7.7815 5 0 -exampleBAM.bam.bam 20 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 20 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 ATTAGATT Context D 3.0103 1 0 -exampleBAM.bam.bam 33 44 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCTGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGAGATT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTGGGC Context I 3.0103 1 0 -exampleBAM.bam.bam 21 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 46 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATATAAAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTTTCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACTTTCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CCATTTCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGGCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 11 TT Context M -0.0000 1 1 -exampleBAM.bam.bam 45 TTTCACTG Context I 3.0103 1 0 -exampleBAM.bam.bam 33 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TCGTGAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TACTCTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAATGAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGTCTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 
45 GGCTTTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 22 70 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTTTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGCCAGGC Context I 3.0103 1 0 -exampleBAM.bam.bam 33 1 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 TTTCAGGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TATTCTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGATAACC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTCTTTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGAACTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 21 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 32 33 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 27 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGCTGGGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GATTAGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 33 35 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAGATTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTGGGG Context D 4.7712 2 0 -exampleBAM.bam.bam 19 CT Context M 1.7609 2 1 -exampleBAM.bam.bam 45 19 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 31 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 TGTTGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTGTGT Context I 3.0103 1 0 -exampleBAM.bam.bam 24 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCGGGTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTCACA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 30 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 30 17 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 33 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 36 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 17 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 21 64 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 16 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CCTCGTCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 49 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 61 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGGTTATC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGCACCCA Context D 3.0103 
1 0 -exampleBAM.bam.bam 45 TGTGTCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCAATAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTCCAGG Context D 3.0103 1 0 -exampleBAM.bam.bam 6 AA Context M 4.7712 2 0 -exampleBAM.bam.bam 31 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 31 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 8 58 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTGGCCT Context D 3.0103 1 0 -exampleBAM.bam.bam 18 10 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 18 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 27 57 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TGTATTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTAGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 13 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 20 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CCCAGATC Context I 3.0103 1 0 -exampleBAM.bam.bam 32 2 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 27 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCATGGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTAATCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 30 TG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 18 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 30 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CCAGGTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAAAATCT Context D 3.0103 1 0 -exampleBAM.bam.bam 25 31 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 6 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 17 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 23 35 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTTTATA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GATCCAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 48 Cycle D 
7.7815 5 0 -exampleBAM.bam.bam 45 60 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ATCCATTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATGAGTC Context I 3.0103 1 0 -exampleBAM.bam.bam 31 TA Context M 4.7712 2 0 -exampleBAM.bam.bam 21 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 34 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTCCAGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 18 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 33 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCAGGCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTGCACTT Context D 3.0103 1 0 -exampleBAM.bam.bam 28 53 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTCACTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 19 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 32 1 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GATAACCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AACTGGGA Context D 3.0103 1 0 -exampleBAM.bam.bam 16 73 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCATTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 21 66 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 AT Context M 8.4510 6 0 -exampleBAM.bam.bam 16 47 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CACATGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 17 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 29 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ATCAATAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACCATGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCGGGTTT Context D 4.7712 2 0 -exampleBAM.bam.bam 45 TCCATGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 6 AG Context M -0.0000 1 1 -exampleBAM.bam.bam 6 4 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATAAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 51 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 63 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CGTGAGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CACCCAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 16 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 5 70 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GCTTTATT Context D 
3.0103 1 0 -exampleBAM.bam.bam 45 ATGGTGGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTATTATT Context D 3.0103 1 0 -exampleBAM.bam.bam 34 64 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 AC Context M 6.0206 3 0 -exampleBAM.bam.bam 33 2 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCACTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCGTGAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTGTCTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAATGAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TACTCTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CACTTTCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCATTTCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATATAAAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTTTCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGGCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 29 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 10 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TTTCTGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 33 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTTGGGC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGAGATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATTAGATT Context I 3.0103 1 0 -exampleBAM.bam.bam 34 4 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGGTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGCAATCC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGGGTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAGGGTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTAATGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 30 18 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 TA Context M 6.9897 4 0 -exampleBAM.bam.bam 45 16 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 28 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ACATGGTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTCAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAATGTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATCTCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 
ATTTCACT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATATCAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 8 57 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 34 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 16 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 GGGTTCGG Context I 6.0206 3 0 -exampleBAM.bam.bam 45 CTAGAGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 50 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 62 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GATATAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCACCAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACCTGGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 5 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AGGTGGAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCAAAATC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACAGCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 28 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 33 39 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 23 64 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 27 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AAGTGACA Context D 3.0103 1 0 -exampleBAM.bam.bam 5 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AGAGTTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGTGACAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTGAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTCTTTGT Context I 3.0103 1 0 -exampleBAM.bam.bam 33 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TGGCAGCC Context I 3.0103 1 0 -exampleBAM.bam.bam 4 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 29 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 34 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AGTTTCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CATTTCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 53 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 57 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CATGATAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAGAGTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTCGGG Context D 6.0206 
3 0 -exampleBAM.bam.bam 45 CTTTATTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTGTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGCCTCGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTGTGTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTAAGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATTCTATT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTAATCTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 23 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 27 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 30 21 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGAAAGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTATTA Context I 3.0103 1 0 -exampleBAM.bam.bam 23 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTAGGG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 GTGCAAAG Context D 3.0103 1 0 -exampleBAM.bam.bam 28 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 ATTCTTAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GAGCCTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 27 31 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 19 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 4 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGGTTTGG Context I 4.7712 2 0 -exampleBAM.bam.bam 33 AG Context M 6.0206 3 0 -exampleBAM.bam.bam 28 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATTACTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ACACAGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATGTGAAC Context I 3.0103 1 0 -exampleBAM.bam.bam 32 36 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 29 TA Context M 4.7712 2 0 -exampleBAM.bam.bam 34 70 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 17 76 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 30 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 25 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATCGTGAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GATCGTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 52 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 56 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CCAGATCC 
Context D 3.0103 1 0 -exampleBAM.bam.bam 16 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 8 63 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 14 TG Context M 3.0103 1 0 -exampleBAM.bam.bam 23 AT Context M 6.0206 3 0 -exampleBAM.bam.bam 19 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTATTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTCAATGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AAAATCTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 22 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 26 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 34 2 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 6 68 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 23 66 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 28 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 5 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTACTC Context D 3.0103 1 0 -exampleBAM.bam.bam 33 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGCTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 28 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 4 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 29 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 18 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 AAAGACAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTTTGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACCCAGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTTAAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 13 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTATTTGC Context I 3.0103 1 0 -exampleBAM.bam.bam 33 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 23 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 8 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 22 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGATCG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 55 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 59 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 TCCAGTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTGACATG Context I 3.0103 1 0 
-exampleBAM.bam.bam 45 TTCACATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TAAGTGAC Context D 3.0103 1 0 -exampleBAM.bam.bam 4 64 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 25 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 22 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 CTTTCAGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATCATGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 21 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 25 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GACATGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 30 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTATTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTTAATGA Context D 3.0103 1 0 -exampleBAM.bam.bam 32 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 23 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGAGCCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTGGCC Context D 3.0103 1 0 -exampleBAM.bam.bam 28 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CAGCAAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGCAGCCT Context D 3.0103 1 0 -exampleBAM.bam.bam 34 68 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTTTGTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTCGGGT Context D 6.0206 3 0 -exampleBAM.bam.bam 28 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 18 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CGGGTTTG Context D 4.7712 2 0 -exampleBAM.bam.bam 34 34 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 30 52 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AGGCCACC Context D 3.0103 1 0 -exampleBAM.bam.bam 20 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AAAGTGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATTGATAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATGTGAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 54 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 58 Cycle I 7.7815 5 0 
-exampleBAM.bam.bam 45 ACTTTCAG Context D 3.0103 1 0 -exampleBAM.bam.bam 23 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 66 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 15 TG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGTATTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 20 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 24 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CAGGCCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 23 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 17 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 CG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGATATA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTTAAG Context I 3.0103 1 0 -exampleBAM.bam.bam 15 14 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GAACTGGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 6 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 10 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GGGCTGGG Context D 3.0103 1 0 -exampleBAM.bam.bam 31 10 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 31 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 30 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTCTAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TATTTGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 24 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCTTTGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAGGCACC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 36 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 40 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 29 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 21 29 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAATCTCC Context I 3.0103 1 0 -exampleBAM.bam.bam 15 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 33 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCTGGGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 66 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CTTGGCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 
GGCCACCA Context D 3.0103 1 0 -exampleBAM.bam.bam 19 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TTCAGGCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTAATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGTGGAGC Context I 3.0103 1 0 -exampleBAM.bam.bam 28 GG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 GAGATTAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 7 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 11 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 TTACTCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 30 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTATATC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTTAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTATTACT Context D 3.0103 1 0 -exampleBAM.bam.bam 31 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 34 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 36 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ACAGCAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGTGCAAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 37 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 41 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 TCCAGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGAGTGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTATCATG Context D 3.0103 1 0 -exampleBAM.bam.bam 24 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 29 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 32 57 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 67 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 18 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTGGAGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGATTTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAATCTAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTGAAAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGGCACCC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTGTGTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGCTG Context D 3.0103 1 0 -exampleBAM.bam.bam 28 47 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGGG Context I 3.0103 1 0 -exampleBAM.bam.bam 19 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 29 45 Cycle M 
3.0103 1 0 -exampleBAM.bam.bam 45 CCTGGAGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCAGGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTATTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 33 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTATTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAACCTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 30 CA Context M 6.0206 3 0 -exampleBAM.bam.bam 15 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GACACAGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AACCTGGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 4 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 8 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 25 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 6 63 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 TTTGCAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGCACT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTAAGTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGAGTCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 22 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTCGTCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 38 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 42 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 34 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 CG Context M 3.0103 1 0 -exampleBAM.bam.bam 31 8 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 27 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 26 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATAAAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGGTTGG Context D 4.7712 2 0 -exampleBAM.bam.bam 45 64 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 76 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GATTCTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGACACAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGTGTTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 29 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 GG Context M 6.9897 4 0 -exampleBAM.bam.bam 8 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTGAACTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 
TTGGCTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 9 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCTGAAAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTGCAC Context D 3.0103 1 0 -exampleBAM.bam.bam 20 29 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 12 40 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 24 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGGTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCACCCAG Context D 3.0103 1 0 -exampleBAM.bam.bam 16 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATCGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 5 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 9 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 30 CC Context M 4.7712 2 0 -exampleBAM.bam.bam 23 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 43 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 ATAACCTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 39 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 43 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GAAAGTGC Context D 3.0103 1 0 -exampleBAM.bam.bam 24 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 24 6 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 TTATTGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 34 63 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 CT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 65 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 18 TT Context M -0.0000 1 1 -exampleBAM.bam.bam 45 GATTTTTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGTTCTAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TAAAGACA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGAGTGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCACAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGGAGCC Context D 3.0103 1 0 -exampleBAM.bam.bam 19 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 5 26 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 AAGTGCAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTGCAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATCTAATC 
Context I 3.0103 1 0 -exampleBAM.bam.bam 20 28 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 GGTATTAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGTGAACT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGCCTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 33 57 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 60 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 47 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TCGTCCAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGATTCTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATCCAGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 32 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 44 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CATGATTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAATCCAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAGTTCTA Context I 3.0103 1 0 -exampleBAM.bam.bam 34 26 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 8 AT Context M -0.0000 1 1 -exampleBAM.bam.bam 45 GGGTTAGG Context D 4.7712 2 0 -exampleBAM.bam.bam 30 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATATCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCAATCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGAGCCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAGATCCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 2 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 14 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GAGTGTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 32 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 21 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGTCTTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCAATGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGCTTTA Context I 3.0103 1 0 -exampleBAM.bam.bam 13 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CCATGATT Context D 3.0103 1 0 -exampleBAM.bam.bam 29 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 19 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATCAATA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGGGCT Context I 3.0103 1 0 
-exampleBAM.bam.bam 45 TTGGTTAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGCACTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTAGAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 26 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 20 57 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTCGTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 70 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 74 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 18 22 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 66 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 15 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 31 GC Context M 6.0206 3 0 -exampleBAM.bam.bam 45 33 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 45 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GGAGATTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGATCCAG Context D 3.0103 1 0 -exampleBAM.bam.bam 16 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 ATGGTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATCTCCAG Context D 3.0103 1 0 -exampleBAM.bam.bam 13 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGTATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TATCATGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGACATGG Context I 3.0103 1 0 -exampleBAM.bam.bam 17 TT Context M 3.0103 3 1 -exampleBAM.bam.bam 31 45 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 8 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 34 27 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 3 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 15 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 TTATATCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGATATAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTATCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCACTGAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGGCCTG Context D 3.0103 1 0 -exampleBAM.bam.bam 19 21 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 32 31 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CACTGATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATAAAGAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCACTTTC Context I 
3.0103 1 0 -exampleBAM.bam.bam 45 CAGCCTCG Context I 3.0103 1 0 -exampleBAM.bam.bam 28 CT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 71 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 75 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 AGCAAAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGCAATC Context I 3.0103 1 0 -exampleBAM.bam.bam 33 29 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 26 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTTGGG Context D 4.7712 2 0 -exampleBAM.bam.bam 45 GGGTTGGG Context D 6.0206 3 0 -exampleBAM.bam.bam 24 3 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTTTCTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTAGATTT Context D 3.0103 1 0 -exampleBAM.bam.bam 16 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 34 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 46 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 ATGAGTCA Context D 3.0103 1 0 -exampleBAM.bam.bam 27 65 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 12 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 GG Context M 6.9897 4 0 -exampleBAM.bam.bam 34 58 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 33 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 15 8 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 26 67 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 12 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GGCCTGAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGATTAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCAGCCTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CATGGTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AATCCATT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTATAT Context D 3.0103 1 0 -exampleBAM.bam.bam 29 76 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GTTAGGGT Context I 6.0206 3 0 -exampleBAM.bam.bam 45 ACTCTTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGCCTTTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACATGATC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATTATTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 32 28 
Cycle M 4.7712 2 0 -exampleBAM.bam.bam 29 42 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 AT Context M 6.9897 4 0 -exampleBAM.bam.bam 45 TGGGTTAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGTTCG Context D 3.0103 1 0 -exampleBAM.bam.bam 26 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTTCTGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTCG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CGGGTTCG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 68 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 72 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 AGTCAATG Context I 3.0103 1 0 -exampleBAM.bam.bam 29 8 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 CG Context M 4.7712 2 0 -exampleBAM.bam.bam 4 29 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 16 TT Context M 3.9794 4 1 -exampleBAM.bam.bam 45 CACCATGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 35 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 47 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CTATTCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATCTAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGTTGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 30 45 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCACATGA Context I 3.0103 1 0 -exampleBAM.bam.bam 9 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GTCCATGA Context I 3.0103 1 0 -exampleBAM.bam.bam 31 13 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 34 59 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AAGACACA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCACCATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 1 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 13 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 16 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CGTCCATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTGGGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTCGGGTT Context I 6.0206 3 0 -exampleBAM.bam.bam 45 TTAGGGTT Context I 6.0206 3 0 -exampleBAM.bam.bam 45 TGGGGGTT Context I 3.0103 1 0 
-exampleBAM.bam.bam 45 TTTGGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 9 38 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTATCAT Context I 3.0103 1 0 -exampleBAM.bam.bam 30 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 17 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 34 25 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCATGATA Context D 3.0103 1 0 -exampleBAM.bam.bam 28 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTGATA Context D 3.0103 1 0 -exampleBAM.bam.bam 29 43 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGTTCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAGGTTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 69 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 73 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 28 41 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 31 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGATCGTG Context D 3.0103 1 0 -exampleBAM.bam.bam 29 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 12 GC Context M 3.0103 1 0 -exampleBAM.bam.bam 29 6 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GCCTCGTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 70 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 74 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 TTTGGGCT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TATCAATA Context D 3.0103 1 0 -exampleBAM.bam.bam 33 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 45 TTGGTTAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTAGAGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGCACTTT Context I 3.0103 1 0 -exampleBAM.bam.bam 4 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 18 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 10 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 27 11 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CCATGATT Context I 3.0103 1 0 -exampleBAM.bam.bam 5 TT Context M 1.7609 2 1 -exampleBAM.bam.bam 18 56 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGCTTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGTCTTTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCAATGTG Context D 3.0103 1 0 
-exampleBAM.bam.bam 12 68 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGAGCCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAGATCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 2 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 14 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GCAATCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 22 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GAGTGTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 15 AA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GGGTTAGG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 TATATCAA Context D 3.0103 1 0 -exampleBAM.bam.bam 17 62 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGATTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 32 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 44 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ATCCAGTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAGTTCTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CAATCCAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGATTCTA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCGTCCAT Context I 3.0103 1 0 -exampleBAM.bam.bam 24 GT Context M 4.7712 2 0 -exampleBAM.bam.bam 24 13 Cycle M 6.0206 3 0 -exampleBAM.bam.bam 30 34 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 29 7 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 74 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 40 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 39 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGCAATC Context D 3.0103 1 0 -exampleBAM.bam.bam 33 TT Context M 6.9897 4 0 -exampleBAM.bam.bam 30 69 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 71 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 75 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGCAAAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 32 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 TC Context M 6.0206 3 0 -exampleBAM.bam.bam 29 37 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 CA Context M 4.7712 2 0 -exampleBAM.bam.bam 45 
ATAAAGAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACTGATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAGCCTCG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCACTTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 25 14 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 6 52 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 TGATATAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTATCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTATATCA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCACTGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTGGCCTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 3 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 15 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 17 63 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 TG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGTATT Context D 3.0103 1 0 -exampleBAM.bam.bam 24 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 30 35 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 45 TATCATGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGACATGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGATCCAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 33 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 45 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GGAGATTA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATGGTATT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATCTCCAG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CGGGTTCG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTCG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 68 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 72 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGTCAATG Context D 3.0103 1 0 -exampleBAM.bam.bam 33 18 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGTTAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGGGTTCG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTTCTGT Context D 3.0103 1 0 -exampleBAM.bam.bam 4 TT Context M -0.0000 1 1 -exampleBAM.bam.bam 29 4 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 73 Cycle 
M 3.0103 1 0 -exampleBAM.bam.bam 45 AGCCTTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ACTCTTTG Context D 3.0103 1 0 -exampleBAM.bam.bam 18 58 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 ATTATTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACATGATC Context I 3.0103 1 0 -exampleBAM.bam.bam 28 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 33 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTTAGGGT Context D 6.0206 3 0 -exampleBAM.bam.bam 32 16 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 32 TG Context M 4.7712 2 0 -exampleBAM.bam.bam 45 GGCCTGAA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 12 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGATTAGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCAGCCTC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AATCCATT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTATAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CATGGTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 22 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 24 45 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 GT Context M 6.0206 3 0 -exampleBAM.bam.bam 31 34 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 20 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 34 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 46 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 ATGAGTCA Context I 3.0103 1 0 -exampleBAM.bam.bam 22 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTTTCTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGTTGGG Context I 6.0206 3 0 -exampleBAM.bam.bam 45 GGTTTGGG Context I 4.7712 2 0 -exampleBAM.bam.bam 45 TTAGATTT Context I 3.0103 1 0 -exampleBAM.bam.bam 30 32 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 19 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 TC Context M 3.0103 1 0 -exampleBAM.bam.bam 25 47 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 10 75 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 11 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 33 TC Context M 8.4510 6 0 -exampleBAM.bam.bam 45 TGATCGTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CAGGTTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CCAGTTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 
45 69 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 73 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 32 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 29 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 29 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 33 49 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATTGATA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCATGATA Context I 3.0103 1 0 -exampleBAM.bam.bam 32 TT Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TGGGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTAGGGTT Context D 6.0206 3 0 -exampleBAM.bam.bam 45 TTCGGGTT Context D 6.0206 3 0 -exampleBAM.bam.bam 45 TTGGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTATCAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CGTCCATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCACCATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAGACACA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 1 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 13 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CTGGGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 22 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 25 GG Context M 4.7712 2 0 -exampleBAM.bam.bam 8 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 34 21 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 GA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GTGTTGGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TCACATGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTCCATGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CACCATGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 35 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 47 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CTATTCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AATCTAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 25 46 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 76 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 55 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 1 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 23 18 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 66 Cycle M 3.0103 1 0 
-exampleBAM.bam.bam 45 GAGATTAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTCAGGCC Context D 3.0103 1 0 -exampleBAM.bam.bam 13 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTTAATG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGTGGAGC Context D 3.0103 1 0 -exampleBAM.bam.bam 21 TT Context M 3.0103 1 0 -exampleBAM.bam.bam 21 17 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 12 AG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 GGCCACCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GCTGGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTGGCTT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 66 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 26 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TAATCTCC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 28 34 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGGGT Context D 3.0103 1 0 -exampleBAM.bam.bam 17 58 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 6 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CCTTTGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 36 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 40 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CAGGCACC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTCTAGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TATTTGCA Context I 3.0103 1 0 -exampleBAM.bam.bam 34 TA Context M 3.0103 1 0 -exampleBAM.bam.bam 25 CC Context M 3.0103 1 0 -exampleBAM.bam.bam 22 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GAACTGGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 6 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 10 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GGGCTGGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTGATATA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTCTTAAG Context D 3.0103 1 0 -exampleBAM.bam.bam 27 GA Context M 4.7712 2 0 -exampleBAM.bam.bam 27 14 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 32 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAACCTGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TCTATTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 11 40 
Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 TTTATTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATTCT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CCTGGAGA Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GCCAGGCA Context D 3.0103 1 0 -exampleBAM.bam.bam 12 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 32 53 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 21 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 26 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TCTGTGTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTTGGGGG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGGCTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AAATCTAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 67 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 CTGGAGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGATTTTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGGCACCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 CTGAAAGT Context I 3.0103 1 0 -exampleBAM.bam.bam 8 46 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TCCAGGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTGAGTGT Context I 3.0103 1 0 -exampleBAM.bam.bam 24 CG Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTATCATG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ACAGCAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 37 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 41 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGTGCAAA Context I 3.0103 1 0 -exampleBAM.bam.bam 34 TC Context M 6.0206 3 0 -exampleBAM.bam.bam 25 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 30 AT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 TTTATATC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTACTCTT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GTATTACT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TGGTTAAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 7 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 11 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 CCTGAAAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 CTTTGCAC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GTGAACTG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TTGGCTTT Context I 3.0103 1 0 
-exampleBAM.bam.bam 28 2 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 19 30 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 GT Context M 3.0103 1 0 -exampleBAM.bam.bam 45 64 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 76 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 AGTGTTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGGGTTGG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GATTCTAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 AGACACAG Context D 3.0103 1 0 -exampleBAM.bam.bam 45 GGGGTTGG Context I 4.7712 2 0 -exampleBAM.bam.bam 15 68 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TATAAAGA Context I 3.0103 1 0 -exampleBAM.bam.bam 33 22 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 12 AA Context M 3.0103 1 0 -exampleBAM.bam.bam 32 54 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 CTCGTCCA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 38 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 42 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 TTAAGTGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGCAAT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTGCACT Context D 3.0103 1 0 -exampleBAM.bam.bam 24 CC Context M 4.7712 2 0 -exampleBAM.bam.bam 45 TGAGTCAA Context D 3.0103 1 0 -exampleBAM.bam.bam 6 TT Context M 1.7609 2 1 -exampleBAM.bam.bam 31 4 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 31 AG Context M 4.7712 2 0 -exampleBAM.bam.bam 34 50 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 73 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GACACAGC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AACCTGGA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 4 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 8 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 16 58 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 AA Context M 4.7712 2 0 -exampleBAM.bam.bam 24 41 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 TG Context M 6.0206 3 0 -exampleBAM.bam.bam 29 68 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 25 9 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 26 44 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GGTATTAC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 TGTGAACT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 
TGGCCTGA Context D 3.0103 1 0 -exampleBAM.bam.bam 5 22 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 AAGTGCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATTTGCAA Context I 3.0103 1 0 -exampleBAM.bam.bam 45 ATCTAATC Context D 3.0103 1 0 -exampleBAM.bam.bam 27 GG Context M 3.0103 1 0 -exampleBAM.bam.bam 21 48 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TGAGTGTT Context D 3.0103 1 0 -exampleBAM.bam.bam 13 39 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 TAAAGACA Context D 3.0103 1 0 -exampleBAM.bam.bam 33 23 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 45 GTGGAGCC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 TTTCACAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 65 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 GATTTTTC Context D 3.0103 1 0 -exampleBAM.bam.bam 45 AGTTCTAG Context I 3.0103 1 0 -exampleBAM.bam.bam 19 61 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 28 71 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 15 35 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 24 CA Context M 3.0103 1 0 -exampleBAM.bam.bam 24 10 Cycle M -0.0000 1 1 -exampleBAM.bam.bam 45 TTATTGAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATAACCTG Context I 3.0103 1 0 -exampleBAM.bam.bam 45 GAAAGTGC Context I 3.0103 1 0 -exampleBAM.bam.bam 45 39 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 43 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 31 AT Context M 4.7712 2 0 -exampleBAM.bam.bam 31 5 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 34 51 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 27 72 Cycle M 3.0103 1 0 -exampleBAM.bam.bam 30 AC Context M 3.0103 1 0 -exampleBAM.bam.bam 45 CATGGTAT Context D 3.0103 1 0 -exampleBAM.bam.bam 45 ATGATCGT Context I 3.0103 1 0 -exampleBAM.bam.bam 45 5 Cycle D 7.7815 5 0 -exampleBAM.bam.bam 45 9 Cycle I 7.7815 5 0 -exampleBAM.bam.bam 45 GCACCCAG Context I 3.0103 1 0 -exampleBAM.bam.bam 34 TT Context M 8.4510 6 0 -exampleBAM.bam.bam 31 39 Cycle M 4.7712 2 0 -exampleBAM.bam.bam 14 33 Cycle M 3.0103 1 0 +ReadGroup QualityScore CovariateValue CovariateName EventType EmpiricalQuality Observations Errors +exampleBAM.bam 6 AA Context M 
6.0000 1 0.00 +exampleBAM.bam 6 GA Context M 6.0000 1 0.00 +exampleBAM.bam 6 GC Context M 6.0000 2 2.00 +exampleBAM.bam 6 TG Context M 6.0000 1 0.00 +exampleBAM.bam 6 AT Context M 6.0000 1 0.00 +exampleBAM.bam 6 CT Context M 6.0000 1 0.00 +exampleBAM.bam 6 GT Context M 6.0000 2 0.00 +exampleBAM.bam 6 TT Context M 6.0000 2 1.00 +exampleBAM.bam 8 AA Context M 8.0000 1 0.00 +exampleBAM.bam 8 GA Context M 8.0000 2 0.00 +exampleBAM.bam 8 GC Context M 8.0000 1 0.00 +exampleBAM.bam 8 TG Context M 8.0000 1 1.00 +exampleBAM.bam 8 GT Context M 8.0000 2 0.00 +exampleBAM.bam 9 TG Context M 9.0000 1 0.00 +exampleBAM.bam 9 AT Context M 9.0000 1 0.00 +exampleBAM.bam 9 CT Context M 9.0000 1 0.00 +exampleBAM.bam 9 GT Context M 9.0000 1 0.00 +exampleBAM.bam 10 TT Context M 10.0000 1 0.00 +exampleBAM.bam 11 TC Context M 11.0000 1 1.00 +exampleBAM.bam 11 GT Context M 11.0000 1 0.00 +exampleBAM.bam 12 GA Context M 12.0000 1 0.00 +exampleBAM.bam 12 CC Context M 12.0000 1 0.00 +exampleBAM.bam 12 TC Context M 12.0000 1 0.00 +exampleBAM.bam 12 AG Context M 12.0000 1 0.00 +exampleBAM.bam 13 AA Context M 13.0000 1 0.00 +exampleBAM.bam 13 AG Context M 13.0000 1 0.00 +exampleBAM.bam 13 AT Context M 13.0000 1 0.00 +exampleBAM.bam 14 GA Context M 14.0000 1 0.00 +exampleBAM.bam 15 AA Context M 15.0000 2 0.00 +exampleBAM.bam 15 GA Context M 15.0000 1 0.00 +exampleBAM.bam 15 GT Context M 15.0000 2 0.00 +exampleBAM.bam 16 AA Context M 16.0000 1 0.00 +exampleBAM.bam 16 TA Context M 16.0000 4 1.00 +exampleBAM.bam 16 TC Context M 16.0000 1 0.00 +exampleBAM.bam 16 GG Context M 16.0000 1 0.00 +exampleBAM.bam 16 TG Context M 16.0000 1 0.00 +exampleBAM.bam 16 CT Context M 16.0000 1 0.00 +exampleBAM.bam 16 GT Context M 16.0000 1 0.00 +exampleBAM.bam 17 CA Context M 17.0000 1 0.00 +exampleBAM.bam 17 TA Context M 17.0000 1 0.00 +exampleBAM.bam 17 TG Context M 17.0000 1 1.00 +exampleBAM.bam 17 GT Context M 17.0000 1 0.00 +exampleBAM.bam 17 TT Context M 17.0000 2 0.00 +exampleBAM.bam 18 AC Context M 18.0000 1 
0.00 +exampleBAM.bam 18 TC Context M 18.0000 1 1.00 +exampleBAM.bam 18 AT Context M 18.0000 1 0.00 +exampleBAM.bam 18 GT Context M 18.0000 1 0.00 +exampleBAM.bam 18 TT Context M 18.0000 2 0.00 +exampleBAM.bam 19 AA Context M 19.0000 2 0.00 +exampleBAM.bam 19 CA Context M 19.0000 2 0.00 +exampleBAM.bam 19 GA Context M 19.0000 2 0.00 +exampleBAM.bam 19 TA Context M 19.0000 3 0.00 +exampleBAM.bam 19 GC Context M 19.0000 1 0.00 +exampleBAM.bam 19 GG Context M 19.0000 1 0.00 +exampleBAM.bam 19 TG Context M 19.0000 2 1.00 +exampleBAM.bam 19 GT Context M 19.0000 1 0.00 +exampleBAM.bam 19 TT Context M 19.0000 1 0.00 +exampleBAM.bam 20 AA Context M 20.0000 1 0.00 +exampleBAM.bam 20 CA Context M 20.0000 1 0.00 +exampleBAM.bam 20 CC Context M 20.0000 1 0.00 +exampleBAM.bam 20 TG Context M 20.0000 1 0.00 +exampleBAM.bam 20 TT Context M 20.0000 1 1.00 +exampleBAM.bam 21 CA Context M 21.0000 1 0.00 +exampleBAM.bam 21 GA Context M 21.0000 1 0.00 +exampleBAM.bam 21 TA Context M 21.0000 1 0.00 +exampleBAM.bam 21 CC Context M 21.0000 1 0.00 +exampleBAM.bam 21 TC Context M 21.0000 2 0.00 +exampleBAM.bam 21 AG Context M 21.0000 1 0.00 +exampleBAM.bam 21 GG Context M 21.0000 4 0.00 +exampleBAM.bam 21 TG Context M 21.0000 1 0.00 +exampleBAM.bam 21 AT Context M 21.0000 2 0.00 +exampleBAM.bam 21 CT Context M 21.0000 1 0.00 +exampleBAM.bam 21 GT Context M 21.0000 2 0.00 +exampleBAM.bam 22 CA Context M 22.0000 1 0.00 +exampleBAM.bam 22 GA Context M 22.0000 3 0.00 +exampleBAM.bam 22 TA Context M 22.0000 1 0.00 +exampleBAM.bam 22 GC Context M 22.0000 1 0.00 +exampleBAM.bam 22 GG Context M 22.0000 1 0.00 +exampleBAM.bam 22 TG Context M 22.0000 1 0.00 +exampleBAM.bam 22 TT Context M 22.0000 1 0.00 +exampleBAM.bam 23 AA Context M 23.0000 1 0.00 +exampleBAM.bam 23 CA Context M 23.0000 1 0.00 +exampleBAM.bam 23 TA Context M 23.0000 1 0.00 +exampleBAM.bam 23 CC Context M 23.0000 2 0.00 +exampleBAM.bam 23 GC Context M 23.0000 1 0.00 +exampleBAM.bam 23 TC Context M 23.0000 2 0.00 +exampleBAM.bam 23 
GG Context M 23.0000 1 0.00 +exampleBAM.bam 23 TG Context M 23.0000 3 0.00 +exampleBAM.bam 23 AT Context M 23.0000 1 0.00 +exampleBAM.bam 23 TT Context M 23.0000 2 0.00 +exampleBAM.bam 24 CA Context M 24.0000 3 0.00 +exampleBAM.bam 24 GA Context M 24.0000 2 0.00 +exampleBAM.bam 24 TA Context M 24.0000 2 1.00 +exampleBAM.bam 24 GC Context M 24.0000 1 0.00 +exampleBAM.bam 24 AG Context M 24.0000 2 0.00 +exampleBAM.bam 24 CG Context M 24.0000 1 0.00 +exampleBAM.bam 24 GG Context M 24.0000 3 0.00 +exampleBAM.bam 24 AT Context M 24.0000 1 0.00 +exampleBAM.bam 24 CT Context M 24.0000 1 0.00 +exampleBAM.bam 24 GT Context M 24.0000 1 0.00 +exampleBAM.bam 24 TT Context M 24.0000 3 0.00 +exampleBAM.bam 25 AA Context M 25.0000 2 0.00 +exampleBAM.bam 25 CA Context M 25.0000 1 0.00 +exampleBAM.bam 25 GG Context M 25.0000 2 0.00 +exampleBAM.bam 25 TG Context M 25.0000 2 0.00 +exampleBAM.bam 25 AT Context M 25.0000 2 0.00 +exampleBAM.bam 25 GT Context M 25.0000 1 0.00 +exampleBAM.bam 25 TT Context M 25.0000 5 0.00 +exampleBAM.bam 26 TA Context M 26.0000 1 0.00 +exampleBAM.bam 26 GG Context M 26.0000 1 0.00 +exampleBAM.bam 26 TG Context M 26.0000 1 0.00 +exampleBAM.bam 26 AT Context M 26.0000 1 0.00 +exampleBAM.bam 26 GT Context M 26.0000 1 0.00 +exampleBAM.bam 26 TT Context M 26.0000 1 0.00 +exampleBAM.bam 27 AA Context M 27.0000 2 0.00 +exampleBAM.bam 27 CA Context M 27.0000 1 0.00 +exampleBAM.bam 27 TA Context M 27.0000 1 0.00 +exampleBAM.bam 27 TC Context M 27.0000 2 0.00 +exampleBAM.bam 27 AG Context M 27.0000 3 0.00 +exampleBAM.bam 27 GG Context M 27.0000 3 0.00 +exampleBAM.bam 27 TG Context M 27.0000 2 0.00 +exampleBAM.bam 27 AT Context M 27.0000 4 0.00 +exampleBAM.bam 27 CT Context M 27.0000 2 0.00 +exampleBAM.bam 27 TT Context M 27.0000 2 0.00 +exampleBAM.bam 28 AA Context M 28.0000 1 0.00 +exampleBAM.bam 28 CA Context M 28.0000 1 0.00 +exampleBAM.bam 28 TA Context M 28.0000 2 0.00 +exampleBAM.bam 28 AG Context M 28.0000 3 0.00 +exampleBAM.bam 28 GG Context M 28.0000 3 
0.00 +exampleBAM.bam 28 TG Context M 28.0000 1 0.00 +exampleBAM.bam 28 GT Context M 28.0000 4 0.00 +exampleBAM.bam 29 CA Context M 29.0000 1 0.00 +exampleBAM.bam 29 TA Context M 29.0000 1 0.00 +exampleBAM.bam 29 AC Context M 29.0000 1 0.00 +exampleBAM.bam 29 CC Context M 29.0000 1 0.00 +exampleBAM.bam 29 GC Context M 29.0000 1 0.00 +exampleBAM.bam 29 AG Context M 29.0000 3 0.00 +exampleBAM.bam 29 CG Context M 29.0000 1 0.00 +exampleBAM.bam 29 GG Context M 29.0000 4 0.00 +exampleBAM.bam 29 TG Context M 29.0000 1 0.00 +exampleBAM.bam 29 AT Context M 29.0000 1 0.00 +exampleBAM.bam 29 GT Context M 29.0000 1 0.00 +exampleBAM.bam 29 TT Context M 29.0000 4 0.00 +exampleBAM.bam 30 AA Context M 30.0000 2 0.00 +exampleBAM.bam 30 CA Context M 30.0000 1 0.00 +exampleBAM.bam 30 AC Context M 30.0000 4 0.00 +exampleBAM.bam 30 CC Context M 30.0000 1 0.00 +exampleBAM.bam 30 TC Context M 30.0000 2 0.00 +exampleBAM.bam 30 AG Context M 30.0000 3 0.00 +exampleBAM.bam 30 GG Context M 30.0000 1 0.00 +exampleBAM.bam 30 TG Context M 30.0000 1 0.00 +exampleBAM.bam 30 AT Context M 30.0000 2 0.00 +exampleBAM.bam 30 CT Context M 30.0000 2 0.00 +exampleBAM.bam 30 GT Context M 30.0000 1 0.00 +exampleBAM.bam 31 CA Context M 31.0000 1 0.00 +exampleBAM.bam 31 GA Context M 31.0000 1 0.00 +exampleBAM.bam 31 CC Context M 31.0000 2 0.00 +exampleBAM.bam 31 GC Context M 31.0000 2 0.00 +exampleBAM.bam 31 AG Context M 31.0000 2 0.00 +exampleBAM.bam 31 GG Context M 31.0000 6 0.00 +exampleBAM.bam 31 TG Context M 31.0000 2 0.00 +exampleBAM.bam 31 AT Context M 31.0000 2 0.00 +exampleBAM.bam 31 CT Context M 31.0000 2 0.00 +exampleBAM.bam 31 GT Context M 31.0000 1 0.00 +exampleBAM.bam 31 TT Context M 31.0000 3 0.00 +exampleBAM.bam 32 CA Context M 32.0000 2 0.00 +exampleBAM.bam 32 TA Context M 32.0000 1 0.00 +exampleBAM.bam 32 AC Context M 32.0000 1 0.00 +exampleBAM.bam 32 CC Context M 32.0000 1 0.00 +exampleBAM.bam 32 TC Context M 32.0000 1 0.00 +exampleBAM.bam 32 AG Context M 32.0000 1 0.00 +exampleBAM.bam 32 
CG Context M 32.0000 1 0.00 +exampleBAM.bam 32 GG Context M 32.0000 8 0.00 +exampleBAM.bam 32 TG Context M 32.0000 3 0.00 +exampleBAM.bam 32 AT Context M 32.0000 4 0.00 +exampleBAM.bam 32 CT Context M 32.0000 2 0.00 +exampleBAM.bam 32 TT Context M 32.0000 6 0.00 +exampleBAM.bam 33 CA Context M 33.0000 2 0.00 +exampleBAM.bam 33 GA Context M 33.0000 2 0.00 +exampleBAM.bam 33 TA Context M 33.0000 1 0.00 +exampleBAM.bam 33 AC Context M 33.0000 1 0.00 +exampleBAM.bam 33 CC Context M 33.0000 2 0.00 +exampleBAM.bam 33 GC Context M 33.0000 1 0.00 +exampleBAM.bam 33 TC Context M 33.0000 3 0.00 +exampleBAM.bam 33 AG Context M 33.0000 2 0.00 +exampleBAM.bam 33 CG Context M 33.0000 1 0.00 +exampleBAM.bam 33 GG Context M 33.0000 2 0.00 +exampleBAM.bam 33 TG Context M 33.0000 5 0.00 +exampleBAM.bam 33 AT Context M 33.0000 2 0.00 +exampleBAM.bam 33 CT Context M 33.0000 4 0.00 +exampleBAM.bam 33 GT Context M 33.0000 1 0.00 +exampleBAM.bam 33 TT Context M 33.0000 4 0.00 +exampleBAM.bam 34 AA Context M 34.0000 1 0.00 +exampleBAM.bam 34 CA Context M 34.0000 3 0.00 +exampleBAM.bam 34 GA Context M 34.0000 1 0.00 +exampleBAM.bam 34 TA Context M 34.0000 2 0.00 +exampleBAM.bam 34 AC Context M 34.0000 1 0.00 +exampleBAM.bam 34 CC Context M 34.0000 1 0.00 +exampleBAM.bam 34 GC Context M 34.0000 2 0.00 +exampleBAM.bam 34 TC Context M 34.0000 6 0.00 +exampleBAM.bam 34 AG Context M 34.0000 1 0.00 +exampleBAM.bam 34 CG Context M 34.0000 1 0.00 +exampleBAM.bam 34 GG Context M 34.0000 1 0.00 +exampleBAM.bam 34 TG Context M 34.0000 4 0.00 +exampleBAM.bam 34 AT Context M 34.0000 4 0.00 +exampleBAM.bam 34 CT Context M 34.0000 2 0.00 +exampleBAM.bam 34 GT Context M 34.0000 1 0.00 +exampleBAM.bam 34 TT Context M 34.0000 5 0.00 +exampleBAM.bam 45 AAA Context I 45.0000 5 0.00 +exampleBAM.bam 45 AAA Context D 45.0000 5 0.00 +exampleBAM.bam 45 CAA Context I 45.0000 5 0.00 +exampleBAM.bam 45 CAA Context D 45.0000 5 0.00 +exampleBAM.bam 45 GAA Context I 45.0000 2 0.00 +exampleBAM.bam 45 GAA Context D 
45.0000 2 0.00 +exampleBAM.bam 45 TAA Context I 45.0000 6 0.00 +exampleBAM.bam 45 TAA Context D 45.0000 6 0.00 +exampleBAM.bam 45 ACA Context I 45.0000 4 0.00 +exampleBAM.bam 45 ACA Context D 45.0000 4 0.00 +exampleBAM.bam 45 CCA Context I 45.0000 8 0.00 +exampleBAM.bam 45 CCA Context D 45.0000 8 0.00 +exampleBAM.bam 45 GCA Context I 45.0000 5 0.00 +exampleBAM.bam 45 GCA Context D 45.0000 5 0.00 +exampleBAM.bam 45 TCA Context I 45.0000 6 0.00 +exampleBAM.bam 45 TCA Context D 45.0000 6 0.00 +exampleBAM.bam 45 AGA Context I 45.0000 5 0.00 +exampleBAM.bam 45 AGA Context D 45.0000 5 0.00 +exampleBAM.bam 45 GGA Context I 45.0000 3 0.00 +exampleBAM.bam 45 GGA Context D 45.0000 3 0.00 +exampleBAM.bam 45 TGA Context I 45.0000 10 0.00 +exampleBAM.bam 45 TGA Context D 45.0000 10 0.00 +exampleBAM.bam 45 ATA Context I 45.0000 6 0.00 +exampleBAM.bam 45 ATA Context D 45.0000 6 0.00 +exampleBAM.bam 45 CTA Context I 45.0000 3 0.00 +exampleBAM.bam 45 CTA Context D 45.0000 3 0.00 +exampleBAM.bam 45 GTA Context I 45.0000 2 0.00 +exampleBAM.bam 45 GTA Context D 45.0000 2 0.00 +exampleBAM.bam 45 TTA Context I 45.0000 11 0.00 +exampleBAM.bam 45 TTA Context D 45.0000 11 0.00 +exampleBAM.bam 45 CAC Context I 45.0000 6 0.00 +exampleBAM.bam 45 CAC Context D 45.0000 6 0.00 +exampleBAM.bam 45 GAC Context I 45.0000 2 0.00 +exampleBAM.bam 45 GAC Context D 45.0000 2 0.00 +exampleBAM.bam 45 TAC Context I 45.0000 1 0.00 +exampleBAM.bam 45 TAC Context D 45.0000 1 0.00 +exampleBAM.bam 45 ACC Context I 45.0000 3 0.00 +exampleBAM.bam 45 ACC Context D 45.0000 3 0.00 +exampleBAM.bam 45 CCC Context I 45.0000 1 0.00 +exampleBAM.bam 45 CCC Context D 45.0000 1 0.00 +exampleBAM.bam 45 GCC Context I 45.0000 5 0.00 +exampleBAM.bam 45 GCC Context D 45.0000 5 0.00 +exampleBAM.bam 45 TCC Context I 45.0000 4 0.00 +exampleBAM.bam 45 TCC Context D 45.0000 4 0.00 +exampleBAM.bam 45 AGC Context I 45.0000 3 0.00 +exampleBAM.bam 45 AGC Context D 45.0000 3 0.00 +exampleBAM.bam 45 GGC Context I 45.0000 6 0.00 
+exampleBAM.bam 45 GGC Context D 45.0000 6 0.00 +exampleBAM.bam 45 TGC Context I 45.0000 4 0.00 +exampleBAM.bam 45 TGC Context D 45.0000 4 0.00 +exampleBAM.bam 45 ATC Context I 45.0000 7 0.00 +exampleBAM.bam 45 ATC Context D 45.0000 7 0.00 +exampleBAM.bam 45 CTC Context I 45.0000 3 0.00 +exampleBAM.bam 45 CTC Context D 45.0000 3 0.00 +exampleBAM.bam 45 GTC Context I 45.0000 3 0.00 +exampleBAM.bam 45 GTC Context D 45.0000 3 0.00 +exampleBAM.bam 45 TTC Context I 45.0000 9 0.00 +exampleBAM.bam 45 TTC Context D 45.0000 9 0.00 +exampleBAM.bam 45 AAG Context I 45.0000 4 0.00 +exampleBAM.bam 45 AAG Context D 45.0000 4 0.00 +exampleBAM.bam 45 CAG Context I 45.0000 7 0.00 +exampleBAM.bam 45 CAG Context D 45.0000 7 0.00 +exampleBAM.bam 45 GAG Context I 45.0000 6 0.00 +exampleBAM.bam 45 GAG Context D 45.0000 6 0.00 +exampleBAM.bam 45 TAG Context I 45.0000 5 0.00 +exampleBAM.bam 45 TAG Context D 45.0000 5 0.00 +exampleBAM.bam 45 TCG Context I 45.0000 5 0.00 +exampleBAM.bam 45 TCG Context D 45.0000 5 0.00 +exampleBAM.bam 45 AGG Context I 45.0000 7 0.00 +exampleBAM.bam 45 AGG Context D 45.0000 7 0.00 +exampleBAM.bam 45 CGG Context I 45.0000 3 0.00 +exampleBAM.bam 45 CGG Context D 45.0000 3 0.00 +exampleBAM.bam 45 GGG Context I 45.0000 16 0.00 +exampleBAM.bam 45 GGG Context D 45.0000 16 0.00 +exampleBAM.bam 45 TGG Context I 45.0000 16 0.00 +exampleBAM.bam 45 TGG Context D 45.0000 16 0.00 +exampleBAM.bam 45 ATG Context I 45.0000 8 0.00 +exampleBAM.bam 45 ATG Context D 45.0000 8 0.00 +exampleBAM.bam 45 CTG Context I 45.0000 6 0.00 +exampleBAM.bam 45 CTG Context D 45.0000 6 0.00 +exampleBAM.bam 45 GTG Context I 45.0000 8 0.00 +exampleBAM.bam 45 GTG Context D 45.0000 8 0.00 +exampleBAM.bam 45 TTG Context I 45.0000 11 0.00 +exampleBAM.bam 45 TTG Context D 45.0000 11 0.00 +exampleBAM.bam 45 AAT Context I 45.0000 7 0.00 +exampleBAM.bam 45 AAT Context D 45.0000 7 0.00 +exampleBAM.bam 45 CAT Context I 45.0000 6 0.00 +exampleBAM.bam 45 CAT Context D 45.0000 6 0.00 +exampleBAM.bam 45 GAT 
Context I 45.0000 8 0.00 +exampleBAM.bam 45 GAT Context D 45.0000 8 0.00 +exampleBAM.bam 45 TAT Context I 45.0000 9 0.00 +exampleBAM.bam 45 TAT Context D 45.0000 9 0.00 +exampleBAM.bam 45 ACT Context I 45.0000 4 0.00 +exampleBAM.bam 45 ACT Context D 45.0000 4 0.00 +exampleBAM.bam 45 CCT Context I 45.0000 4 0.00 +exampleBAM.bam 45 CCT Context D 45.0000 4 0.00 +exampleBAM.bam 45 GCT Context I 45.0000 2 0.00 +exampleBAM.bam 45 GCT Context D 45.0000 2 0.00 +exampleBAM.bam 45 TCT Context I 45.0000 8 0.00 +exampleBAM.bam 45 TCT Context D 45.0000 8 0.00 +exampleBAM.bam 45 AGT Context I 45.0000 5 0.00 +exampleBAM.bam 45 AGT Context D 45.0000 5 0.00 +exampleBAM.bam 45 CGT Context I 45.0000 2 0.00 +exampleBAM.bam 45 CGT Context D 45.0000 2 0.00 +exampleBAM.bam 45 GGT Context I 45.0000 13 0.00 +exampleBAM.bam 45 GGT Context D 45.0000 13 0.00 +exampleBAM.bam 45 TGT Context I 45.0000 5 0.00 +exampleBAM.bam 45 TGT Context D 45.0000 5 0.00 +exampleBAM.bam 45 ATT Context I 45.0000 9 0.00 +exampleBAM.bam 45 ATT Context D 45.0000 9 0.00 +exampleBAM.bam 45 CTT Context I 45.0000 7 0.00 +exampleBAM.bam 45 CTT Context D 45.0000 7 0.00 +exampleBAM.bam 45 GTT Context I 45.0000 17 0.00 +exampleBAM.bam 45 GTT Context D 45.0000 17 0.00 +exampleBAM.bam 45 TTT Context I 45.0000 12 0.00 +exampleBAM.bam 45 TTT Context D 45.0000 12 0.00 +exampleBAM.bam 6 -4 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 31 Cycle M 6.0000 1 1.00 +exampleBAM.bam 6 36 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 -52 Cycle M 6.0000 1 1.00 +exampleBAM.bam 6 -62 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 63 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 -63 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 -65 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 67 Cycle M 6.0000 1 0.00 +exampleBAM.bam 6 -68 Cycle M 6.0000 1 1.00 +exampleBAM.bam 6 75 Cycle M 6.0000 1 0.00 +exampleBAM.bam 8 17 Cycle M 8.0000 1 0.00 +exampleBAM.bam 8 46 Cycle M 8.0000 1 0.00 +exampleBAM.bam 8 57 Cycle M 8.0000 1 1.00 +exampleBAM.bam 8 58 Cycle M 8.0000 1 0.00 +exampleBAM.bam 8 60 
Cycle M 8.0000 1 0.00 +exampleBAM.bam 8 63 Cycle M 8.0000 1 0.00 +exampleBAM.bam 8 71 Cycle M 8.0000 1 0.00 +exampleBAM.bam 9 -16 Cycle M 9.0000 1 0.00 +exampleBAM.bam 9 38 Cycle M 9.0000 1 0.00 +exampleBAM.bam 9 52 Cycle M 9.0000 1 0.00 +exampleBAM.bam 9 69 Cycle M 9.0000 1 0.00 +exampleBAM.bam 10 -75 Cycle M 10.0000 1 0.00 +exampleBAM.bam 11 -20 Cycle M 11.0000 1 0.00 +exampleBAM.bam 11 -40 Cycle M 11.0000 1 1.00 +exampleBAM.bam 12 25 Cycle M 12.0000 1 0.00 +exampleBAM.bam 12 40 Cycle M 12.0000 1 0.00 +exampleBAM.bam 12 62 Cycle M 12.0000 1 0.00 +exampleBAM.bam 12 68 Cycle M 12.0000 1 0.00 +exampleBAM.bam 13 39 Cycle M 13.0000 1 0.00 +exampleBAM.bam 13 55 Cycle M 13.0000 1 0.00 +exampleBAM.bam 13 75 Cycle M 13.0000 1 0.00 +exampleBAM.bam 14 -33 Cycle M 14.0000 1 0.00 +exampleBAM.bam 15 -8 Cycle M 15.0000 1 0.00 +exampleBAM.bam 15 -14 Cycle M 15.0000 1 0.00 +exampleBAM.bam 15 35 Cycle M 15.0000 1 0.00 +exampleBAM.bam 15 68 Cycle M 15.0000 1 0.00 +exampleBAM.bam 15 74 Cycle M 15.0000 1 0.00 +exampleBAM.bam 16 7 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 19 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 -34 Cycle M 16.0000 1 1.00 +exampleBAM.bam 16 -47 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 51 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 -55 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 -58 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 -65 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 70 Cycle M 16.0000 1 0.00 +exampleBAM.bam 16 73 Cycle M 16.0000 1 0.00 +exampleBAM.bam 17 -4 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 -20 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 58 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 62 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 -63 Cycle M 17.0000 1 0.00 +exampleBAM.bam 17 -76 Cycle M 17.0000 1 1.00 +exampleBAM.bam 18 -1 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 10 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 -19 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 22 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 36 Cycle M 18.0000 1 0.00 +exampleBAM.bam 18 -56 Cycle M 18.0000 1 
0.00 +exampleBAM.bam 18 -58 Cycle M 18.0000 1 1.00 +exampleBAM.bam 19 5 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 -7 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 10 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 21 Cycle M 19.0000 2 0.00 +exampleBAM.bam 19 -30 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 32 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 33 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 49 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 54 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 61 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 65 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 -70 Cycle M 19.0000 1 1.00 +exampleBAM.bam 19 -71 Cycle M 19.0000 1 0.00 +exampleBAM.bam 19 72 Cycle M 19.0000 1 0.00 +exampleBAM.bam 20 9 Cycle M 20.0000 1 0.00 +exampleBAM.bam 20 -28 Cycle M 20.0000 1 1.00 +exampleBAM.bam 20 -29 Cycle M 20.0000 1 0.00 +exampleBAM.bam 20 -57 Cycle M 20.0000 1 0.00 +exampleBAM.bam 20 69 Cycle M 20.0000 1 0.00 +exampleBAM.bam 21 -3 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 11 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 17 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 29 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -42 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -44 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 48 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -50 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 59 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -60 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 -61 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 64 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 66 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 67 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 71 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 73 Cycle M 21.0000 1 0.00 +exampleBAM.bam 21 74 Cycle M 21.0000 1 0.00 +exampleBAM.bam 22 -9 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 -15 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 -23 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 38 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 44 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 -44 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 51 Cycle M 22.0000 1 0.00 
+exampleBAM.bam 22 -59 Cycle M 22.0000 1 0.00 +exampleBAM.bam 22 -70 Cycle M 22.0000 1 0.00 +exampleBAM.bam 23 -12 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -15 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 18 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 19 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -35 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 37 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -38 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 56 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 59 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 61 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 64 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -64 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 66 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -67 Cycle M 23.0000 1 0.00 +exampleBAM.bam 23 -75 Cycle M 23.0000 1 0.00 +exampleBAM.bam 24 3 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 5 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 6 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -6 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -10 Cycle M 24.0000 1 1.00 +exampleBAM.bam 24 13 Cycle M 24.0000 2 0.00 +exampleBAM.bam 24 -13 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -25 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 27 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 33 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 41 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 45 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -48 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -49 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 50 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 52 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 53 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 56 Cycle M 24.0000 1 0.00 +exampleBAM.bam 24 -62 Cycle M 24.0000 1 0.00 +exampleBAM.bam 25 -9 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 14 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -21 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -24 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 31 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -32 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -36 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 37 Cycle M 25.0000 1 0.00 
+exampleBAM.bam 25 46 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 47 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -51 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -52 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 55 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -73 Cycle M 25.0000 1 0.00 +exampleBAM.bam 25 -74 Cycle M 25.0000 1 0.00 +exampleBAM.bam 26 -3 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 7 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 20 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 -44 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 50 Cycle M 26.0000 1 0.00 +exampleBAM.bam 26 -67 Cycle M 26.0000 1 0.00 +exampleBAM.bam 27 11 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 14 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 16 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -17 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -18 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 22 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -27 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 28 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 30 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -31 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 40 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 53 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -53 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -55 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -56 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 57 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 65 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -66 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -69 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -72 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 -73 Cycle M 27.0000 1 0.00 +exampleBAM.bam 27 76 Cycle M 27.0000 1 0.00 +exampleBAM.bam 28 -2 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -11 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 25 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -27 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 30 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 34 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 39 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -41 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 47 Cycle M 28.0000 1 0.00 
+exampleBAM.bam 28 48 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -50 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -53 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 54 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -61 Cycle M 28.0000 1 0.00 +exampleBAM.bam 28 -71 Cycle M 28.0000 1 0.00 +exampleBAM.bam 29 4 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -5 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 6 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -7 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -8 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 9 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 12 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -24 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 27 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -28 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -37 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 42 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -43 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -45 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -47 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -48 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -54 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -60 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -68 Cycle M 29.0000 1 0.00 +exampleBAM.bam 29 -76 Cycle M 29.0000 1 0.00 +exampleBAM.bam 30 -9 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 12 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -17 Cycle M 30.0000 2 0.00 +exampleBAM.bam 30 18 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 20 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -21 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 23 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 24 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 26 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -30 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 32 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 34 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 35 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -35 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -42 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -45 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -52 Cycle M 30.0000 1 0.00 +exampleBAM.bam 30 -54 Cycle M 30.0000 1 0.00 
+exampleBAM.bam 30 -69 Cycle M 30.0000 1 0.00 +exampleBAM.bam 31 -1 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 4 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -5 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -6 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 8 Cycle M 31.0000 2 0.00 +exampleBAM.bam 31 -10 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -11 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -12 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -13 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 15 Cycle M 31.0000 2 0.00 +exampleBAM.bam 31 -16 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -19 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -25 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 26 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -26 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -32 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -34 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -39 Cycle M 31.0000 2 0.00 +exampleBAM.bam 31 43 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -45 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 60 Cycle M 31.0000 1 0.00 +exampleBAM.bam 31 -66 Cycle M 31.0000 1 0.00 +exampleBAM.bam 32 1 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 2 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -2 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -14 Cycle M 32.0000 2 0.00 +exampleBAM.bam 32 -15 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 16 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -16 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -18 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -19 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 23 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 24 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 28 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -28 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -30 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -31 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -33 Cycle M 32.0000 2 0.00 +exampleBAM.bam 32 -36 Cycle M 32.0000 2 0.00 +exampleBAM.bam 32 41 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -41 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 43 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -43 Cycle M 32.0000 2 
0.00 +exampleBAM.bam 32 -46 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -49 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -51 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -53 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -54 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -57 Cycle M 32.0000 1 0.00 +exampleBAM.bam 32 -72 Cycle M 32.0000 1 0.00 +exampleBAM.bam 33 1 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -1 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 2 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 3 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -7 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -8 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -10 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -12 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -18 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -22 Cycle M 33.0000 2 0.00 +exampleBAM.bam 33 -23 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -24 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -29 Cycle M 33.0000 2 0.00 +exampleBAM.bam 33 -31 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -32 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -35 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -37 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -39 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -40 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 42 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 44 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 45 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -46 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -48 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -49 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -57 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -59 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -66 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -67 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -69 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -72 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -73 Cycle M 33.0000 1 0.00 +exampleBAM.bam 33 -76 Cycle M 33.0000 1 0.00 +exampleBAM.bam 34 -2 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -3 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -4 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -5 Cycle M 
34.0000 1 0.00 +exampleBAM.bam 34 -6 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -11 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -13 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -20 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -21 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -23 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -25 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -26 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -27 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -34 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -38 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -40 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -41 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -42 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -47 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -50 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -51 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -55 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -56 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -58 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -59 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -60 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -61 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -62 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -63 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -64 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -65 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -68 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -70 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -71 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -74 Cycle M 34.0000 1 0.00 +exampleBAM.bam 34 -75 Cycle M 34.0000 1 0.00 +exampleBAM.bam 45 5 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 5 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -5 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -5 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 6 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 6 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -6 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -6 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 7 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 7 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -7 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -7 
Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 8 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 8 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -8 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -8 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 9 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 9 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -9 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -9 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 10 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 10 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -10 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -10 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 11 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 11 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -11 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -11 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 12 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 12 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -12 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -12 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 13 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 13 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -13 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -13 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 14 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 14 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -14 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -14 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 15 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 15 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -15 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -15 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 16 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 16 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -16 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -16 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 17 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 17 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -17 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -17 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 18 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 18 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -18 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -18 Cycle 
D 45.0000 3 0.00 +exampleBAM.bam 45 19 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 19 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -19 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -19 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 20 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 20 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -20 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -20 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 21 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 21 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -21 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -21 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 22 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 22 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -22 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -22 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 23 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 23 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -23 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -23 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 24 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 24 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -24 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -24 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 25 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 25 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -25 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -25 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 26 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 26 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -26 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -26 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 27 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 27 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -27 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -27 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 28 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 28 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -28 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -28 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 29 Cycle I 45.0000 1 0.00 +exampleBAM.bam 45 29 Cycle D 45.0000 1 0.00 +exampleBAM.bam 45 -29 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -29 
Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 30 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 30 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -30 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -30 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 31 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 31 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -31 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -31 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 32 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 32 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -32 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -32 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 33 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 33 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -33 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -33 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 34 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 34 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -34 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -34 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 35 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 35 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -35 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -35 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 36 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 36 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -36 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -36 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 37 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 37 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -37 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -37 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 38 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 38 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -38 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -38 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 39 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 39 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -39 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -39 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 40 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 40 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -40 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 
-40 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 41 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 41 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -41 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -41 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 42 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 42 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -42 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -42 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 43 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 43 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -43 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -43 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 44 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 44 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -44 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -44 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 45 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 45 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -45 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -45 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 46 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 46 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -46 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -46 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 47 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 47 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -47 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -47 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 48 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 48 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -48 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -48 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 49 Cycle I 45.0000 1 0.00 +exampleBAM.bam 45 49 Cycle D 45.0000 1 0.00 +exampleBAM.bam 45 -49 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -49 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 50 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 50 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -50 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -50 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 51 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 51 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -51 Cycle I 45.0000 3 0.00 +exampleBAM.bam 
45 -51 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 52 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 52 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -52 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -52 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 53 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 53 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -53 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -53 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 54 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 54 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -54 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -54 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 55 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 55 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -55 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -55 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 56 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 56 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -56 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -56 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 57 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 57 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -57 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -57 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 58 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 58 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -58 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -58 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 59 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 59 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -59 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -59 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 60 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 60 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -60 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -60 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 61 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 61 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -61 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -61 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 62 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 62 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -62 Cycle I 45.0000 3 0.00 
+exampleBAM.bam 45 -62 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 63 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 63 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -63 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -63 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 64 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 64 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -64 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 -64 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 65 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 65 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -65 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -65 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 66 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 66 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -66 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -66 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 67 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 67 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -67 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -67 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 68 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 68 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -68 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -68 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 69 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 69 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -69 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -69 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 70 Cycle I 45.0000 1 0.00 +exampleBAM.bam 45 70 Cycle D 45.0000 1 0.00 +exampleBAM.bam 45 -70 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -70 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 71 Cycle I 45.0000 2 0.00 +exampleBAM.bam 45 71 Cycle D 45.0000 2 0.00 +exampleBAM.bam 45 -71 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -71 Cycle D 45.0000 3 0.00 +exampleBAM.bam 45 72 Cycle I 45.0000 1 0.00 +exampleBAM.bam 45 72 Cycle D 45.0000 1 0.00 +exampleBAM.bam 45 -72 Cycle I 45.0000 3 0.00 +exampleBAM.bam 45 -72 Cycle D 45.0000 3 0.00 From a971e7ab6d3ae0e196fcc188328777bf5e80c734 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 8 Apr 2013 18:49:05 -0400 
Subject: [PATCH 180/226] Several improvements to ReadAdaptorTrimmer so that it can be incorporated into ancient DNA processing pipelines (for which it was developed): -- Add pair cleaning feature. Reads in query-name sorted order are required and pairs need to appear consecutively, but if -cleanPairs option is set, a malformed pair where second read is missing is just skipped instead of erroring out. -- Add integration tests -- Move walker to public --- .../walkers/readutils/ReadAdaptorTrimmer.java | 392 ++++++++++++++++++ .../ReadAdaptorTrimmerIntegrationTest.java | 60 +++ 2 files changed, 452 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java new file mode 100644 index 000000000..43a1ddd74 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java @@ -0,0 +1,392 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.readutils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMFileWriter; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +/** + * Utility tool to blindly strip base adaptors. Main application is for FASTQ/unaligned BAM pre-processing where libraries + * have very short inserts, and hence a substantial part of the sequencing data will have adaptor sequence present. + *

        + * By design, tool will only work for Illumina-like library constructs, where the typical library architecture is: + * [Adaptor 1]-[Genomic Insert]-[Adaptor 2 (index/barcode)] + *

        + * It is assumed that when data is paired, one read will span the forward strand and one read will span the reverse strand. + * Hence, when specifying adaptors they should be specified as both forward and reverse-complement to make sure they're removed in all cases. + * By design, as well, "circular" constructions where a read can have an insert, then adaptor, then more genomic insert, are not supported. + * When an adaptor is detected, all bases downstream from it (i.e. in the 3' direction) will be removed. + * Adaptor detection is carried out by looking for overlaps between forward and reverse reads in a pair. + * If a sufficiently high overlap is found, the insert size is computed and if insert size < read lengths adaptor bases are removed from reads. + * + * Advantages over ReadClipper: + * - No previous knowledge of adaptors or library structure is necessary + * + * Advantages over 3rd party tools like SeqPrep: + * - Can do BAM streaming instead of having to convert to fastq + * - No need to merge reads - merging reads can have some advantages, but complicates downstream processing and loses information that can be used, + * e.g. in variant calling + *

        + * + *

        Input

        + *

        + * The input read data in BAM format. Read data MUST be in query name ordering as produced, for example with Picard's FastqToBam + * + *

        Output

        + *

        + * A merged BAM file with unaligned reads + *

        + * + *

        Examples

        + *
        + * java -Xmx4g -jar GenomeAnalysisTK.jar \
        + *   -T ReadAdaptorTrimmer \
        + *   -I my_reads.bam \
        + *   -R resources/Homo_sapiens_assembly18.fasta \
        + *   -o trimmed_Reads.bam
        + * 
        + */ + +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) +@PartitionBy(PartitionType.READ) +public class ReadAdaptorTrimmer extends ReadWalker, SAMFileWriter> implements NanoSchedulable { + @Output(doc="Write output to this BAM filename instead of STDOUT", required = false) + SAMFileWriter out; + + /** + * Only prints the first n reads of the file - for short testing + */ + @Hidden + @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) + int nReadsToPrint = -1; + + /** + * Argument to control strictness of match between forward and reverse reads - by default, we require 15 matches between them to declare + * an overlap. + */ + @Advanced + @Argument(fullName = "minMatches", shortName = "minMatches", doc="Minimum number of substring matches to detect pair overlaps", required = false) + int minMatchesForOverlap = 15; + + + /** + * If true, this argument will make the walker discard unpaired reads instead of erroring out. + */ + @Advanced + @Argument(fullName = "removeUnpairedReads", shortName = "removeUnpairedReads", doc="Remove unpaired reads instead of erroring out", required = false) + boolean cleanUnpairedReads = false; + + /** + * private class members + */ + private GATKSAMRecord firstReadInPair; + private TrimStats trimStats = new TrimStats(); + + static class TrimStats { + long numReadsProcessed; + long numReadsWithAdaptorTrimmed; + long numUnpairedReadsFound; + } + + /** + * The reads filter function. + * + * @param ref the reference bases that correspond to our read, if a reference was provided + * @param read the read itself, as a GATKSAMRecord + * @return true if the read passes the filter, false if it doesn't + */ + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { + // check if we've reached the output limit + if ( nReadsToPrint == 0 ) { + return false; // n == 0 means we've printed all we needed. 
+ } + else if (nReadsToPrint > 0) { + nReadsToPrint--; // n > 0 means there are still reads to be printed. + } + return true; + } + /** + * reduceInit is called once before any calls to the map function. We use it here to setup the output + * bam file, if it was specified on the command line + * + * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise + */ + public SAMFileWriter reduceInit() { + return out; + } + + public List map( final ReferenceContext ref, final GATKSAMRecord readIn, final RefMetaDataTracker metaDataTracker ) { + + + final List readsToEmit = new ArrayList(); + + + // cache first read in pair if flag set. + if (readIn.getFirstOfPairFlag()) { + firstReadInPair = GATKSAMRecord.emptyRead(readIn); + firstReadInPair.setReadString(readIn.getReadString()); + firstReadInPair.setReadName(readIn.getReadName()); + firstReadInPair.setBaseQualities(readIn.getBaseQualities()); + } + else { + if (!readIn.getReadName().equals(firstReadInPair.getReadName())) { + if (cleanUnpairedReads) { + trimStats.numUnpairedReadsFound++; + return readsToEmit; + } + else // by default require that reads be completely paired + throw new IllegalStateException("Second read in pair must follow first read in pair: data not ordered?"); + } + + final int oldLength1 = firstReadInPair.getReadLength(); + final int oldLength2 = readIn.getReadLength(); + // try to strip any adaptor sequence in read pair + final Integer result = trimReads(firstReadInPair, readIn, minMatchesForOverlap, logger); + + if (logger.isDebugEnabled()) { + if (result == null) + logger.debug("No overlap found, insert size cannot be computed"); + else + logger.debug("Insert size estimate = " + result); + + } + + + readsToEmit.add(firstReadInPair); + readsToEmit.add(readIn); + + if (oldLength1 != firstReadInPair.getReadLength()) + trimStats.numReadsWithAdaptorTrimmed++; + if (oldLength2 != readIn.getReadLength()) + trimStats.numReadsWithAdaptorTrimmed++; + + } + + + 
trimStats.numReadsProcessed++; + return readsToEmit; + + } + + /** + * given a read and a output location, reduce by emitting the read + * + * @param readsToEmit the read itself + * @param output the output source + * @return the SAMFileWriter, so that the next reduce can emit to the same source + */ + public SAMFileWriter reduce( final List readsToEmit, final SAMFileWriter output ) { + for (final GATKSAMRecord read : readsToEmit) + output.addAlignment(read); + + return output; + } + + @Override + public void onTraversalDone(SAMFileWriter output) { + + logger.info("Finished Trimming:"); + logger.info("Number of processed reads: "+ trimStats.numReadsProcessed); + logger.info("Number of reads with adaptor sequence trimmed: "+ trimStats.numReadsWithAdaptorTrimmed); + if (cleanUnpairedReads) + logger.info("Number of unpaired reads thrown out: "+ trimStats.numUnpairedReadsFound); + } + + + /** + * + * Workhorse routines... + * + */ + /** + * Core routine that does most underlying work for walker. Takes two reads and looks for overlaps in them. + * An overlap is defined as a contiguous chunk of N bases that matches reverse-complement between reads. + * Currently, the only insert structure that it will look for overlaps is as follows: + * CASE 1: Insert shorter than read length: + * 3' XXXXXXXXXXXXXXXX 5' (second read) + * 5' YYYYYYYYYYYYYYYY 3' (first read) + * *********** + * + * In this case, if X and Y are complements at the 11 positions marked by *, routine will do the following + * iff minMatchesForOverlap <= 11: + * a) Cleave adaptor from end of second read (leftmost dangling part in diagram above) + * b) Cleave adaptor from end of first read (rightmost part in diagram). 
+ * + * CASE 2: Insert size >= read length: + * 3' XXXXXXXXXXXXXXXX 5' (second read) + * 5' YYYYYYYYYYYYYYYY 3' (first read) + * ********* (overlap) + * + * In this case, no trimming is done and reads are left unchanged + * @param first (I/O) First read in pair - read contents (bases/quals) can be modified if adaptor is detected + * @param second (I/O) Second read in pair - read contents (bases/quals) can be modified if adaptor is detected + * @param minMatchesForOverlap Reads need to match in these # of bases to be joined + * @return Offset between second and first read. + * If there's no detectable offset, return Null + */ + @Requires({"first != null","second != null","minMatchesForOverlap>0"}) + protected static Integer trimReads(final GATKSAMRecord first, + final GATKSAMRecord second, + final int minMatchesForOverlap, + final Logger logger) { + + final Integer insertSize = estimateInsertSize(first.getReadBases(), second.getReadBases(), + minMatchesForOverlap, logger); + + if (insertSize == null) + return insertSize; + if (insertSize < first.getReadLength()) { + // trim adaptor sequence from read + first.setReadBases(Arrays.copyOfRange(first.getReadBases(),0,insertSize)); + first.setBaseQualities(Arrays.copyOfRange(first.getBaseQualities(),0,insertSize)); + } + if (insertSize < second.getReadLength()) { + // trim adaptor sequence from read + second.setReadBases(Arrays.copyOfRange(second.getReadBases(),0,insertSize)); + second.setBaseQualities(Arrays.copyOfRange(second.getBaseQualities(),0,insertSize)); + } + return insertSize; + } + + /** + * Brain-dead implementation of an aligner of two sequences, where it's assumed that there might be an overlap + * from the first into the second. From this, an estimate of insert size is performed and returned + * Assumes that reads come in reverse direction, so one of the base sequences needs to be reverse-complemented.] 
+ * + * @param firstRead Bytes from first read + * @param secondRead Bytes from second read (reverse direction) + * @return Estimated insert size based on offset between first and second read. + * If no overlap can be detected, return null + */ + + @Requires({"firstRead != null","secondRead != null","minMatches>0","firstRead.length == secondRead.length"}) + protected static Integer estimateInsertSize(final byte[] firstRead, + final byte[] secondRead, + final int minMatches, + final Logger logger) { + final byte[] firstBases = firstRead; + final byte[] secondBases = BaseUtils.simpleReverseComplement(secondRead); + + final Pair overlaps = findOverlappingSequence(firstBases, secondBases); + final int bestOffset = overlaps.first; + final int maxScore = overlaps.second; + if ( logger.isDebugEnabled()) { + String sb="", s1 = new String(firstBases), s2 = new String(secondBases); + for (int k=0; k < Math.abs(bestOffset); k++) sb+=" "; + if (maxScore >= minMatches) { + logger.debug(String.format("Match, Max Score = %d, best offset = %d\n",maxScore, bestOffset)); + if (bestOffset>0) + s2 = sb+s2; + else + s1 = sb+s1; + } + else logger.debug("NoMatch:"); + logger.debug("R1:"+s1); + logger.debug("R2:"+s2); + + + } + + if (maxScore < minMatches) + return null; // no overlap detected + + return bestOffset+secondRead.length; + + + } + + + /** + * Tries to find overlapping sequence between two reads, and computes offset between them + * For each possible offset, computes matching score, which is = MATCH_SCORE*Num_matches + MISMATCH_SCORE*num_mismatches + * (like SW with infinite gap penalties). + * @param first First read bytes + * @param second Second read bytes + * @return Pair of integers (x,y). 
x = best offset between reads, y = corresponding score + */ + @Requires({"first != null","second != null"}) + @Ensures("result != null") + protected static Pair findOverlappingSequence(final byte[] first, + final byte[] second) { + final int MATCH_SCORE = 1; + final int MISMATCH_SCORE = -1; + // try every possible offset - O(N^2) algorithm + + // In case of following structure, + // 111111111 + // 222222222 + // computed offset will be negative (=-5 in this case). + // If however, + // 111111111 + // 222222222 + // then offset will be positive (=3 in this case) + int maxScore = 0, bestOffset =0; + for (int offset = -second.length; offset < first.length; offset++) { + int score = 0; + // compute start index for each array + int ind1 = (offset<0)?0:offset; + int ind2 = (offset<0)?-offset:0; + for (int k=0; k < Math.min(first.length, second.length) ; k++) { + if (ind1 >= first.length) + break; + if (ind2 >= second.length ) + break; + if (first[ind1] != 'N' && second[ind2] != 'N') { + if (first[ind1] == second[ind2]) + score += MATCH_SCORE; + else + score += MISMATCH_SCORE; + } + ind1++; + ind2++; + } + if (score > maxScore) { + maxScore = score; + bestOffset = offset; + } + } + return new Pair(bestOffset,maxScore); + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java new file mode 100644 index 000000000..3b184ae3b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, 
sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.readutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; + +/** + * Created with IntelliJ IDEA. + * User: delangel + * Date: 4/13/13 + * Time: 7:28 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ReadAdaptorTrimmerIntegrationTest extends WalkerTest { + private String getBaseCommand(final String BAM) { + return "-T ReadAdaptorTrimmer -R " + b37KGReference + + " -I " + privateTestDir + BAM + + " -o %s"; + } + + @Test + public void testBasicTrimmer() { + WalkerTestSpec spec = new WalkerTestSpec( getBaseCommand("shortInsertTest.bam"), 1, Arrays.asList("1d42414e12b45d44e6f396d97d0f60fe")); + executeTest(String.format("testBasicTrimmer"), spec); + } + + @Test + public void testSkippingBadPairs() { + WalkerTestSpec spec = new WalkerTestSpec( getBaseCommand("shortInsertTest2.bam")+" -removeUnpairedReads", 1, Arrays.asList("5e796345502fbfc31134f7736ce68868")); + executeTest(String.format("testSkippingBadPairs"), spec); + } + +} From 3144eae51c19ac19f336318dfe354b43b8ecc795 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 13 Apr 2013 14:53:30 -0400 Subject: [PATCH 181/226] UnifiedGenotyper bugfix: don't create haplotypes with 0 bases -- The PairHMM no longer allows us to create haplotypes with 0 bases. The UG indel caller used to create such haplotypes. Now we assign -Double.MAX_VALUE likelihoods to such haplotypes. 
-- Add integration test to cover this case, along with private/testdata BAM -- [Fixes #47523579] --- .../indels/PairHMMIndelErrorModel.java | 24 ++++++++++--------- ...dGenotyperIndelCallingIntegrationTest.java | 11 +++++++++ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index a1ce5afdb..5702fff2a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -388,20 +388,22 @@ public class PairHMMIndelErrorModel { System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), - (int)indStart, (int)indStop); + final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - if (firstHap) { - //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(readBases.length, haplotypeBases.length); - firstHap = false; + // it's possible that the indel starts at the last base of the haplotypes + if ( haplotypeBases.length == 0 ) { + readLikelihood = -Double.MAX_VALUE; + } else { + if (firstHap) { + //no need to reallocate arrays for each new haplotype, as length won't change + pairHMM.initialize(readBases.length, haplotypeBases.length); + firstHap = false; + } + + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, + baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, 
firstHap); } - - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap); - - if (DEBUG) { System.out.println("H:"+new String(haplotypeBases)); System.out.println("R:"+new String(readBases)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 6b26be0d0..ece92f50f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -51,6 +51,7 @@ import org.testng.annotations.Test; import java.io.File; import java.util.Arrays; +import java.util.Collections; import java.util.List; public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { @@ -194,4 +195,14 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); executeTest("test minIndelFraction 1.0", spec); } + + // No testing of MD5 here, we previously blew up due to a 0 length haplotypes, so we just need to pass + @Test + public void testHaplotype0Length() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null", + 0, + Collections.emptyList()); + executeTest("testHaplotype0Length", spec); + } } From 9bfa5eb70fb3a7911583e883706841bdd0052c14 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sun, 14 Apr 2013 23:25:33 -0400 Subject: [PATCH 182/226] Quick optimization to the PairHMM Problem -------- the logless HMM scale factor 
(to avoid double under-flows) was 10^300. Although this serves the purpose this value results in a complex mantissa that further complicates cpu calculations. Solution --------- initialize with 2^1020 (2^1023 is the max value), and adjust the scale factor accordingly. --- .../broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index b62d7a334..ab2a5bb2a 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -56,8 +56,8 @@ import org.broadinstitute.sting.utils.QualityUtils; * Date: 10/16/12 */ public final class LoglessPairHMM extends PairHMM { - protected static final double SCALE_FACTOR_LOG10 = 300.0; - protected static final double INITIAL_CONDITION = Math.pow(10, SCALE_FACTOR_LOG10); + protected static final double INITIAL_CONDITION = Math.pow(2, 1020); + protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); private static final int matchToMatch = 0; private static final int indelToMatch = 1; @@ -118,7 +118,7 @@ public final class LoglessPairHMM extends PairHMM { for (int j = 1; j < paddedHaplotypeLength; j++) { finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j]; } - return Math.log10(finalSumProbabilities) - SCALE_FACTOR_LOG10; + return Math.log10(finalSumProbabilities) - INITIAL_CONDITION_LOG10; } /** From 564fe36d22ee19beebae4f79799c97c8428f2566 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 5 Apr 2013 09:28:46 -0400 Subject: [PATCH 183/226] VariantRecalibrator's VQSR.vcf now contains NEG/POS labels -- It's useful to know which sites have been used in the training of the model. 
The recal_file emitted by VR now contains VCF info field annotations labeling each site that was used in the positive or negative training models with POSITIVE_TRAINING_SITE and/or NEGATIVE_TRAINING_SITE -- Update MD5s, which all changed now that the recal file and the resulting applied vcfs all have these pos / neg labels --- .../ApplyRecalibration.java | 6 ++++++ .../VariantDataManager.java | 18 ++++++++---------- .../VariantRecalibrator.java | 2 ++ ...antRecalibrationWalkersIntegrationTest.java | 18 +++++++++--------- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 7de0c7e60..e15b99824 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -200,6 +200,8 @@ public class ApplyRecalibration extends RodWalker implements T hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); + hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants")); + hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants")); } 
//--------------------------------------------------------------------------------------------------------------- @@ -243,6 +245,10 @@ public class ApplyRecalibration extends RodWalker implements T // Annotate the new record with its VQSLOD and the worst performing annotation builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); + if ( recalDatum.hasAttribute(VariantRecalibrator.POSITIVE_LABEL_KEY)) + builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); + if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY)) + builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); for( int i = tranches.size() - 1; i >= 0; i-- ) { final Tranche tranche = tranches.get(i); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 3f6b6ed09..a830a801e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -335,19 +335,17 @@ public class VariantDataManager { }} ); // create dummy alleles to be used - final List alleles = new ArrayList(2); - alleles.add(Allele.create("N", true)); - alleles.add(Allele.create("", false)); - - // to be used for the important INFO tags - final HashMap attributes = new HashMap(3); + final List alleles = Arrays.asList(Allele.create("N", true), Allele.create("", false)); for( final VariantDatum datum : data ) { - attributes.put(VCFConstants.END_KEY, datum.loc.getStop()); - attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); - attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? 
annotationKeys.get(datum.worstAnnotation) : "NULL")); + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles); + builder.attribute(VCFConstants.END_KEY, datum.loc.getStop()); + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); + builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); + + if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); + if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); - VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes); recalWriter.add(builder.make()); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 320328ab1..b0970cce2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -135,6 +135,8 @@ public class VariantRecalibrator extends RodWalker Date: Thu, 11 Apr 2013 10:52:59 -0400 Subject: [PATCH 184/226] Improvements to the VariantRecalibrator R plots -- VariantRecalibrator now emits plots with denormalized values (original values) instead of their normalized (x - mu / sigma) which helps to understand the distribution of values that are good and bad --- .../VariantDataManager.java | 62 ++++++++++++------- .../VariantRecalibrator.java | 10 ++- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index a830a801e..40032a886 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -127,30 +127,46 @@ public class VariantDataManager { } } - public void addTrainingSet( final TrainingSet trainingSet ) { - trainingSets.add( trainingSet ); - } + /** + * Convert a normalized point to it's original annotation value + * + * norm = (orig - mu) / sigma + * orig = norm * sigma + mu + * + * @param normalizedValue the normalized value of the ith annotation + * @param annI the index of the annotation value + * @return the denormalized value for the annotation + */ + public double denormalizeDatum(final double normalizedValue, final int annI) { + final double mu = meanVector[annI]; + final double sigma = varianceVector[annI]; + return normalizedValue * sigma + mu; + } - public boolean checkHasTrainingSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isTraining ) { return true; } - } - return false; - } + public void addTrainingSet( final TrainingSet trainingSet ) { + trainingSets.add( trainingSet ); + } - public boolean checkHasTruthSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isTruth ) { return true; } - } - return false; - } + public boolean checkHasTrainingSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTraining ) { return true; } + } + return false; + } - public boolean checkHasKnownSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isKnown ) { return true; } - } - return false; - } + public boolean checkHasTruthSet() { + for( final TrainingSet trainingSet : 
trainingSets ) { + if( trainingSet.isTruth ) { return true; } + } + return false; + } + + public boolean checkHasKnownSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isKnown ) { return true; } + } + return false; + } public ExpandingArrayList getTrainingData() { final ExpandingArrayList trainingData = new ExpandingArrayList(); @@ -260,7 +276,7 @@ public class VariantDataManager { value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); if( Double.isInfinite(value) ) { value = Double.NaN; } if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM - value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); + value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } @@ -297,7 +313,7 @@ public class VariantDataManager { private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && checkVariationClass( evalVC, trainVC ) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); } protected static boolean checkVariationClass( final VariantContext evalVC, final VariantContext trainVC ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index b0970cce2..bee695e2a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -435,14 +435,20 @@ public class VariantRecalibrator extends RodWalker Date: Tue, 16 Apr 2013 09:30:06 -0400 Subject: [PATCH 185/226] Select the haplotypes we move forward for genotyping per sample, not pooled -- The previous algorithm would compute the likelihood of each haplotype pooled across samples. This has a tendency to select "consensus" haplotypes that are reasonably good across all samples, while missing the true haplotypes that each sample likes. The new algorithm computes instead the most likely pair of haplotypes among all haplotypes for each sample independently, contributing 1 vote to each haplotype it selects. After all N samples have been run, we sort the haplotypes by their counts, and take 2 * nSample + 1 haplotypes or maxHaplotypesInPopulation, whichever is smaller. -- After discussing with Mauricio our view is that the algorithmic complexity of this approach is no worse than the previous approach, so it should be equivalently fast. -- One potential improvement is to use not hard counts for the haplotypes, but this would radically complicate the current algorithm so it wasn't selected. -- For an example of a specific problem caused by this, see https://jira.broadinstitute.org/browse/GSA-871. -- Remove old pooled likelihood model. 
It's worse than the current version in both single and multiple samples: 1000G EUR samples: 10Kb per sample: 7.17 minutes pooled: 7.36 minutes Name VariantType TRUE_POSITIVE FALSE_POSITIVE FALSE_NEGATIVE TRUE_NEGATIVE CALLED_NOT_IN_DB_AT_ALL per_sample SNPS 50 0 5 8 1 per_sample INDELS 6 0 7 2 1 pooled SNPS 49 0 6 8 1 pooled INDELS 5 0 8 2 1 100 kb per sample: 140.00 minutes pooled: 145.27 minutes Name VariantType TRUE_POSITIVE FALSE_POSITIVE FALSE_NEGATIVE TRUE_NEGATIVE CALLED_NOT_IN_DB_AT_ALL per_sample SNPS 144 0 22 28 1 per_sample INDELS 28 1 16 9 11 pooled SNPS 143 0 23 28 1 pooled INDELS 27 1 17 9 11 java -Xmx2g -jar dist/GenomeAnalysisTK.jar -T HaplotypeCaller -I private/testdata/AFR.structural.indels.bam -L 20:8187565-8187800 -L 20:18670537-18670730 -R ~/Desktop/broadLocal/localData/human_g1k_v37.fasta -o /dev/null -debug haplotypes from samples: 8 seconds haplotypes from pools: 8 seconds java -Xmx2g -jar dist/GenomeAnalysisTK.jar -T HaplotypeCaller -I /Users/depristo/Desktop/broadLocal/localData/phaseIII.4x.100kb.bam -L 20:10,000,000-10,001,000 -R ~/Desktop/broadLocal/localData/human_g1k_v37.fasta -o /dev/null -debug haplotypes from samples: 173.32 seconds haplotypes from pools: 167.12 seconds --- .../haplotypecaller/HaplotypeCaller.java | 11 +- .../LikelihoodCalculationEngine.java | 147 +++++++++++++----- ...lexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 14 +- .../genotyper/MostLikelyAlleleUnitTest.java | 11 +- .../PerReadAlleleLikelihoodMapUnitTest.java | 53 ++++++- .../utils/genotyper/MostLikelyAllele.java | 9 +- .../genotyper/PerReadAlleleLikelihoodMap.java | 60 ++++++- 8 files changed, 252 insertions(+), 55 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 55490a1cb..cd3847b81 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -737,7 +737,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() ); - if ( DEBUG ) logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); + if ( DEBUG ) { + logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); + for ( final Haplotype remaining: trimmedHaplotypes ) { + logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar()); + } + } + // trim down the reads and add them to the trimmed active region final List trimmedReads = new ArrayList(originalActiveRegion.getReads().size()); @@ -761,11 +767,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * @return the list of haplotypes to genotype */ protected List selectBestHaplotypesForGenotyping(final List haplotypes, final Map stratifiedReadMap) { - // TODO -- skip this calculation if the list of haplotypes is of size 2 (as we'll always use 2 for genotyping) if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return haplotypes; } else { - return likelihoodCalculationEngine.selectBestHaplotypesFromPooledLikelihoods(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); + return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 1fb873e81..8697833a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -49,12 +49,14 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator; import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -253,54 +255,127 @@ public class LikelihoodCalculationEngine { return likelihoodMatrix; } - @Requires({"haplotypes.size() > 0"}) - @Ensures({"result.size() <= haplotypes.size()"}) - public List selectBestHaplotypesFromPooledLikelihoods(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { + // -------------------------------------------------------------------------------- + // + // System to compute the best N haplotypes for genotyping + // + // -------------------------------------------------------------------------------- - final int numHaplotypes = haplotypes.size(); - final Set sampleKeySet = stratifiedReadMap.keySet(); - final List bestHaplotypesIndexList = new ArrayList(); - 
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - final List haplotypesAsAlleles = new ArrayList(); - for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h, true)); } + /** + * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele + * @param map an annoying map object that moves us between the allele and haplotype representation + * @param haplotypeAsAllele the allele version of the haplotype + * @return the haplotype version, with its score incremented by 1 if its non-reference + */ + private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { + final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic + if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value + return h; + } - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles, true ); // all samples pooled together + /** + * Take the best N haplotypes and return them as a list + * + * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample + * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing + * order of score (so higher score haplotypes are preferred). 
The N we take is determined by + * + * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) + * + * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is + * bounded by maxNumHaplotypesInPopulation as that number can grow without bound + * + * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 + * @param nSamples the number of samples used to select the haplotypes + * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples + * @return a list of N or fewer haplotypes, with the reference haplotype first + */ + private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { + final List selectedHaplotypesList = new ArrayList(selectedHaplotypes); + Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); + final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; + final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); + final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? 
selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); + if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); + return bestHaplotypes; + } - int hap1 = 0; - int hap2 = 0; - //double bestElement = Double.NEGATIVE_INFINITY; - final int maxChosenHaplotypes = Math.min( maxNumHaplotypesInPopulation, sampleKeySet.size() * 2 + 1 ); - while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) { - double maxElement = Double.NEGATIVE_INFINITY; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) { - maxElement = haplotypeLikelihoodMatrix[iii][jjj]; - hap1 = iii; - hap2 = jjj; - } - } - } - if( maxElement == Double.NEGATIVE_INFINITY ) { break; } - if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } - haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY; + /** + * Select the best haplotypes for genotyping the samples in stratifiedReadMap + * + * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely + * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for + * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get + * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation + * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the + * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. + * + * @param haplotypes a list of all haplotypes we're considering + * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype + * @param maxNumHaplotypesInPopulation the max. 
number of haplotypes we can select from haplotypes + * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation + */ + public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { + if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); - if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); } - if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); } + if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes + + // all of the haplotypes that at least one sample called as one of the most likely + final Set selectedHaplotypes = new HashSet(); + selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected + + // our annoying map from allele -> haplotype + final Map allele2Haplotype = new HashMap(); + for ( final Haplotype h : haplotypes ) { + h.setScore(h.isReference() ? 
Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes + allele2Haplotype.put(Allele.create(h, h.isReference()), h); } - if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } + // for each sample, compute the most likely pair of haplotypes + for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { + // get the two most likely haplotypes under a diploid model for this sample + final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); - final List bestHaplotypes = new ArrayList(); - for( final int hIndex : bestHaplotypesIndexList ) { - bestHaplotypes.add( haplotypes.get(hIndex) ); + if ( mla != null ) { // there was something to evaluate in this sample + // note that there must be at least 2 haplotypes + final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); + final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); + +// if ( DEBUG ) { +// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +// } + + // add these two haplotypes to the set of haplotypes that have been selected + selectedHaplotypes.add(best); + selectedHaplotypes.add(second); + + // we've already selected all of our haplotypes, and we don't need to prune them down + if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) + break; + } + } + + // take the best N haplotypes forward, in order of the number of samples that choose them + final int nSamples = stratifiedReadMap.size(); + final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); + + if ( DEBUG ) { + logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); + for ( final 
Haplotype h : bestHaplotypes ) { + logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); + } } return bestHaplotypes; } - public static int findReferenceIndex( final List haplotypes ) { + /** + * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found + * @param haplotypes non-null list of haplotypes + * @return the reference haplotype + */ + private static Haplotype findReferenceHaplotype( final List haplotypes ) { for( final Haplotype h : haplotypes ) { - if( h.isReference() ) { return haplotypes.indexOf(h); } + if( h.isReference() ) return h; } throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index f09711094..92cf45652 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d9c176fe6de26bb8b289d55a840d7b8b"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2a5ae267cc061b0f9148280c8f1e236"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 9cd225df3..a308b4893 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "c8598545d1c76b470a7784e6b5c2ad4a"); + HCTest(CEUTRIO_BAM, "", "3d5e59b3a74da999ceb967cef73086b4"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "0b2ca4482e92b9606be904cc25ba0988"); + HCTest(NA12878_BAM, "", "4afef5e9cb99c0292e471d0c95243c1a"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "1eab0eb7a184d981b021a249c3bd0401"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4581b4c2b55b2290a4b1092d2da5e642"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "6ab938dede6838c983f84225d4103852"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "b20aaef138ef21a5031c06434f17f685"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o 
%s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("9bea757f6aa75e43585e26b246fd8897")); executeTest("HCTestStructuralIndels: ", spec); } @@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8a62597f2c005f373efbe398ab51a2f1")); + Arrays.asList("a360a9e14c4e9c9b7435ca5d9dfadb8d")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("a913849c7ebdefb23ef9fa5ec05960fd")); + Arrays.asList("8adfa8a27a312760dab50787da595c57")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java index cf077392b..08d82281e 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java @@ -53,12 +53,14 @@ import org.testng.annotations.Test; public class MostLikelyAlleleUnitTest extends BaseTest { final Allele a = 
Allele.create("A"); + final Allele b = Allele.create("C"); @Test public void testBasicCreation() { final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1; - MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second); Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getSecondMostLikelyAllele(), b); Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second); @@ -73,7 +75,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest { @Test public void testNotDefaultInformative() { final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2); - MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second); Assert.assertEquals(mla.isInformative(), false); Assert.assertEquals(mla.isInformative(10), false); Assert.assertEquals(mla.isInformative(0), true); @@ -84,8 +86,9 @@ public class MostLikelyAlleleUnitTest extends BaseTest { @Test public void testCreationNoGoodSecond() { - MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, Double.NEGATIVE_INFINITY); + MostLikelyAllele mla = new MostLikelyAllele(a, null, -1.0, Double.NEGATIVE_INFINITY); Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getSecondMostLikelyAllele(), null); Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); @@ -99,7 +102,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest { @Test public void testCreationNoAllele() { - MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY); + MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, null, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY); 
Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL); Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY); Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index c50849a54..6ca49d3e5 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -47,11 +47,9 @@ package org.broadinstitute.sting.utils.genotyper; import net.sf.samtools.*; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -298,4 +296,55 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest { Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood); Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood); } + + @DataProvider(name = "MostLikelyAlleleData") + public Object[][] makeMostLikelyAlleleData() { + List tests = new ArrayList(); + + final Allele a = Allele.create("A"); + final Allele c = Allele.create("C"); + final Allele g = Allele.create("G"); + + tests.add(new Object[]{Arrays.asList(a), Arrays.asList(Arrays.asList(0.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(0.0, -1.0)), a, a}); + tests.add(new 
Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(-1.0, 0.0)), c, c}); + tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c, g), + Arrays.asList( + Arrays.asList(0.0, -10.0, -10.0), + Arrays.asList(-100.0, 0.0, -10.0)), + c, a}); + tests.add(new Object[]{Arrays.asList(a, c, g), + Arrays.asList( + Arrays.asList(0.0, -10.0, -10.0), + Arrays.asList(-20.0, 0.0, -100.0)), + c, a}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MostLikelyAlleleData") + public void testMostLikelyAllele(final List alleles, final List> perReadlikelihoods, final Allele best, final Allele second) { + final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap(); + + for ( int readI = 0; readI < perReadlikelihoods.size(); readI++ ) { + final List likelihoods = perReadlikelihoods.get(readI); + + final byte[] bases = Utils.dupBytes((byte)'A', 10); + final byte[] quals = Utils.dupBytes((byte) 30, 10); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "10M"); + read.setReadName("readName" + readI); + + for ( int i = 0; i < alleles.size(); i++ ) { + final Allele allele = alleles.get(i); + final double likelihood = likelihoods.get(i); + map.add(read, allele, likelihood); + } + } + + final MostLikelyAllele mla = map.getMostLikelyDiploidAlleles(); + Assert.assertEquals(mla.getMostLikelyAllele(), best); + Assert.assertEquals(mla.getSecondMostLikelyAllele(), second); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java index e12fb546c..03a2b8077 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java @@ -50,6 +50,7 @@ public final class MostLikelyAllele { public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2; final Allele mostLikely; + final Allele secondLikely; final double log10LikelihoodOfMostLikely; final double log10LikelihoodOfSecondBest; @@ -60,10 +61,11 @@ public final class MostLikelyAllele { * mostLikely should be a NO_CALL allele. * * @param mostLikely the most likely allele + * @param secondMostLikely the most likely allele after mostLikely * @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele * @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available) */ - public MostLikelyAllele(Allele mostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { + public MostLikelyAllele(Allele mostLikely, Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null"); if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! 
MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) ) throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely); @@ -73,6 +75,7 @@ public final class MostLikelyAllele { throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest); this.mostLikely = mostLikely; + this.secondLikely = secondMostLikely; this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely; this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest; } @@ -81,6 +84,10 @@ public final class MostLikelyAllele { return mostLikely; } + public Allele getSecondMostLikelyAllele() { + return secondLikely; + } + public double getLog10LikelihoodOfMostLikely() { return log10LikelihoodOfMostLikely; } diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 47be30871..8134b1257 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -27,10 +27,13 @@ package org.broadinstitute.sting.utils.genotyper; import com.google.java.contract.Ensures; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import java.io.PrintStream; @@ -41,6 +44,7 @@ import java.util.*; * For each read, this holds underlying alleles 
represented by an aligned read, and corresponding relative likelihood. */ public class PerReadAlleleLikelihoodMap { + private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class); protected List alleles; protected Map> likelihoodReadMap; @@ -187,6 +191,57 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.get(p.getRead()); } + /** + * Get the most likely alleles estimated across all reads in this object + * + * Takes the most likely two alleles according to their diploid genotype likelihoods. That is, for + * each allele i and j we compute p(D | i,j) where D is the read likelihoods. We track the maximum + * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood. + * + * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele + * doesn't have a meaningful get best likelihood. + * + * @return a MostLikelyAllele object, or null if this map is empty + */ + public MostLikelyAllele getMostLikelyDiploidAlleles() { + if ( isEmpty() ) return null; + + int hap1 = 0; + int hap2 = 0; + double maxElement = Double.NEGATIVE_INFINITY; + for( int iii = 0; iii < alleles.size(); iii++ ) { + final Allele iii_allele = alleles.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Allele jjj_allele = alleles.get(jjj); + + double haplotypeLikelihood = 0.0; + for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + final GATKSAMRecord read = entry.getKey(); + final int count = ReadUtils.getMeanRepresentativeReadCount(read); + final double likelihood_iii = entry.getValue().get(iii_allele); + final double likelihood_jjj = entry.getValue().get(jjj_allele); + haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + LOG_ONE_HALF); + + // fast exit. 
If this diploid pair is already worse than the max, just stop and look at the next pair + if ( haplotypeLikelihood < maxElement ) break; + } + + // keep track of the max element and associated indices + if ( haplotypeLikelihood > maxElement ) { + hap1 = iii; + hap2 = jjj; + maxElement = haplotypeLikelihood; + } + } + } + + if ( maxElement == Double.NEGATIVE_INFINITY ) + throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong"); + + return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement); + } + private static final double LOG_ONE_HALF = -Math.log10(2.0); /** * Given a map from alleles to likelihoods, find the allele with the largest likelihood. @@ -213,6 +268,7 @@ public class PerReadAlleleLikelihoodMap { double maxLike = Double.NEGATIVE_INFINITY; double prevMaxLike = Double.NEGATIVE_INFINITY; Allele mostLikelyAllele = Allele.NO_CALL; + Allele secondMostLikely = null; for (final Map.Entry el : alleleMap.entrySet()) { if ( onlyConsiderTheseAlleles != null && ! 
onlyConsiderTheseAlleles.contains(el.getKey()) ) @@ -221,13 +277,15 @@ public class PerReadAlleleLikelihoodMap { if (el.getValue() > maxLike) { prevMaxLike = maxLike; maxLike = el.getValue(); + secondMostLikely = mostLikelyAllele; mostLikelyAllele = el.getKey(); } else if( el.getValue() > prevMaxLike ) { + secondMostLikely = el.getKey(); prevMaxLike = el.getValue(); } } - return new MostLikelyAllele(mostLikelyAllele, maxLike, prevMaxLike); + return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike); } /** From 17982bcbf86dc4aac940cbe1f2f96c9ef40eb669 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 16 Apr 2013 11:45:45 -0400 Subject: [PATCH 186/226] Update MD5s for VQSR header change --- ...antRecalibrationWalkersIntegrationTest.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 7fddc63c8..e7a3f23a4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -73,8 +73,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", "4d08c8eee61dd1bdea8c5765f34e41f0", // tranches - "83756d1058ee3c816edf643148ae20df", // recal file - "06353a59fa4857135b5a63ea0791b035"); // cut VCF + "ca7de32b6143cce58aa4bc59b311feb7", // recal file + "cc7f413ba50b3d12f11f95aaa31e67d1"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -122,8 +122,8 @@ public class 
VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", "6a1eef4d02857dbb117a15420b5c0ce9", // tranches - "ea85f0293e9c016bd1bbe3c2977905d8", // recal file - "4cab4a11130e2f84bd5fe4f9981811bd"); // cut VCF + "db9faaee11ee5427a81ddee328245f8c", // recal file + "42e0fcd8e048a5f6abc41a4d1c3e97a5"); // cut VCF @DataProvider(name = "VRBCFTest") public Object[][] createVRBCFTest() { @@ -174,14 +174,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest indelUnfiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . "b7589cd098dc153ec64c02dcff2838e4", // tranches - "6091d44e5c750620c6d5493864eeb160", // recal file - "ef4c7931f134c1c860864772d69dd89c"); // cut VCF + "5a9ba210a3c68109289a71039a04509d", // recal file + "d816bd43c844069d65711a7975707437"); // cut VCF VRTest indelFiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS "b7589cd098dc153ec64c02dcff2838e4", // tranches - "6091d44e5c750620c6d5493864eeb160", // recal file - "f8decee61f409b6041856c5a20e3865d"); // cut VCF + "5a9ba210a3c68109289a71039a04509d", // recal file + "6bcb344511c727c28523825f73c7daee"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createTestVariantRecalibratorIndel() { @@ -239,7 +239,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -o %s" + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", - Arrays.asList("8d2e886523c050e0ea2952cbbde4cc26")); + Arrays.asList("20c23643a78c5b95abd1526fdab8960d")); executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); } } From e0dfe5ca149b702e25b4146e5128c90644ff5a07 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 16 Apr 2013 09:37:04 -0400 Subject: [PATCH 
187/226] Restore the read filter function in the HaplotypeCaller. --- .../walkers/haplotypecaller/HaplotypeCaller.java | 11 ++++++----- ...omplexAndSymbolicVariantsIntegrationTest.java | 6 +++--- .../HaplotypeCallerIntegrationTest.java | 16 ++++++++-------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index cd3847b81..2ecc152df 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -565,17 +565,18 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // abort early if something is out of the acceptable range if( assemblyResult.haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! - if( assemblyResult.regionForGenotyping.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! if (dontGenotype) return 1; // user requested we not proceed - // evaluate each sample's reads against all haplotypes - //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); - final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) ); - // filter out reads from genotyping which fail mapping quality based criteria final List filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); + if( assemblyResult.regionForGenotyping.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! 
+ + // evaluate each sample's reads against all haplotypes + //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); + final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) ); + // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) final List bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 92cf45652..f8580f271 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2a5ae267cc061b0f9148280c8f1e236"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "c0379d32c8c743d84c6da5956d67c004"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "125e93deeb3b390a14d9b777aa2a220f"); + "2fb56d241baca3658af5811e680bde4c"); } @Test public void 
testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "6957fd0e8a5bc66d2572a6ca8626fa7a"); + "bd7d24e87776f939b36742c1fd33b25c"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a308b4893..a77304e57 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "3d5e59b3a74da999ceb967cef73086b4"); + HCTest(CEUTRIO_BAM, "", "943302eb9b9798d1ffeb9136612cbc85"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "4afef5e9cb99c0292e471d0c95243c1a"); + HCTest(NA12878_BAM, "", "3199bebe4e34b5df7558f74b05fb3a4e"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "d00a604abe02586f803b1bb9d63af0f7"); + "aef51f79d58634e4b35a1a98caba329c"); } @Test @@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4581b4c2b55b2290a4b1092d2da5e642"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5ac0d4b30a0c9a97a71ad014e63f11cf"); } private void 
HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "b20aaef138ef21a5031c06434f17f685"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "a7e3b05fdc9866965e3ab71dbbd288ff"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("9bea757f6aa75e43585e26b246fd8897")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8252f956e94cb8538b18210e9350f0e3")); executeTest("HCTestStructuralIndels: ", spec); } @@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("a360a9e14c4e9c9b7435ca5d9dfadb8d")); + Arrays.asList("7d4da215e86658e8da70fa0ade7f3eca")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - 
Arrays.asList("8adfa8a27a312760dab50787da595c57")); + Arrays.asList("b0f0467dd4bfc4cdc85fff85ffa6f0c1")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } From df189293ce2d7b4498a774c6a9169c1cf4ff86b9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 8 Apr 2013 14:01:57 -0400 Subject: [PATCH 188/226] Improve compression in Reduce Reads by incorporating probabilistic model and global het compression The Problem: Exomes seem to be more prone to base errors and one error in 20x coverage (or below, like most regions in an exome) causes RR (with default settings) to consider it a variant region. This seriously hurts compression performance. The Solution: 1. We now use a probabilistic model for determining whether we can create a consensus (in other words, whether we can error correct a site) instead of the old ratio threshold. We calculate the cumulative binomial probability of seeing the given ratio and trigger consensus creation if that pvalue is lower than the provided threshold (0.01 by default, so rather conservative). 2. We also allow het compression globally, not just at known sites. So if we cannot create a consensus at a given site then we try to perform het compression; and if we cannot perform het compression that we just don't reduce the variant region. This way very wonky regions stay uncompressed, regions with one errorful read get fully compressed, and regions with one errorful locus get het compressed. Details: 1. -minvar is now deprecated in favor of -min_pvalue. 2. Added integration test for bad pvalue input. 3. -known argument still works to force het compression only at known sites; if it's not included then we allow het compression anywhere. Added unit tests for this. 4. This commit includes fixes to het compression problems that were revealed by systematic qual testing. 
Before finalizing het compression, we now check for insertions or other variant regions (usually due to multi-allelics) which can render a region incompressible (and we back out if we find one). We were checking for excessive softclips before, but now we add these tests too. 5. We now allow het compression on some but not all of the 4 consensus reads: if creating one of the consensuses is not possible (e.g. because of excessive softclips) then we just back that one consensus out instead of backing out all of them. 6. We no longer create a mini read at the stop of the variant window for het compression. Instead, we allow it to be part of the next global consensus. 7. The coverage test is no longer run systematically on all integration tests because the quals test supercedes it. The systematic quals test is now much stricter in order to catch bugs and edge cases (very useful!). 8. Each consensus (both the normal and filtered) keep track of their own mapping qualities (before the MQ for a consensus was affected by good and bad bases/reads). 9. We now completely ignore low quality bases, unless they are the only bases present in a pileup. This way we preserve the span of reads across a region (needed for assembly). Min base qual moved to Q15. 10.Fixed long-standing bug where sliding window didn't do the right thing when removing reads that start with insertions from a header. Note that this commit must come serially before the next commit in which I am refactoring the binomial prob code in MathUtils (which is failing and slow). 
--- .../reducereads/BaseAndQualsCounts.java | 51 ++- .../compression/reducereads/BaseCounts.java | 43 ++- .../reducereads/HeaderElement.java | 141 ++++---- .../reducereads/MultiSampleCompressor.java | 4 +- .../compression/reducereads/ReduceReads.java | 38 ++- .../reducereads/SingleSampleCompressor.java | 8 +- .../reducereads/SlidingWindow.java | 321 ++++++++++-------- .../gatk/walkers/qc/AssessReducedQuals.java | 5 +- .../reducereads/BaseCountsUnitTest.java | 2 +- .../reducereads/HeaderElementUnitTest.java | 52 +-- .../ReduceReadsIntegrationTest.java | 72 ++-- .../reducereads/ReduceReadsUnitTest.java | 6 +- .../reducereads/SlidingWindowUnitTest.java | 48 ++- .../broadinstitute/sting/utils/MathUtils.java | 4 +- 14 files changed, 484 insertions(+), 311 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java index c7b990a88..416f66ec6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -69,10 +69,30 @@ public class BaseAndQualsCounts extends BaseCounts { private long sumInsertionQual_N = 0; private long sumDeletionQual_N = 0; + /* + * Increments the count + * + * @param base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + */ + public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { + // if we already have high quality bases, ignore low quality ones + if ( isLowQualBase && !isLowQuality() ) + return; + + // if this is a high quality base 
then remove any low quality bases and start from scratch + if ( !isLowQualBase && isLowQuality() ) { + if ( totalCount() > 0 ) + clear(); + setLowQuality(false); + } - public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { final BaseIndex i = BaseIndex.byteToBase(base); - super.incr(i, baseQual); + super.incr(i, baseQual, baseMappingQual); switch (i) { case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break; case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break; @@ -84,9 +104,23 @@ public class BaseAndQualsCounts extends BaseCounts { } } - public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { + /* + * Decrements the count + * + * @param base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + */ + public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { + // if this is not the right type of base, ignore it + if ( isLowQualBase != isLowQuality() ) + return; + final BaseIndex i = BaseIndex.byteToBase(base); - super.decr(i, baseQual); + super.decr(i, baseQual, baseMappingQual); switch (i) { case A: sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break; case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break; @@ -131,4 +165,13 @@ public class BaseAndQualsCounts extends BaseCounts { default: throw new IllegalArgumentException(base.name()); } } + + /** + * Clears out all stored data in this object + */ + public void clear() { + super.clear(); + sumInsertionQual_A = sumInsertionQual_C = sumInsertionQual_G = sumInsertionQual_T = sumInsertionQual_D = sumInsertionQual_I = sumInsertionQual_N = 0; + sumDeletionQual_A = sumDeletionQual_C = 
sumDeletionQual_G = sumDeletionQual_T = sumDeletionQual_D = sumDeletionQual_I = sumDeletionQual_N = 0; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 17ce3c90d..afcaf1510 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -48,6 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import org.broadinstitute.sting.utils.MathUtils; /** @@ -78,6 +80,8 @@ import com.google.java.contract.Requires; private int count_N = 0; private int sumQual_N = 0; private int totalCount = 0; // keeps track of total count since this is requested so often + private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this + private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise public static BaseCounts createWithCounts(int[] countsACGT) { @@ -100,6 +104,7 @@ import com.google.java.contract.Requires; this.count_I += other.count_I; this.count_N += other.count_N; this.totalCount += other.totalCount; + this.mappingQualities.addAll(other.mappingQualities); } @Requires("other != null") @@ -112,6 +117,7 @@ import com.google.java.contract.Requires; this.count_I -= other.count_I; this.count_N -= other.count_N; this.totalCount -= other.totalCount; + this.mappingQualities.removeAll(other.mappingQualities); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") @@ -120,7 +126,7 @@ import com.google.java.contract.Requires; } @Ensures("totalCount() == 
old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(final BaseIndex base, final byte qual) { + public void incr(final BaseIndex base, final byte qual, final int mappingQuality) { switch (base) { case A: ++count_A; sumQual_A += qual; break; case C: ++count_C; sumQual_C += qual; break; @@ -131,6 +137,7 @@ import com.google.java.contract.Requires; case N: ++count_N; sumQual_N += qual; break; } ++totalCount; + mappingQualities.add(mappingQuality); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") @@ -152,7 +159,7 @@ import com.google.java.contract.Requires; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(final BaseIndex base, final byte qual) { + public void decr(final BaseIndex base, final byte qual, final int mappingQuality) { switch (base) { case A: --count_A; sumQual_A -= qual; break; case C: --count_C; sumQual_C -= qual; break; @@ -163,6 +170,7 @@ import com.google.java.contract.Requires; case N: --count_N; sumQual_N -= qual; break; } --totalCount; + mappingQualities.remove((Integer) mappingQuality); } @Ensures("result >= 0") @@ -229,6 +237,15 @@ import com.google.java.contract.Requires; return totalCount; } + /** + * The RMS of the mapping qualities of all reads that contributed to this object + * + * @return the RMS of the mapping qualities of all reads that contributed to this object + */ + public double getRMS() { + return MathUtils.rms(mappingQualities); + } + /** * Given a base , it returns the proportional count of this base compared to all other bases * @@ -325,4 +342,26 @@ import com.google.java.contract.Requires; final int total = totalCountWithoutIndels(); return (total == 0) ? 
0.0 : (double)countOfBase(base) / (double)total; } + + /** + * @return true if this instance represents low quality bases + */ + public boolean isLowQuality() { return isLowQuality; } + + /** + * Sets the low quality value + * + * @param value true if this instance represents low quality bases false otherwise + */ + public void setLowQuality(final boolean value) { isLowQuality = value; } + + /** + * Clears out all stored data in this object + */ + public void clear() { + count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0; + sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0; + totalCount = 0; + mappingQualities.clear(); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 3532a74fb..616388e8c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -46,14 +46,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - /** * The element that describes the header of the sliding window. 
@@ -68,7 +64,6 @@ public class HeaderElement { private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right private int nSoftClippedBases; // How many bases in this site came from soft clipped bases private int location; // Genome location of this site (the sliding window knows which contig we're at - private IntArrayList mappingQuality; // keeps the mapping quality of each read that contributed to this element (site) public int getLocation() { return location; @@ -89,7 +84,7 @@ public class HeaderElement { * @param location the reference location for the new element */ public HeaderElement(final int location) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new IntArrayList()); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location); } /** @@ -99,7 +94,7 @@ public class HeaderElement { * @param location the reference location for the new element */ public HeaderElement(final int location, final int insertionsToTheRight) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location, new IntArrayList()); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location); } /** @@ -110,16 +105,14 @@ public class HeaderElement { * @param insertionsToTheRight number of insertions to the right of this HeaderElement * @param nSoftClippedBases number of softclipped bases of this HeaderElement * @param location the reference location of this reference element - * @param mappingQuality the list of mapping quality values of all reads that contributed to this * HeaderElement */ - public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, IntArrayList mappingQuality) { + public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int 
location) { this.consensusBaseCounts = consensusBaseCounts; this.filteredBaseCounts = filteredBaseCounts; this.insertionsToTheRight = insertionsToTheRight; this.nSoftClippedBases = nSoftClippedBases; this.location = location; - this.mappingQuality = mappingQuality; } /** @@ -128,35 +121,52 @@ public class HeaderElement { * * @return true if site is variant by any definition. False otherwise. */ - public boolean isVariant(double minVariantProportion, double minIndelProportion) { - return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); + public boolean isVariant(double minVariantPvalue, double minIndelProportion) { + return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); } /** * Adds a new base to the HeaderElement updating all counts accordingly * - * @param base the base to add + * @param base the base to add * @param baseQual the base quality + * @param insQual the base insertion quality + * @param delQual the base deletion quality * @param baseMappingQuality the mapping quality of the read this base belongs to + * @param minBaseQual the minimum base qual allowed to be a good base + * @param minMappingQual the minimum mapping qual allowed to be a good read + * @param isSoftClipped true if the base is soft-clipped in the original read */ public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { - if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual)) - consensusBaseCounts.incr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts + // If the base passes the MQ filter it is included in the consensus base 
counts, otherwise it's part of the filtered counts + if ( baseMappingQuality >= minMappingQual ) + consensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); else - filteredBaseCounts.incr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts + filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - this.mappingQuality.add(baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site - nSoftClippedBases += isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter + nSoftClippedBases += isSoftClipped ? 1 : 0; } + /** + * Adds a new base to the HeaderElement updating all counts accordingly + * + * @param base the base to add + * @param baseQual the base quality + * @param insQual the base insertion quality + * @param delQual the base deletion quality + * @param baseMappingQuality the mapping quality of the read this base belongs to + * @param minBaseQual the minimum base qual allowed to be a good base + * @param minMappingQual the minimum mapping qual allowed to be a good read + * @param isSoftClipped true if the base is soft-clipped in the original read + */ public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { - if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual)) - consensusBaseCounts.decr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts + // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts + if ( baseMappingQuality >= minMappingQual ) + consensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); else - filteredBaseCounts.decr(base, baseQual, insQual, 
delQual); // If the base fails filters, it is included with the filtered data base counts + filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - this.mappingQuality.remove((Integer) baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site - nSoftClippedBases -= isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter + nSoftClippedBases -= isSoftClipped ? 1 : 0; } /** * Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions @@ -193,15 +203,6 @@ public class HeaderElement { return (!hasFilteredData() && !hasConsensusData()); } - /** - * The RMS of the mapping qualities of all reads that contributed to this HeaderElement - * - * @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement - */ - public double getRMS() { - return MathUtils.rms(mappingQuality); - } - /** * removes an insertion from this element (if you removed a read that had an insertion) */ @@ -236,7 +237,7 @@ public class HeaderElement { /** * Whether or not the HeaderElement is variant due to excess deletions * - * @return whether or not the HeaderElement is variant due to excess insertions + * @return whether or not the HeaderElement is variant due to excess deletions */ private boolean isVariantFromDeletions(double minIndelProportion) { return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion; @@ -245,12 +246,15 @@ public class HeaderElement { /** * Whether or not the HeaderElement is variant due to excess mismatches * - * @return whether or not the HeaderElement is variant due to excess insertions + * @param minVariantPvalue the minimum pvalue to call a site variant. 
+ * @return whether or not the HeaderElement is variant due to excess mismatches */ - protected boolean isVariantFromMismatches(double minVariantProportion) { - BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); - double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon); - return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion); + protected boolean isVariantFromMismatches(double minVariantPvalue) { + final int totalCount = consensusBaseCounts.totalCountWithoutIndels(); + final BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); + final int countOfOtherBases = totalCount - consensusBaseCounts.countOfBase(mostCommon); + final double pvalue = countOfOtherBases == 0 ? 0.0 : MathUtils.binomialCumulativeProbability(0, countOfOtherBases+1, totalCount, 0.5); + return pvalue > minVariantPvalue; } /** @@ -263,42 +267,44 @@ public class HeaderElement { return nSoftClippedBases > 0 && nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases); } - protected boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) { - return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual; - } - /** * Calculates the number of alleles necessary to represent this site. * - * @param minVariantProportion the minimum proportion to call a site variant. - * @param allowDeletions should we allow deletions? - * @return the number of alleles necessary to represent this site or -1 if allowDeletions is false and there are a sufficient number of them + * @param minVariantPvalue the minimum pvalue to call a site variant. 
+ * @return the number of alleles necessary to represent this site or -1 if there are too many indels */ - public int getNumberOfAlleles(final double minVariantProportion, final boolean allowDeletions) { - final List alleles = getAlleles(minVariantProportion, allowDeletions); + public int getNumberOfBaseAlleles(final double minVariantPvalue) { + final ObjectArrayList alleles = getAlleles(minVariantPvalue); return alleles == null ? -1 : alleles.size(); } /** * Calculates the alleles necessary to represent this site. * - * @param minVariantProportion the minimum proportion to call a site variant. - * @param allowDeletions should we allow deletions? - * @return the list of alleles necessary to represent this site or null if allowDeletions is false and there are a sufficient number of them + * @param minVariantPvalue the minimum pvalue to call a site variant. + * @return the list of alleles necessary to represent this site or null if there are too many indels */ - public List getAlleles(final double minVariantProportion, final boolean allowDeletions) { + public ObjectArrayList getAlleles(final double minVariantPvalue) { + // make sure we have bases at all final int totalBaseCount = consensusBaseCounts.totalCount(); if ( totalBaseCount == 0 ) - return Collections.emptyList(); + return new ObjectArrayList(0); - final int minBaseCountForRelevantAlleles = Math.max(1, (int)(minVariantProportion * totalBaseCount)); + // next, check for insertions + if ( hasSignificantCount(insertionsToTheRight, minVariantPvalue) ) + return null; - final List alleles = new ArrayList(4); + // finally, check for the bases themselves (including deletions) + final ObjectArrayList alleles = new ObjectArrayList(4); for ( final BaseIndex base : BaseIndex.values() ) { final int baseCount = consensusBaseCounts.countOfBase(base); + if ( baseCount == 0 ) + continue; - if ( baseCount >= minBaseCountForRelevantAlleles ) { - if ( !allowDeletions && base == BaseIndex.D ) + final double pvalue = 
MathUtils.binomialCumulativeProbability(0, baseCount+1, totalBaseCount, 0.5); + + if ( pvalue > minVariantPvalue ) { + if ( base == BaseIndex.D ) return null; alleles.add(base); } @@ -309,15 +315,26 @@ public class HeaderElement { /* * Checks whether there are a significant number of softclips. * - * @param minVariantProportion the minimum proportion to consider something significant. + * @param minVariantPvalue the minimum pvalue to call a site variant. * @return true if there are significant softclips, false otherwise */ - public boolean hasSignificantSoftclips(final double minVariantProportion) { + public boolean hasSignificantSoftclips(final double minVariantPvalue) { + return hasSignificantCount(nSoftClippedBases, minVariantPvalue); + } + + /* + * Checks whether there are a significant number of count. + * + * @param count the count to test against + * @param minVariantPvalue the minimum pvalue to call a site variant. + * @return true if there is a significant count given the provided pvalue, false otherwise + */ + private boolean hasSignificantCount(final int count, final double minVariantPvalue) { final int totalBaseCount = consensusBaseCounts.totalCount(); - if ( totalBaseCount == 0 ) + if ( count == 0 || totalBaseCount == 0 ) return false; - final int minBaseCountForSignificance = Math.max(1, (int)(minVariantProportion * totalBaseCount)); - return nSoftClippedBases >= minBaseCountForSignificance; + final double pvalue = MathUtils.binomialCumulativeProbability(0, count+1, totalBaseCount, 0.5); + return pvalue > minVariantPvalue; } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 42873964d..85aee9fc9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -96,14 +96,14 @@ public class MultiSampleCompressor { final int contextSize, final int downsampleCoverage, final int minMappingQuality, - final double minAltProportionToTriggerVariant, + final double minAltPValueToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + minMappingQuality, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index c9730e95a..4d90a83be 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -64,6 +64,7 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -139,19 +140,21 @@ public class ReduceReads extends ReadWalker, Redu * towards variable regions. 
*/ @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false) - public byte minBaseQual = 20; + public byte minBaseQual = 15; /** - * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality - * lower than this threshold will be hard clipped off before entering the reduce reads algorithm. + * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases at the tails with + * quality at or lower than this threshold will be hard clipped off before entering the reduce reads algorithm. */ @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false) public byte minTailQuality = 2; /** - * Any number of VCF files representing known SNPs to be used for the experimental polyploid-based reduction. + * Any number of VCF files representing known SNPs to be used for the polyploid-based reduction. * Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored. - * Note that polyploid ("het") compression will work only when a single SNP is present in a consensus window. + * If provided, the polyploid ("het") compression will work only when a single SNP from the known set is present + * in a consensus window (otherwise there will be no reduction); if not provided then polyploid compression will + * be triggered anywhere there is a single SNP present in a consensus window. */ @Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false) public List> known = Collections.emptyList(); @@ -204,9 +207,18 @@ public class ReduceReads extends ReadWalker, Redu * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be * considered consensus. 
*/ + @Deprecated @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) public double minAltProportionToTriggerVariant = 0.05; + /** + * Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region. + * Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to trigger polyploid compression). + */ + @Advanced + @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false) + public double minAltPValueToTriggerVariant = 0.01; + /** * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be * considered consensus. @@ -253,7 +265,7 @@ public class ReduceReads extends ReadWalker, Redu ObjectSortedSet intervalList; - final ObjectSortedSet knownSnpPositions = new ObjectAVLTreeSet(); + ObjectSortedSet knownSnpPositions; // IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag @@ -273,6 +285,14 @@ public class ReduceReads extends ReadWalker, Redu if ( nwayout && out != null ) throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other"); + if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 ) + throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); + + if ( known.isEmpty() ) + knownSnpPositions = null; + else + knownSnpPositions = new ObjectAVLTreeSet(); + GenomeAnalysisEngine toolkit = getToolkit(); readNameHash = new Object2LongOpenHashMap(100000); // prepare the read name hash to keep track of what reads have had their read names compressed intervalList = new 
ObjectAVLTreeSet(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode @@ -392,7 +412,7 @@ public class ReduceReads extends ReadWalker, Redu */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } /** @@ -470,8 +490,8 @@ public class ReduceReads extends ReadWalker, Redu * @param read the current read, used for checking whether there are stale positions we can remove */ protected void clearStaleKnownPositions(final GATKSAMRecord read) { - // nothing to clear if empty - if ( knownSnpPositions.isEmpty() ) + // nothing to clear if not used or empty + if ( knownSnpPositions == null || knownSnpPositions.isEmpty() ) return; // not ready to be cleared until we encounter a read from a different contig diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index db1e0baaf..ec041386c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -62,7 +62,7 @@ public class SingleSampleCompressor { final private int contextSize; final private int downsampleCoverage; final private int minMappingQuality; - final private double minAltProportionToTriggerVariant; + final private double 
minAltPValueToTriggerVariant; final private double minIndelProportionToTriggerVariant; final private int minBaseQual; final private ReduceReads.DownsampleStrategy downsampleStrategy; @@ -75,7 +75,7 @@ public class SingleSampleCompressor { public SingleSampleCompressor(final int contextSize, final int downsampleCoverage, final int minMappingQuality, - final double minAltProportionToTriggerVariant, + final double minAltPValueToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy) { @@ -83,7 +83,7 @@ public class SingleSampleCompressor { this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; this.slidingWindowCounter = 0; - this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant; + this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant; this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; this.minBaseQual = minBaseQual; this.downsampleStrategy = downsampleStrategy; @@ -114,7 +114,7 @@ public class SingleSampleCompressor { } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); slidingWindowCounter++; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 8a80c5570..5fd7724cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -97,9 +97,8 @@ public class SlidingWindow { protected int filteredDataConsensusCounter; protected String filteredDataReadName; - // Additional parameters - protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches + protected double MIN_ALT_PVALUE_TO_TRIGGER_VARIANT; // pvalue has to be greater than this value to trigger variant region due to mismatches protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value protected int MIN_MAPPING_QUALITY; @@ -150,11 +149,15 @@ public class SlidingWindow { this.readsInWindow = new ObjectAVLTreeSet(); } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) { + public SlidingWindow(final String contig, final int contigIndex, final int contextSize, final SAMFileHeader samHeader, + final GATKSAMReadGroupRecord readGroupAttribute, final int windowNumber, + final double minAltPValueToTriggerVariant, final double minIndelProportionToTriggerVariant, + final int minBaseQual, final int minMappingQuality, final int downsampleCoverage, + final ReduceReads.DownsampleStrategy 
downsampleStrategy, final boolean hasIndelQualities) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; - this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant; + this.MIN_ALT_PVALUE_TO_TRIGGER_VARIANT = minAltPValueToTriggerVariant; this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant; this.MIN_BASE_QUAL_TO_COUNT = minBaseQual; this.MIN_MAPPING_QUALITY = minMappingQuality; @@ -341,8 +344,14 @@ public class SlidingWindow { private final MarkedSites markedSites = new MarkedSites(); /** - * returns an array marked with variant and non-variant regions (it uses - * markVariantRegions to make the marks) + * returns the MarkedSites object so that it can be tested after adding data to the Sliding Window + * + * @return the Marked Sites object used by this Sliding Window + */ + protected MarkedSites getMarkedSitesForTesting() { return markedSites; } + + /** + * returns an array marked with variant and non-variant regions (it uses markVariantRegion to make the marks) * * @param stop check the window from start to stop (not-inclusive) */ @@ -368,8 +377,8 @@ public class SlidingWindow { if (headerElementIterator.hasNext()) { HeaderElement headerElement = headerElementIterator.next(); - if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) - markVariantRegion(markedSites, i - windowHeaderStartLocation); + if (headerElement.isVariant(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) + markVariantRegion(i - windowHeaderStartLocation); } else break; @@ -379,14 +388,24 @@ public class SlidingWindow { /** * Marks the sites around the variant site (as true) * - * @param markedSites the boolean array to bear the marks * @param variantSiteLocation the location where a variant site was found */ - protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) { + 
protected void markVariantRegion(final int variantSiteLocation) { int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize; - int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1; - for (int i = from; i < to; i++) - markedSites.getVariantSiteBitSet()[i] = true; + int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length - 1 : variantSiteLocation + contextSize; + markRegionAs(from, to, true); + } + + /** + * Marks the sites around the variant site (as true) + * + * @param from the start index (inclusive) to mark + * @param to the end index (inclusive) to mark + * @param isVariant mark the region with this boolean value + */ + private void markRegionAs(final int from, final int to, final boolean isVariant) { + for (int i = from; i <= to; i++) + markedSites.getVariantSiteBitSet()[i] = isVariant; } /** @@ -580,7 +599,7 @@ public class SlidingWindow { filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, strandType); } - genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS()); + genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts()); } return result; @@ -611,7 +630,7 @@ public class SlidingWindow { if (!headerElement.hasConsensusData()) throw new ReviewedStingException("No CONSENSUS data in " + index); - genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS()); + genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts()); } } @@ -620,15 +639,14 @@ public class SlidingWindow { * * @param syntheticRead the synthetic read to add to * @param 
baseCounts the base counts object in the header element - * @param rms the rms mapping quality in the header element */ - private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) { + private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) { final BaseIndex base = baseCounts.baseIndexWithMostProbability(); byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE); byte qual = baseCounts.averageQualsOfBase(base); byte insQual = baseCounts.averageInsertionQualsOfBase(base); byte delQual = baseCounts.averageDeletionQualsOfBase(base); - syntheticRead.add(base, count, qual, insQual, delQual, rms); + syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS()); } /** @@ -636,39 +654,30 @@ public class SlidingWindow { * * @param start the first window header index in the variant region (inclusive) * @param stop the last window header index of the variant region (inclusive) - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here - * @return a non-null list of all reads contained in the variant region + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) + * @return a non-null object representing all reads contained in the variant region */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected ObjectList compressVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { - ObjectList allReads = new ObjectArrayList(); + protected CloseVariantRegionResult compressVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { + final CloseVariantRegionResult allReads = new CloseVariantRegionResult(stop); // Try to compress into a polyploid consensus - // 
Optimization: don't bother if there are no known SNPs - final int hetRefPosition = knownSnpPositions.isEmpty() ? -1 : findSinglePolyploidCompressiblePosition(start, stop); - - boolean successfullyCreatedPolyploidConsensus = false; + // Optimization: don't bother if there are no known SNPs here + final int hetRefPosition = (knownSnpPositions != null && knownSnpPositions.isEmpty()) ? -1 : findSinglePolyploidCompressiblePosition(start, stop); // Note that using the hetRefPosition protects us from trying to compress variant regions that are created by // insertions (which we don't want because we can't confirm that they represent the same allele). - // Also, we only allow polyploid consensus creation at known sites. + // Also, we only allow polyploid consensus creation at known sites if provided. if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) { - // try to create the polyploid consensus - final ObjectList polyploidReads = createPolyploidConsensus(start, stop, hetRefPosition); - - // if successful we are good to go! 
- if ( polyploidReads != null ) { - allReads.addAll(polyploidReads); - successfullyCreatedPolyploidConsensus = true; - } + allReads.reads.addAll(createPolyploidConsensus(hetRefPosition)); + allReads.stopPerformed = hetRefPosition; // we stopped at the het position } - // if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them // from the window header entirely; also remove all reads preceding the variant region (since they will be output // as consensus right after compression) - if ( !successfullyCreatedPolyploidConsensus ) { + else { final int refStart = windowHeader.get(start).getLocation(); final int refStop = windowHeader.get(stop).getLocation(); @@ -676,7 +685,7 @@ public class SlidingWindow { for ( final GATKSAMRecord read : readsInWindow ) { if ( read.getSoftStart() <= refStop ) { if ( read.getAlignmentEnd() >= refStart ) { - allReads.add(read); + allReads.reads.add(read); removeFromHeader(windowHeader, read); } toRemove.add(read); @@ -687,6 +696,7 @@ public class SlidingWindow { for ( final GATKSAMRecord read : toRemove ) readsInWindow.remove(read); } + return allReads; } @@ -694,13 +704,13 @@ public class SlidingWindow { * Determines whether the given position match one of the known sites * * @param targetPosition the position of the het site - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) * @return true if the targetPosition matches a known SNP position, false otherwise */ @Requires({"targetPosition >= 1 && knownSnpPositions != null"}) protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet knownSnpPositions) { final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition); - return 
knownSnpPositions.contains(targetLoc); + return knownSnpPositions == null || knownSnpPositions.contains(targetLoc); } /* @@ -716,7 +726,7 @@ public class SlidingWindow { for ( int i = start; i <= stop; i++ ) { - final int nAlleles = windowHeader.get(i).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false); + final int nAlleles = windowHeader.get(i).getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT); // we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios if ( nAlleles > 2 || nAlleles == -1 ) @@ -736,21 +746,22 @@ public class SlidingWindow { } /* - * Checks whether there's a position in the header with a significant number of softclips. + * Checks whether there's a position in the header with a significant number of softclips or a variant. * * @param header the window header to examine * @param positionToSkip the global position to skip in the examination (use negative number if you don't want to make use of this argument) * @return true if there exists a position with significant softclips, false otherwise */ @Requires("header != null") - protected boolean hasSignificantSoftclipPosition(final List header, final int positionToSkip) { + protected boolean hasPositionWithSignificantSoftclipsOrVariant(final List header, final int positionToSkip) { for ( final HeaderElement headerElement : header ) { if ( headerElement.getLocation() == positionToSkip ) continue; - if ( headerElement.hasSignificantSoftclips(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT) ) + if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) || + headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) > 1 ) return true; } @@ -762,29 +773,45 @@ public class SlidingWindow { * * @param start the first window header index in the variant region (inclusive) * @param stop the last window header index of the variant region (inclusive) - * @param knownSnpPositions the set of known SNPs used to determine 
whether to allow polyploid consensus creation here - * @return a non-null list of all reads contained in the variant region plus any adjacent synthetic reads + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) + * @return a non-null object representing all reads contained in the variant region plus any adjacent synthetic reads */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected ObjectList closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { - ObjectList allReads = compressVariantRegion(start, stop, knownSnpPositions); + protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { + final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions); - ObjectList result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads; - result.addAll(addToSyntheticReads(windowHeader, 0, stop+1, SyntheticRead.StrandType.STRANDLESS)); - result.addAll(finalizeAndAdd(ConsensusType.BOTH)); + final CloseVariantRegionResult result = new CloseVariantRegionResult(allReads.stopPerformed); + result.reads.addAll(downsampleCoverage > 0 ? 
downsampleVariantRegion(allReads.reads) : allReads.reads); + result.reads.addAll(addToSyntheticReads(windowHeader, 0, allReads.stopPerformed + 1, SyntheticRead.StrandType.STRANDLESS)); + result.reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); return result; // finalized reads will be downsampled if necessary } + /* + * @see #closeVariantRegions(CompressionStash, ObjectSortedSet, boolean) with forceCloseFullRegions set to false + */ + public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { + return closeVariantRegions(regions, knownSnpPositions, false); + } + + private static final class CloseVariantRegionResult { + final private ObjectList reads = new ObjectArrayList(); + private int stopPerformed; + + public CloseVariantRegionResult(final int stopPerformed) { this.stopPerformed = stopPerformed; } + } + /* * Finalizes the list of regions requested (and any regions preceding them) * * @param regions the list of regions to finalize - * @param knownSnpPositions the set of known SNP positions + * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere) + * @param forceCloseFullRegions if true, requires this method to make sure all regions are fully closed; otherwise, we may decide not to close up to the very end (e.g. 
during het compression) * @return a non-null set of reduced reads representing the finalized regions */ - public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { + public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions, final boolean forceCloseFullRegions) { final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); if ( !regions.isEmpty() ) { @@ -794,9 +821,33 @@ public class SlidingWindow { for ( final GenomeLoc region : regions ) { if (((FinishedGenomeLoc)region).isFinished() && region.getContig().equals(contig) && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) { final int start = region.getStart() - windowHeaderStart; - final int stop = region.getStop() - windowHeaderStart; + int stop = region.getStop() - windowHeaderStart; - allReads.addAll(closeVariantRegion(start, stop, knownSnpPositions)); + CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions); + allReads.addAll(closeVariantRegionResult.reads); + + // check whether we didn't close the whole region that was requested + if ( stop > 0 && closeVariantRegionResult.stopPerformed < stop ) { + // we should update the variant sites bitset because the context size's worth of bases after the variant position are no longer "variant" + markRegionAs(closeVariantRegionResult.stopPerformed + 1, stop, false); + + // if the calling method said that it didn't care then we are okay so update the stop + if ( !forceCloseFullRegions ) { + stop = closeVariantRegionResult.stopPerformed; + } + // otherwise, we need to forcibly push the stop that we originally requested + else { + while ( closeVariantRegionResult.stopPerformed < stop ) { + // first clean up used header elements so they don't get reused + for ( int i = 0; i <= closeVariantRegionResult.stopPerformed; i++ ) + 
windowHeader.remove(); + stop -= (closeVariantRegionResult.stopPerformed + 1); + + closeVariantRegionResult = closeVariantRegion(0, stop, knownSnpPositions); + allReads.addAll(closeVariantRegionResult.reads); + } + } + } // We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions. // Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads @@ -847,7 +898,7 @@ public class SlidingWindow { * regions that still exist regardless of being able to fulfill the * context size requirement in the end. * - * @param knownSnpPositions the set of known SNP positions + * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere) * @return A non-null set/list of all reads generated */ @Ensures("result != null") @@ -855,12 +906,11 @@ public class SlidingWindow { // mark variant regions ObjectSet finalizedReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); CompressionStash regions = new CompressionStash(); - boolean forceCloseUnfinishedRegions = true; if (!windowHeader.isEmpty()) { markSites(getStopLocation(windowHeader) + 1); - regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions); - finalizedReads = closeVariantRegions(regions, knownSnpPositions); + regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), true); + finalizedReads = closeVariantRegions(regions, knownSnpPositions, true); if (!windowHeader.isEmpty()) { finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS)); @@ -908,117 +958,105 @@ public class SlidingWindow { } // define this so that we can use Java generics below - private static class HeaderElementList extends LinkedList {} + private final static class HeaderElementList extends LinkedList {} 
+ + private final static class SingleStrandConsensusData { + final HeaderElementList consensus = new HeaderElementList(); + final ObjectList reads = new ObjectArrayList(); + } /** - * Finalizes a variant region for point mutations, and any adjacent synthetic reads. Indel sites are not supported. + * Finalizes a variant region - and any adjacent synthetic reads - for point mutations (indel sites are not + * supported) with polyploid compression. * - * @param start the first window header index of the variant region (inclusive) - * @param stop the last window header index of the variant region (inclusive) * @param hetRefPosition window header index of the het site; MUST NOT BE AN INDEL SITE! - * @return a list of all reads contained in the variant region as a polyploid consensus, or null if not possible + * @return a non-null list of all reads contained in the variant region as a polyploid consensus */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - protected ObjectList createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) { + @Ensures({"result != null"}) + protected ObjectList createPolyploidConsensus(final int hetRefPosition) { // we will create two (positive strand, negative strand) headers for each haplotype - final HeaderElementList[] headersPosStrand = new HeaderElementList[2]; - final HeaderElementList[] headersNegStrand = new HeaderElementList[2]; + final SingleStrandConsensusData[] headersPosStrand = new SingleStrandConsensusData[2]; + final SingleStrandConsensusData[] headersNegStrand = new SingleStrandConsensusData[2]; - final int refStart = windowHeader.get(start).getLocation(); - final int refStop = windowHeader.get(stop).getLocation(); final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation(); // initialize the mapping from base (allele) to header final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2); - for ( final BaseIndex allele : 
windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false) ) { + for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) ) { final int currentIndex = alleleHeaderMap.size(); if ( currentIndex > 1 ) throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus"); alleleHeaderMap.put(allele.b, currentIndex); - headersPosStrand[currentIndex] = new HeaderElementList(); - headersNegStrand[currentIndex] = new HeaderElementList(); + headersPosStrand[currentIndex] = new SingleStrandConsensusData(); + headersNegStrand[currentIndex] = new SingleStrandConsensusData(); } // sanity check that we saw 2 alleles if ( alleleHeaderMap.size() != 2 ) throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size()); - final ObjectList toRemoveFromReadCache = new ObjectArrayList(); - final ObjectList toRemoveFromHeader = new ObjectArrayList(); + final ObjectList readsToRemoveFromHeader = new ObjectArrayList(); for ( final GATKSAMRecord read : readsInWindow ) { - // if the read falls after the region, just skip it for now (we'll get to it later) - if ( read.getSoftStart() > refStop ) + // if the read falls after the het position, just skip it for now (we'll get to it later) + if ( read.getSoftStart() > globalHetRefPosition ) continue; - // if the read falls before the region, remove it - if ( read.getSoftEnd() < refStart ) { - toRemoveFromReadCache.add(read); - continue; - } - - // check whether the read spans the het site - if ( read.getSoftStart() <= globalHetRefPosition && read.getSoftEnd() >= globalHetRefPosition ) { - - // make sure it meets the minimum mapping quality requirement (if not, we'll remove it and not use it for the consensuses) - if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) { - - // where on the read is the het position? 
- final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); - - // this is safe because indels are not supported - final byte base = read.getReadBases()[readPosOfHet]; - final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet]; - - // make sure that the base passes filters (if not, we'll remove it and not use it for the consensuses) - if ( qual >= MIN_BASE_QUAL_TO_COUNT ) { - - // check which allele this read represents - final Integer allele = alleleHeaderMap.get(base); - - // ignore the read if it represents a base that's not part of the consensus - if ( allele != null ) { - // add to the appropriate polyploid header - final LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; - addToHeader(header, read); - } - } - } - - // remove from the standard header so that we don't double count it - toRemoveFromHeader.add(read); - } - - // we remove all reads falling inside the variant region from the window - toRemoveFromReadCache.add(read); - } - - // sanity check that no new "variant region" exists on just a single consensus strand - // due to softclips now that we've broken everything out into their component parts - for ( final LinkedList header : headersPosStrand ) { - if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) ) - return null; - } - for ( final LinkedList header : headersNegStrand ) { - if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) ) - return null; - } - - // create the polyploid synthetic reads - final ObjectList hetReads = new ObjectArrayList(); - for ( final LinkedList header : headersPosStrand ) - finalizeHetConsensus(header, false, hetReads); - for ( final LinkedList header : headersNegStrand ) - finalizeHetConsensus(header, true, hetReads); - - // remove all used reads - for ( final GATKSAMRecord read : toRemoveFromReadCache ) + // remove all other reads from 
the read cache since we're going to use them here readsInWindow.remove(read); - for ( final GATKSAMRecord read : toRemoveFromHeader ) + + // if the read falls before the het position, we don't need to look at it + if ( read.getSoftEnd() < globalHetRefPosition ) + continue; + + // remove all spanning reads from the consensus header since we're going to incorporate them into a consensus here instead removeFromHeader(windowHeader, read); + // make sure it meets the minimum mapping quality requirement (if not, we won't use it for the consensus) + if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) { + + // where on the read is the het position? + final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); + + // this is safe because indels are not supported + final byte base = read.getReadBases()[readPosOfHet]; + final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet]; + + // check which allele this read represents + final Integer allele = alleleHeaderMap.get(base); + + // ignore the read if it represents a base that's not part of the consensus + if ( allele != null ) { + // add to the appropriate polyploid header + final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; + header.reads.add(read); + addToHeader(header.consensus, read); + } + } + } + + // create the polyploid synthetic reads if we can + final ObjectList hetReads = new ObjectArrayList(); + + // sanity check that no new "variant region" exists on just a single consensus strand due to softclips + // or multi-allelic sites now that we've broken everything out into their component parts. if one does + // exist then we need to back out the consensus for that strand only. 
+ for ( final SingleStrandConsensusData header : headersPosStrand ) { + if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) ) + hetReads.addAll(header.reads); + else + finalizeHetConsensus(header.consensus, false, hetReads); + } + for ( final SingleStrandConsensusData header : headersNegStrand ) { + if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) ) + hetReads.addAll(header.reads); + else + finalizeHetConsensus(header.consensus, true, hetReads); + } + return hetReads; } @@ -1058,7 +1096,7 @@ public class SlidingWindow { int locationIndex = headerStart < 0 ? 0 : readStart - headerStart; if ( removeRead && locationIndex < 0 ) - throw new ReviewedStingException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header)); + throw new IllegalStateException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header)); // we only need to create new header elements if we are adding the read, not when we're removing it if ( !removeRead ) @@ -1138,6 +1176,8 @@ public class SlidingWindow { case H: break; case I: + readBaseIndex += cigarElement.getLength(); + // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it. 
if ( removeRead && locationIndex == 0 ) break; @@ -1150,7 +1190,6 @@ public class SlidingWindow { else headerElement.addInsertionToTheRight(); - readBaseIndex += cigarElement.getLength(); break; case D: // deletions are added to the baseCounts with the read mapping quality as it's quality score diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java index 597077742..4e5652c45 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java @@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -99,7 +100,7 @@ public class AssessReducedQuals extends LocusWalker implem public int sufficientQualSum = 600; @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > (epsilon * Quals_original_bam) we output this interval", required = false) - public double qual_epsilon = 0.25; + public double qual_epsilon = 0.10; @Output protected PrintStream out; @@ -145,7 +146,7 @@ public class AssessReducedQuals extends LocusWalker implem } private boolean isGoodRead(final PileupElement p) { - return !p.isDeletion() && (int)p.getQual() >= 20 && p.getMappingQual() >= 20; + return !p.isDeletion() && (int)p.getQual() >= 15 && p.getMappingQual() >= 20; } private int getTagIndex(final List tags) { diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java index 7f41836fa..5ae6e86df 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java @@ -179,7 +179,7 @@ public class BaseCountsUnitTest extends BaseTest { BaseCounts counts = new BaseCounts(); for ( int qual : test.quals ) - counts.incr(BaseIndex.A, (byte)qual); + counts.incr(BaseIndex.A, (byte)qual, 20); final int actualSum = (int)counts.getSumQuals((byte)'A'); final int expectedSum = qualSum(test.quals); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java index 2f744e914..435c1029c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java @@ -48,12 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; public class HeaderElementUnitTest extends BaseTest { @@ -119,16 +119,15 @@ public class HeaderElementUnitTest extends BaseTest { Assert.assertFalse(headerElement.hasFilteredData()); Assert.assertFalse(headerElement.hasInsertionToTheRight()); Assert.assertTrue(headerElement.isEmpty()); - Assert.assertEquals(headerElement.getRMS(), 
0.0); } private void testHeaderData(final HeaderElement headerElement, final HETest test) { - Assert.assertEquals(headerElement.getRMS(), (double)test.MQ); Assert.assertEquals(headerElement.isVariantFromSoftClips(), test.isClip); Assert.assertFalse(headerElement.isEmpty()); Assert.assertFalse(headerElement.hasInsertionToTheRight()); - Assert.assertEquals(headerElement.hasConsensusData(), headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual)); - Assert.assertEquals(headerElement.hasFilteredData(), !headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual)); + Assert.assertEquals(headerElement.hasConsensusData(), test.MQ >= minMappingQual); + Assert.assertEquals(headerElement.hasFilteredData(), test.MQ < minMappingQual); + Assert.assertEquals(headerElement.hasConsensusData() ? headerElement.getConsensusBaseCounts().getRMS() : headerElement.getFilteredBaseCounts().getRMS(), (double)test.MQ); Assert.assertFalse(headerElement.isVariantFromMismatches(0.05)); Assert.assertEquals(headerElement.isVariant(0.05, 0.05), test.isClip); } @@ -136,13 +135,11 @@ public class HeaderElementUnitTest extends BaseTest { private class AllelesTest { public final int[] counts; - public final double proportion; - public final boolean allowDeletions; + public final double pvalue; - private AllelesTest(final int[] counts, final double proportion, final boolean allowDeletions) { + private AllelesTest(final int[] counts, final double pvalue) { this.counts = counts; - this.proportion = proportion; - this.allowDeletions = allowDeletions; + this.pvalue = pvalue; } } @@ -151,17 +148,15 @@ public class HeaderElementUnitTest extends BaseTest { List tests = new ArrayList(); final int[] counts = new int[]{ 0, 5, 10, 15, 20 }; - final double [] proportions = new double[]{ 0.0, 0.05, 0.10, 0.50, 1.0 }; + final double [] pvalues = new double[]{ 0.0, 0.01, 0.05, 0.20, 1.0 }; for ( final int countA : counts ) { for ( final int countC : counts ) { for 
( final int countG : counts ) { for ( final int countT : counts ) { for ( final int countD : counts ) { - for ( final double proportion : proportions ) { - for ( final boolean allowDeletions : Arrays.asList(true, false) ) { - tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, proportion, allowDeletions)}); - } + for ( final double pvalue : pvalues ) { + tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, pvalue)}); } } } @@ -182,28 +177,33 @@ public class HeaderElementUnitTest extends BaseTest { headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false); } - final int nAllelesSeen = headerElement.getNumberOfAlleles(test.proportion, test.allowDeletions); - final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion, test.allowDeletions); + final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue); + final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.pvalue); Assert.assertEquals(nAllelesSeen, nAllelesExpected); } - private static int calculateExpectedAlleles(final int[] counts, final double proportion, final boolean allowDeletions) { - double total = 0.0; + private static int calculateExpectedAlleles(final int[] counts, final double targetPvalue) { + int total = 0; for ( final int count : counts ) { total += count; } - final int minCount = Math.max(1, (int)(proportion * total)); - - if ( !allowDeletions && counts[BaseIndex.D.index] >= minCount ) - return -1; - int result = 0; - for ( final int count : counts ) { - if ( count > 0 && count >= minCount ) + for ( int index = 0; index < counts.length; index++ ) { + final int count = counts[index]; + if ( count == 0 ) + continue; + + final double pvalue = MathUtils.binomialCumulativeProbability(0, count + 1, total, 0.5); + + if ( pvalue > targetPvalue ) { + if ( index == BaseIndex.D.index ) + return -1; result++; + } } + return result; } } diff 
--git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 65e930b89..1ab001147 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.io.File; @@ -74,10 +75,10 @@ public class ReduceReadsIntegrationTest extends WalkerTest { final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e"; protected Pair, List> executeTest(final String name, final WalkerTestSpec spec) { - return executeTest(name, spec, false); + return executeTest(name, spec, emptyFileMd5); } - protected Pair, List> executeTest(final String name, final WalkerTestSpec spec, final boolean disableQualsTest) { + protected Pair, List> executeTest(final String name, final WalkerTestSpec spec, final String qualsTestMD5) { final Pair, List> result = super.executeTest(name, spec); // perform some Reduce Reads specific testing now @@ -93,15 +94,13 @@ public class ReduceReadsIntegrationTest extends WalkerTest { reducedInputs.append(file.getAbsolutePath()); } - // run the coverage test - final String coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs); - super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); + // the coverage test is a less stricter version of the quals test so we can safely ignore it for now + //final String 
coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs); + //super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); // run the quals test - if ( !disableQualsTest ) { - final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs); - super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); - } + final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs); + super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(qualsTestMD5))); } return result; @@ -147,62 +146,69 @@ public class ReduceReadsIntegrationTest extends WalkerTest { } private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) { - this.RRTest(testName, args, md5, useKnowns, false); + this.RRTest(testName, args, md5, useKnowns, emptyFileMd5); } - private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final boolean disableQualsTest) { + private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final String qualsTestMD5) { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? 
" -known " + DBSNP : "") + " "; WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5)); - executeTest(testName, spec, disableQualsTest); + executeTest(testName, spec, qualsTestMD5); } @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "538362abd504200800145720b23c98ce", false); + RRTest("testDefaultCompression ", L, "62f8cdb85a424e42e9c56f36302d1dba", false); } @Test(enabled = true) public void testDefaultCompressionWithKnowns() { - RRTest("testDefaultCompressionWithKnowns ", L, "79cdbd997196957af63f46353cff710b", true); + RRTest("testDefaultCompressionWithKnowns ", L, "874c0e0a54c3db67f5e9d7c0d45b7844", true); } private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; @Test(enabled = true) public void testMultipleIntervals() { - RRTest("testMultipleIntervals ", intervals, "6733b25e87e3fce5753cf7936ccf934f", false); + RRTest("testMultipleIntervals ", intervals, "2e849f8324b27af36bae8cb9b01722e6", false); } @Test(enabled = true) public void testMultipleIntervalsWithKnowns() { - RRTest("testMultipleIntervalsWithKnowns ", intervals, "99e2a79befc71eaadb4197c66a0d6df8", true); + RRTest("testMultipleIntervalsWithKnowns ", intervals, "71bc2167cc6916288bd34dcf099feebc", true); } + final String highCompressionMD5 = "c83256fa2d6785d5188f50dd45c77e0f"; + @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e3b7e14655973c8950d7fec96321e483", false); + RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -mindel 0.3 " + L, highCompressionMD5, false); } @Test(enabled = true) public void testHighCompressionWithKnowns() { - RRTest("testHighCompressionWithKnowns ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "30a7ed079b3a41ed63e520260fa6afe3", 
true); + RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -mindel 0.3 " + L, highCompressionMD5, true); } @Test(enabled = true) public void testLowCompression() { - // too much downsampling for quals test - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", false, true); + RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a903558ef284381d74b0ad837deb19f6", false); } @Test(enabled = true) public void testLowCompressionWithKnowns() { - // too much downsampling for quals test - RRTest("testLowCompressionWithKnowns ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", true, true); + RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a4c5aa158c6ebbc703134cbe2d48619c", true); + } + + @Test(enabled = true) + public void testBadPvalueInput() { + final String cmd = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + "-o %s -min_pvalue -0.01"; + WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, UserException.BadArgumentValue.class); + executeTest("testBadPvalueInput", spec); } @Test(enabled = true) public void testIndelCompression() { - final String md5 = "f58ae2154e0e5716be0e850b7605856e"; + final String md5 = "56154baed62be07008d3684a0a4c0996"; RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false); RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true); } @@ -210,28 +216,25 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - // don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB) - 
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bfe0693aea74634f1035a9bd11302517")), true); + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("d7655de41d90aecb716f79e32d53b2d1"))); } @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - // don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB) - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("930ec2e2c3b62bec7a2425a82c64f022")), true); + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fa549ba96ca0ce5fbf3553ba173167e8"))); } @Test(enabled = true) public void testCoReductionWithKnowns() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; - // don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB) - executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fe7c9fd35e50a828e0f38a7ae25b60a7")), true); + executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9edcf09b21a4ae8d9fc25222bcb0486b"))); } @Test(enabled = true) public void testInsertionsAtEdgeOfConsensus() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; - executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4445db7aeddaf2f1d86e1af0cdc74c8"))); + executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("27cc8f1a336b2d0a29855ceb8fc988b0"))); } 
/** @@ -245,7 +248,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("f118e83c394d21d901a24230379864fc"))); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("34baf99904b676d5f132d3791030ed0a")), "3eab32c215ba68e75efd5ab7e9f7a2e7"); } /** @@ -256,7 +259,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; // we expect to lose coverage due to the downsampling so don't run the systematic tests - executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bd5198a3e21034887b741faaaa3964bf"))); + executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("985c4f15a1d45267abb2f6790267930d"))); } /** @@ -266,7 +269,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testReadOffContig() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4dc66445ddf5f467f67860bed023ef8"))); + executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("388ef48791965d637e4bdb45d5d7cf01"))); } /** @@ -276,8 +279,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testPairedReadsInVariantRegion() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, 
BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + " -o %s --downsample_coverage 250 -dcov 50 "; - // don't use quals test here (there's one location with low quals that won't pass; signed off by EB) - executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9bed260b6245f5ff47db8541405504aa")), true); + executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("cfa2588f5edf74c5ddf3d190f5ac6f2d"))); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java index 15b79b78a..6032affa7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java @@ -46,9 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import it.unimi.dsi.fastutil.objects.*; import net.sf.samtools.SAMFileHeader; import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; @@ -58,6 +56,7 @@ import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -198,6 +197,7 @@ public class ReduceReadsUnitTest extends BaseTest { final ReduceReads rr 
= new ReduceReads(); RodBinding.resetNameCounter(); rr.known = Arrays.>asList(new RodBinding(VariantContext.class, "known")); + rr.knownSnpPositions = new ObjectAVLTreeSet(); final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java index f081b9f8a..4bf67f5a2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -200,17 +200,16 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(enabled = true) public void testMarkVariantRegion() { final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - SlidingWindow.MarkedSites markedSites = slidingWindow.new MarkedSites(); - markedSites.updateRegion(100, 100); + slidingWindow.getMarkedSitesForTesting().updateRegion(100, 100); - slidingWindow.markVariantRegion(markedSites, 40); - Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 21); + slidingWindow.markVariantRegion(40); + Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 21); - slidingWindow.markVariantRegion(markedSites, 5); - Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 37); + slidingWindow.markVariantRegion(5); + Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 37); - slidingWindow.markVariantRegion(markedSites, 95); - Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 52); + slidingWindow.markVariantRegion(95); + 
Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 52); } private static int countTrueBits(final boolean[] bitset) { @@ -254,10 +253,12 @@ public class SlidingWindowUnitTest extends BaseTest { private class ConsensusCreationTest { public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression; public final List myReads = new ArrayList(20); + public final String description; private ConsensusCreationTest(final List locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) { this.expectedNumberOfReads = expectedNumberOfReads; this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; + this.description = String.format("%d %d", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression); // first, add the basic reads to the collection myReads.addAll(basicReads); @@ -270,6 +271,7 @@ public class SlidingWindowUnitTest extends BaseTest { private ConsensusCreationTest(final List locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) { this.expectedNumberOfReads = expectedNumberOfReads; this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; + this.description = String.format("%s %d %d", operator.toString(), expectedNumberOfReads, expectedNumberOfReadsWithHetCompression); // first, add the basic reads to the collection myReads.addAll(basicReads); @@ -279,6 +281,8 @@ public class SlidingWindowUnitTest extends BaseTest { myReads.add(createVariantRead(loc, false, false, operator)); } + public String toString() { return description; } + private GATKSAMRecord createVariantRead(final GenomeLoc loc, final boolean readShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final CigarOperator operator) { @@ -315,7 +319,6 @@ public class SlidingWindowUnitTest extends 
BaseTest { private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295); private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309); private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310); - private static final GenomeLoc loc312 = new UnvalidatingGenomeLoc("1", 0, 1000312, 1000312); private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100); @DataProvider(name = "ConsensusCreation") @@ -328,7 +331,6 @@ public class SlidingWindowUnitTest extends BaseTest { tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 11, 11)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc312), false, false, 11, 8)}); // test low quality reads tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 1, 1)}); @@ -346,7 +348,7 @@ public class SlidingWindowUnitTest extends BaseTest { // test mixture tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), true, false, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 3, 3)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 1, 1)}); // test I/D operators tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 9)}); @@ -370,17 +372,22 @@ public class SlidingWindowUnitTest extends BaseTest { for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads); - // test WITH het compression + // test WITH het 
compression at KNOWN sites slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); for ( int i = 0; i < 1200; i++ ) knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); result = slidingWindow.close(knownSNPs); + Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); + // test WITH het compression at ALL sites + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : test.myReads ) + slidingWindow.addRead(read); + result = slidingWindow.close(null); Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); } @@ -405,21 +412,26 @@ public class SlidingWindowUnitTest extends BaseTest { final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); // test WITHOUT het compression - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : myReads ) slidingWindow.addRead(read); Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty - Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all - // test WITH het compression - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + // test WITH het compression at KNOWN sites + slidingWindow = 
new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : myReads ) slidingWindow.addRead(read); for ( int i = 0; i < readLength; i++ ) knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); result = slidingWindow.close(knownSNPs); + Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all + // test WITH het compression at ALL sites + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + result = slidingWindow.close(knownSNPs); Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all } @@ -692,7 +704,7 @@ public class SlidingWindowUnitTest extends BaseTest { slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips); } - final boolean result = slidingWindow.hasSignificantSoftclipPosition(windowHeader, currentHeaderStart + indexToSkip); + final boolean result = slidingWindow.hasPositionWithSignificantSoftclipsOrVariant(windowHeader, currentHeaderStart + indexToSkip); Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index ebbc3945f..8f8ec10f5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -329,8 +329,8 @@ public class MathUtils { /** * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. 
* - * @param start - start of the cumulant sum (over hits) - * @param end - end of the cumulant sum (over hits) + * @param start - start (inclusive) of the cumulant sum (over hits) + * @param end - end (exclusive) of the cumulant sum (over hits) * @param total - number of attempts for the number of hits * @param probHit - probability of a successful hit * @return - returns the cumulative probability From 5bce0e086e6942450b4f276668d8c1795b41f378 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 9 Apr 2013 15:37:08 -0400 Subject: [PATCH 189/226] Refactored binomial probability code in MathUtils. * Moved redundant code out of UGEngine * Added overloaded methods that assume p=0.5 for speed efficiency * Added unit test for the binomialCumulativeProbability method --- .../reducereads/HeaderElement.java | 7 +- .../genotyper/UnifiedGenotyperEngine.java | 14 +- .../reducereads/HeaderElementUnitTest.java | 2 +- .../broadinstitute/sting/utils/MathUtils.java | 144 ++++++++++-------- .../sting/utils/MathUtilsUnitTest.java | 16 ++ 5 files changed, 106 insertions(+), 77 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 616388e8c..dec323213 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -253,7 +253,7 @@ public class HeaderElement { final int totalCount = consensusBaseCounts.totalCountWithoutIndels(); final BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); final int countOfOtherBases = totalCount - consensusBaseCounts.countOfBase(mostCommon); - final double pvalue = countOfOtherBases == 0 ? 
0.0 : MathUtils.binomialCumulativeProbability(0, countOfOtherBases+1, totalCount, 0.5); + final double pvalue = countOfOtherBases == 0 ? 0.0 : MathUtils.binomialCumulativeProbability(totalCount, 0, countOfOtherBases); return pvalue > minVariantPvalue; } @@ -301,7 +301,7 @@ public class HeaderElement { if ( baseCount == 0 ) continue; - final double pvalue = MathUtils.binomialCumulativeProbability(0, baseCount+1, totalBaseCount, 0.5); + final double pvalue = MathUtils.binomialCumulativeProbability(totalBaseCount, 0, baseCount); if ( pvalue > minVariantPvalue ) { if ( base == BaseIndex.D ) @@ -334,7 +334,8 @@ public class HeaderElement { if ( count == 0 || totalBaseCount == 0 ) return false; - final double pvalue = MathUtils.binomialCumulativeProbability(0, count+1, totalBaseCount, 0.5); + // technically, count can be greater than totalBaseCount (because of the way insertions are counted) so we need to account for that + final double pvalue = MathUtils.binomialCumulativeProbability(totalBaseCount, 0, Math.min(count, totalBaseCount)); return pvalue > minVariantPvalue; } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 4e13e0d9d..55db44052 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -610,20 +610,8 @@ public class UnifiedGenotyperEngine { return stratifiedContexts; } - private final static double[] binomialProbabilityDepthCache = new double[10000]; - private final static double REF_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); - - static { - for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) { - binomialProbabilityDepthCache[i] = MathUtils.log10BinomialProbability(i, 0, REF_BINOMIAL_PROB_LOG10_0_5); - } - } 
- private final double getRefBinomialProbLog10(final int depth) { - if ( depth < binomialProbabilityDepthCache.length ) - return binomialProbabilityDepthCache[depth]; - else - return MathUtils.log10BinomialProbability(depth, 0, REF_BINOMIAL_PROB_LOG10_0_5); + return MathUtils.log10BinomialProbability(depth, 0); } private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java index 435c1029c..d73a71855 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java @@ -195,7 +195,7 @@ public class HeaderElementUnitTest extends BaseTest { if ( count == 0 ) continue; - final double pvalue = MathUtils.binomialCumulativeProbability(0, count + 1, total, 0.5); + final double pvalue = MathUtils.binomialCumulativeProbability(total, 0, count); if ( pvalue > targetPvalue ) { if ( index == BaseIndex.D.index ) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 8f8ec10f5..d382d804f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -63,6 
+62,7 @@ public class MathUtils { * where the real-space value is 0.0. */ public final static double LOG10_P_OF_ZERO = -1000000.0; + public final static double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); static { log10Cache = new double[LOG10_CACHE_SIZE]; @@ -70,6 +70,7 @@ public class MathUtils { jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; log10Cache[0] = Double.NEGATIVE_INFINITY; + log10FactorialCache[0] = 0.0; for (int k = 1; k < LOG10_CACHE_SIZE; k++) { log10Cache[k] = Math.log10(k); log10FactorialCache[k] = log10FactorialCache[k-1] + log10Cache[k]; @@ -306,10 +307,25 @@ public class MathUtils { return a * b; } + /** + * Calculates the log10 of the binomial coefficient. Designed to prevent + * overflows even with very large numbers. + * + * @param n total number of trials + * @param k number of successes + * @return the log10 of the binomial coefficient + */ public static double binomialCoefficient(final int n, final int k) { return Math.pow(10, log10BinomialCoefficient(n, k)); } + /** + * @see #binomialCoefficient(int, int) with log10 applied to result + */ + public static double log10BinomialCoefficient(final int n, final int k) { + return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); + } + /** * Computes a binomial probability. This is computed using the formula *

        @@ -326,23 +342,48 @@ public class MathUtils { return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); } + /** + * @see #binomialProbability(int, int, double) with log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k, final double log10p) { + double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); + return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 + */ + public static double binomialProbability(final int n, final int k) { + return Math.pow(10, log10BinomialProbability(n, k)); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 and log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k) { + return log10BinomialCoefficient(n, k) + (n * FAIR_BINOMIAL_PROB_LOG10_0_5); + } + /** * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. + * Assumes that the probability of a successful hit is fair (i.e. 0.5). 
* - * @param start - start (inclusive) of the cumulant sum (over hits) - * @param end - end (exclusive) of the cumulant sum (over hits) - * @param total - number of attempts for the number of hits - * @param probHit - probability of a successful hit + * @param n number of attempts for the number of hits + * @param k_start start (inclusive) of the cumulant sum (over hits) + * @param k_end end (inclusive) of the cumulant sum (over hits) * @return - returns the cumulative probability */ - public static double binomialCumulativeProbability(final int start, final int end, final int total, final double probHit) { + public static double binomialCumulativeProbability(final int n, final int k_start, final int k_end) { + if ( k_end > n ) + throw new IllegalArgumentException(String.format("Value for k_end (%d) is greater than n (%d)", k_end, n)); + double cumProb = 0.0; double prevProb; BigDecimal probCache = BigDecimal.ZERO; - for (int hits = start; hits < end; hits++) { + for (int hits = k_start; hits <= k_end; hits++) { prevProb = cumProb; - double probability = binomialProbability(total, hits, probHit); + double probability = binomialProbability(n, hits); cumProb += probability; if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision probCache = probCache.add(new BigDecimal(prevProb)); @@ -355,6 +396,41 @@ public class MathUtils { return probCache.add(new BigDecimal(cumProb)).doubleValue(); } + /** + * Calculates the log10 of the multinomial coefficient. Designed to prevent + * overflows even with very large numbers. 
+ * + * @param n total number of trials + * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) + * @return + */ + public static double log10MultinomialCoefficient(final int n, final int[] k) { + double denominator = 0.0; + for (int x : k) { + denominator += log10Factorial(x); + } + return log10Factorial(n) - denominator; + } + + /** + * Computes the log10 of the multinomial distribution probability given a vector + * of log10 probabilities. Designed to prevent overflows even with very large numbers. + * + * @param n number of trials + * @param k array of number of successes for each possibility + * @param log10p array of log10 probabilities + * @return + */ + public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { + if (log10p.length != k.length) + throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); + double log10Prod = 0.0; + for (int i = 0; i < log10p.length; i++) { + log10Prod += log10p[i] * k[i]; + } + return log10MultinomialCoefficient(n, k) + log10Prod; + } + /** * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. * This is computed using the formula: @@ -1120,58 +1196,6 @@ public class MathUtils { return lnToLog10(lnGamma(x)); } - /** - * Calculates the log10 of the binomial coefficient. Designed to prevent - * overflows even with very large numbers. 
- * - * @param n total number of trials - * @param k number of successes - * @return the log10 of the binomial coefficient - */ - public static double log10BinomialCoefficient(final int n, final int k) { - return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); - } - - public static double log10BinomialProbability(final int n, final int k, final double log10p) { - double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); - return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); - } - - /** - * Calculates the log10 of the multinomial coefficient. Designed to prevent - * overflows even with very large numbers. - * - * @param n total number of trials - * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) - * @return - */ - public static double log10MultinomialCoefficient(final int n, final int[] k) { - double denominator = 0.0; - for (int x : k) { - denominator += log10Factorial(x); - } - return log10Factorial(n) - denominator; - } - - /** - * Computes the log10 of the multinomial distribution probability given a vector - * of log10 probabilities. Designed to prevent overflows even with very large numbers. 
- * - * @param n number of trials - * @param k array of number of successes for each possibility - * @param log10p array of log10 probabilities - * @return - */ - public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { - if (log10p.length != k.length) - throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); - double log10Prod = 0.0; - for (int i = 0; i < log10p.length; i++) { - log10Prod += log10p[i] * k[i]; - } - return log10MultinomialCoefficient(n, k) + log10Prod; - } - public static double factorial(final int x) { // avoid rounding errors caused by fact that 10^log(x) might be slightly lower than x and flooring may produce 1 less than real value return (double)Math.round(Math.pow(10, log10Factorial(x))); diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 2560bcd11..27af8ec68 100644 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -56,6 +56,22 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.binomialProbability(300, 112, 0.98), 2.34763e-236, 1e-237); } + /** + * Tests that we get the right values from the binomial distribution + */ + @Test + public void testCumulativeBinomialProbability() { + logger.warn("Executing testCumulativeBinomialProbability"); + + final int numTrials = 10; + for ( int i = 0; i < numTrials; i++ ) + Assert.assertEquals(MathUtils.binomialCumulativeProbability(numTrials, i, i), MathUtils.binomialProbability(numTrials, i), 1e-10, String.format("k=%d, n=%d", i, numTrials)); + + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 2), 0.05468750, 1e-7); + 
Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 5), 0.62304687, 1e-7); + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 10), 1.0, 1e-7); + } + /** * Tests that we get the right values from the multinomial distribution */ From f0e64850da5bb79013245d28299aeee638c33502 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Apr 2013 10:30:37 -0400 Subject: [PATCH 190/226] Two sensitivity / specificity improvements to the haplotype caller -- Reduce the min read length to 10 bp in the filterNonPassingReads in the HC. Now that we filter out reads before genotyping, we have to be more tolerant of shorter, but informative, reads, in order to avoid a few FNs in shallow read data -- Reduce the min usable base qual to 8 by default in the HC. In regions with low coverage we sometimes throw out our only informative kmers because we required a contiguous run of bases with >= 16 QUAL. This is a bit too aggressive of a requirement, so I lowered it to 8. -- Together with the previous commit this results in a significant improvement in the sensitivity and specificity of the caller NA12878 MEM chr20:10-11 Name VariantType TRUE_POSITIVE FALSE_POSITIVE FALSE_NEGATIVE TRUE_NEGATIVE CALLED_NOT_IN_DB_AT_ALL branch SNPS 1216 0 2 194 0 branch INDELS 312 2 13 71 7 master SNPS 1214 0 4 194 1 master INDELS 309 2 16 71 10 -- Update MD5s in the integration tests to reflect these two new changes --- .../walkers/haplotypecaller/HaplotypeCaller.java | 2 +- .../haplotypecaller/LocalAssemblyEngine.java | 2 +- ...omplexAndSymbolicVariantsIntegrationTest.java | 6 +++--- .../HaplotypeCallerIntegrationTest.java | 16 ++++++++-------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 2ecc152df..a17e25f41 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -845,7 +845,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { final List readsToRemove = new ArrayList(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { - if( rec.getReadLength() < 24 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { + if( rec.getReadLength() < 10 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { readsToRemove.add(rec); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 23cbc3265..4c0483ad6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -60,7 +60,7 @@ import java.util.List; * Date: Mar 14, 2011 */ public abstract class LocalAssemblyEngine { - public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 16; + public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8; protected PrintStream graphWriter = null; protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index f8580f271..57d8aa92c 
100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "c0379d32c8c743d84c6da5956d67c004"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "57e13aed8dc483514ac15fb757aee1d1"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "2fb56d241baca3658af5811e680bde4c"); + "d89c8a32e9c54f66e0331382cac86b27"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "bd7d24e87776f939b36742c1fd33b25c"); + "89a28d4290523dd55117bc4e44212d73"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a77304e57..4e291cb59 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", 
"943302eb9b9798d1ffeb9136612cbc85"); + HCTest(CEUTRIO_BAM, "", "aeab5f0d40852e6332b96481981a0e46"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "3199bebe4e34b5df7558f74b05fb3a4e"); + HCTest(NA12878_BAM, "", "c1530f2158cb41d50e830ca5be0f97a0"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "aef51f79d58634e4b35a1a98caba329c"); + "3e2e4a62c6c60d432fa1ca32aee2635b"); } @Test @@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5ac0d4b30a0c9a97a71ad014e63f11cf"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "bac6f98e910290722df28da44b41f06f"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "a7e3b05fdc9866965e3ab71dbbd288ff"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "65e7b1b72a2411d6360138049914aa3a"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, 
Arrays.asList("8252f956e94cb8538b18210e9350f0e3")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ab518ae32535714604a4ffc71fe42511")); executeTest("HCTestStructuralIndels: ", spec); } @@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("7d4da215e86658e8da70fa0ade7f3eca")); + Arrays.asList("3c87eb93ffe3a0166aca753050b981e1")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("b0f0467dd4bfc4cdc85fff85ffa6f0c1")); + Arrays.asList("8adfa8a27a312760dab50787da595c57")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } From 3477e092ea59640289de13e3b9720f5ea5d067fc Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 19 Apr 2013 08:39:08 -0400 Subject: [PATCH 191/226] Minor: bump up the amount of cached log10 data in MathUtils so that Monkol can actually call 50K samples. 
--- public/java/src/org/broadinstitute/sting/utils/MathUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index d382d804f..f4644036f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -54,7 +54,7 @@ public class MathUtils { private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; private static final double MAX_JACOBIAN_TOLERANCE = 8.0; private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; - private static final int MAXN = 50000; + private static final int MAXN = 70000; private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients /** From be66049a6fd9ce7eb1494d40c57d9d1a20dd151b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Apr 2013 15:42:26 -0400 Subject: [PATCH 193/226] Bugfix for CommonSuffixSplitter -- The problem is that the common suffix splitter could eliminate the reference source vertex when there's an incoming node that contains all of the reference source vertex bases and then some additional prefix bases. In this case we'd eliminate the reference source vertex. 
Fixed by checking for this condition and aborting the simplification -- Update MD5s, including minor improvements --- .../graphs/CommonSuffixSplitter.java | 17 +++++++++++++++++ ...mplexAndSymbolicVariantsIntegrationTest.java | 6 +++--- .../HaplotypeCallerIntegrationTest.java | 4 ++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java index e37fbb281..0665186c6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitter.java @@ -100,6 +100,8 @@ public class CommonSuffixSplitter { final SeqVertex suffixVTemplate = commonSuffix(toSplit); if ( suffixVTemplate.isEmpty() ) { return false; + } else if ( wouldEliminateRefSource(graph, suffixVTemplate, toSplit) ) { + return false; } else if ( allVerticesAreTheCommonSuffix(suffixVTemplate, toSplit) ) { return false; } else { @@ -141,6 +143,21 @@ public class CommonSuffixSplitter { } } + /** + * Would factoring out this suffix result in elimating the reference source vertex? 
+ * @param graph the graph + * @param commonSuffix the common suffix of all toSplits + * @param toSplits the list of vertices we're are trying to split + * @return true if toSplit contains the reference source and this ref source has all and only the bases of commonSuffix + */ + private boolean wouldEliminateRefSource(final SeqGraph graph, final SeqVertex commonSuffix, final Collection toSplits) { + for ( final SeqVertex toSplit : toSplits ) { + if ( graph.isRefSource(toSplit) ) + return toSplit.length() == commonSuffix.length(); + } + return false; + } + // private static int counter = 0; /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 57d8aa92c..17f04971b 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "57e13aed8dc483514ac15fb757aee1d1"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd51f8c7235eb6547b678093c7a01089"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "d89c8a32e9c54f66e0331382cac86b27"); + "ed3b577e6f7d68bba6774a62d9df9cd9"); } @Test public 
void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "89a28d4290523dd55117bc4e44212d73"); + "a594a28d8053c3e969c39de81a9d03d6"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 4e291cb59..500db6ae9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "3e2e4a62c6c60d432fa1ca32aee2635b"); + "28c3b1f276ec8198801aafe880e40fb6"); } @Test @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ab518ae32535714604a4ffc71fe42511")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("cac0d88fa4471c7a0ac96533a9a6354b")); executeTest("HCTestStructuralIndels: ", spec); } From cf7afc1ad4476d069744858b2bbbcf9bb6a650e7 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 20 Apr 2013 00:28:35 -0400 Subject: [PATCH 194/226] Fixed "skipped intervals" bug on DiagnoseTargets Problem ------- Diagnose targets was skipping intervals 
when they were not covered by any reads. Solution -------- Rework the interval iteration logic to output all intervals as they're skipped over by the traversal, as well as adding a loop on traversal done to finish outputting intervals past the coverage of the BAM file. Summarized Changes ------------------ * Outputs all intervals it iterates over, even if uncovered * Outputs leftover intervals in the end of the traversal * Updated integration tests [fixes #47813825] --- .../diagnostics/targets/DiagnoseTargets.java | 104 ++++++++---------- .../DiagnoseTargetsIntegrationTest.java | 4 +- 2 files changed, 49 insertions(+), 59 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index b302a967c..33c7e7a1b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -152,11 +152,14 @@ public class DiagnoseTargets extends LocusWalker { @Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false) private boolean debug = false; - private HashMap intervalMap = null; // maps each interval => statistics - private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome - private Set samples = null; // all the samples being processed - private final Allele SYMBOLIC_ALLELE = Allele.create("

        ", false); // avoid creating the symbolic allele multiple times - private ThresHolder thresholds = null; + private Map intervalMap = null; // maps each interval => statistics + private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome + private Set samples = null; // all the samples being processed + private static final Allele SYMBOLIC_ALLELE = Allele.create("
        ", false); // avoid creating the symbolic allele multiple times + private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times + private ThresHolder thresholds = null; // object that holds all the thresholds for Diagnose Targets (todo -- should become a plugin based system) + + private static final int INITIAL_HASH_SIZE = 500000; @Override public void initialize() { @@ -165,24 +168,32 @@ public class DiagnoseTargets extends LocusWalker { if (getToolkit().getIntervals() == null) throw new UserException("This tool only works if you provide one or more intervals. ( Use the -L argument )"); - thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold); + thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, + minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, + badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, + qualityStatusThreshold); - intervalMap = new HashMap(); + intervalMap = new HashMap(INITIAL_HASH_SIZE); intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header - vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); // initialize the VCF header + // get all of the unique sample names for the VCF Header + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); } @Override public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { GenomeLoc 
refLocus = ref.getLocus(); - removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore - addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus + // process and remove any intervals in the map that are don't overlap the current locus anymore + // and add all new intervals that may overlap this reference locus + outputFinishedIntervals(refLocus, ref.getBase()); + addNewOverlappingIntervals(refLocus); + // at this point, all intervals in intervalMap overlap with this locus, so update all of them for (IntervalStatistics intervalStatistics : intervalMap.values()) - intervalStatistics.addLocus(context, ref, thresholds); // Add current locus to stats + intervalStatistics.addLocus(context, ref, thresholds); + return 1L; } @@ -212,53 +223,40 @@ public class DiagnoseTargets extends LocusWalker { @Override public void onTraversalDone(Long result) { for (GenomeLoc interval : intervalMap.keySet()) - outputStatsToVCF(intervalMap.get(interval), Allele.create("A", true)); - } + outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE); - private GenomeLoc getIntervalMapSpan() { - GenomeLoc loc = null; - for (GenomeLoc interval : intervalMap.keySet()) { - if (loc == null) { - loc = interval; - } else - loc = interval.union(loc); + GenomeLoc interval = intervalListIterator.peek(); + while (interval != null) { + outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE); + intervalListIterator.next(); + interval = intervalListIterator.peek(); } - - return loc; - } - - private GenomeLoc getFinishedIntervalSpan(GenomeLoc pos) { - GenomeLoc loc = null; - for (GenomeLoc interval : intervalMap.keySet()) { - if (interval.isBefore(pos)) { - if (loc == null) - loc = interval; - else - loc = interval.union(loc); - } - } - - return loc; } /** - * Removes all intervals that are behind the current reference locus from the intervalMap + * Outputs all 
intervals that are behind the current reference locus * * @param refLocus the current reference locus * @param refBase the reference allele */ - private void removePastIntervals(GenomeLoc refLocus, byte refBase) { - // if there are statistics to output/ check to see that we can output them in order - if (getFinishedIntervalSpan(refLocus) != null && - getIntervalMapSpan().getStart() == getFinishedIntervalSpan(refLocus).getStart()) { + private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) { + GenomeLoc interval = intervalListIterator.peek(); - for (GenomeLoc interval : intervalMap.keySet()) { - if (interval.isBefore(refLocus)) { - outputStatsToVCF(intervalMap.get(interval), Allele.create(refBase, true)); - intervalMap.remove(interval); - } + // output empty statistics for uncovered intervals + while (interval != null && interval.isBefore(refLocus)) { + final IntervalStatistics stats = intervalMap.get(interval); + outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE); + if (stats != null) intervalMap.remove(interval); + intervalListIterator.next(); + interval = intervalListIterator.peek(); + } + + // remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals) + for (GenomeLoc key : intervalMap.keySet()) { + if (key.isBefore(refLocus)) { + outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true)); + intervalMap.remove(key); } - } } @@ -269,17 +267,9 @@ public class DiagnoseTargets extends LocusWalker { */ private void addNewOverlappingIntervals(GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); - - // skip any intervals with no coverage that we have passed - while (interval != null && interval.isBefore(refLocus)) { - intervalListIterator.next(); // discard the interval (we've already added it to the map) - interval = intervalListIterator.peek(); - } - - // add any intervals that overlap this one while (interval != 
null && !interval.isPast(refLocus)) { intervalMap.put(interval, createIntervalStatistic(interval)); - intervalListIterator.next(); // discard the interval (we've already added it to the map) + intervalListIterator.next(); interval = intervalListIterator.peek(); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index 6a52a42e5..2875e10d7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9954b21163d3e66db232938ec509067f"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9b51561bcf248da70a4d711380b04f7b"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "7c5277261e8e9dd74666f04843ffb09c"); + DTTest("testMultiSample ", "-I " + multiSample, "925f88f0c41c6a9ac479be34e052dc5d"); } } From 2b923f1568329ce3af4d1953746e74e327ea0c44 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 20 Apr 2013 13:05:14 -0400 Subject: [PATCH 195/226] fix for DiagnoseTargets multiple filter output Problem ------- Diagnose targets is outputting both LOW_MEDIAN_COVERAGE and NO_READS when no reads are covering the interval Solution -------- Only allow low median coverage check if there are reads [fixes #48442675] --- .../targets/IntervalStatistics.java | 20 ++++++++++++++----- .../diagnostics/targets/SampleStatistics.java | 2 +- .../DiagnoseTargetsIntegrationTest.java | 4 ++-- 3 files changed, 18 insertions(+), 8 deletions(-) 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 0aea54fa0..0a6b73dae 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -148,24 +148,34 @@ class IntervalStatistics { votes.put(status, votes.get(status) + 1); // output tall values above the threshold + final double minVotesNeeded = thresholds.getVotePercentageThreshold() * samples.size(); for (CallableStatus status : votes.keySet()) { - if (votes.get(status) > (samples.size() * thresholds.getVotePercentageThreshold()) && !(status.equals(CallableStatus.PASS))) + if (!status.equals((CallableStatus.PASS)) && votes.get(status) > minVotesNeeded) output.add(status); } - if (hasNref) output.add(CallableStatus.REF_N); // get median DP of each sample + final double minMedianDepth = thresholds.getLowMedianDepthThreshold() * samples.size(); + final int nSamples = samples.size(); int nLowMedianDepth = 0; + int samplesSeen = 0; for (SampleStatistics sample : samples.values()) { - if (sample.getQuantileDepth(0.5) < thresholds.getMinimumMedianDepth()) + samplesSeen++; + final double medianDepth = sample.getQuantileDepth(0.5); + if (medianDepth > 0 && medianDepth < thresholds.getMinimumMedianDepth()) { nLowMedianDepth++; + } + if (nLowMedianDepth > minMedianDepth) { + output.add(CallableStatus.LOW_MEDIAN_DEPTH); + break; + } + if (nSamples - samplesSeen + nLowMedianDepth < minMedianDepth) + break; } - if (nLowMedianDepth > (samples.size() * thresholds.getLowMedianDepthThreshold())) - output.add(CallableStatus.LOW_MEDIAN_DEPTH); return output; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java index ad9f287d2..9efdbefe1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -101,7 +101,7 @@ class SampleStatistics { * @return the callable statuses of the entire sample */ public Set getCallableStatuses(ThresHolder thresholds) { - // We check if reads are present ot prevent div / 0 exceptions + // We check if reads are present to prevent div / 0 exceptions if (nReads == 0) { return Collections.singleton(CallableStatus.NO_READS); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index 2875e10d7..a9330f2dd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9b51561bcf248da70a4d711380b04f7b"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "bd614643284a849724bf8ee6bc4df8bf"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "925f88f0c41c6a9ac479be34e052dc5d"); + DTTest("testMultiSample ", "-I " + multiSample, "145f5d4641abfdeadbc59ee74ce1560f"); } } From b3c0abd9e8bc2c8565afcf1eb90b6d8633aa3d67 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 20 Apr 2013 14:42:33 -0400 Subject: [PATCH 196/226] Remove REF_N status from 
DiagnoseTargets This is not really feasible with the current mandate of this walker. We would have to traverse by reference and that would make the runtime much higher, and we are not really interested in the status 99% of the time anyway. There are other walkers that can report this, and just this, status more cheaply. [fixes #48442663] --- .../gatk/walkers/diagnostics/targets/CallableStatus.java | 2 -- .../gatk/walkers/diagnostics/targets/IntervalStatistics.java | 3 --- .../gatk/walkers/diagnostics/targets/SampleStatistics.java | 4 ---- .../diagnostics/targets/DiagnoseTargetsIntegrationTest.java | 4 ++-- 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java index 4bc318b02..959c002ad 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java @@ -54,8 +54,6 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; */ public enum CallableStatus { - REF_N("the reference base was an N, which is not considered callable the GATK"), - PASS("the base satisfied the min. 
depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"), COVERAGE_GAPS("absolutely no coverage was observed at a locus, regardless of the filtering parameters"), diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 0a6b73dae..4fd9a20ef 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -154,9 +154,6 @@ class IntervalStatistics { output.add(status); } - if (hasNref) - output.add(CallableStatus.REF_N); - // get median DP of each sample final double minMedianDepth = thresholds.getLowMedianDepthThreshold() * samples.size(); final int nSamples = samples.size(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java index 9efdbefe1..051369b94 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -139,10 +139,6 @@ class SampleStatistics { if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) >= thresholds.getQualityStatusThreshold()) output.add(CallableStatus.POOR_QUALITY); - if (totals.get(CallableStatus.REF_N) > 0) - output.add(CallableStatus.REF_N); - - if (output.isEmpty()) { output.add(CallableStatus.PASS); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index a9330f2dd..a435a33ad 100644 
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "bd614643284a849724bf8ee6bc4df8bf"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "3d558ec8828c269774ee45e5df086a5f"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "145f5d4641abfdeadbc59ee74ce1560f"); + DTTest("testMultiSample ", "-I " + multiSample, "d40cf1f1daf68f2740cd411e2cf361fc"); } } From eb6308a0e4eeaf2f06bfd3cd9e0d9c219b18140f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 20 Apr 2013 18:34:03 -0400 Subject: [PATCH 197/226] General DiagnoseTargets documentation cleanup * remove interval statistic low_median_coverage -- it is already captured by low coverage and coverage gaps. 
* add gatkdocs to all the parameters * clean up the logic on callable status a bit (still need to be re-worked into a plugin system) * update integration tests --- .../diagnostics/targets/CallableStatus.java | 7 +-- .../diagnostics/targets/DiagnoseTargets.java | 57 +++++++++++++------ .../targets/IntervalStatistics.java | 20 ------- .../diagnostics/targets/SampleStatistics.java | 10 +--- .../diagnostics/targets/ThresHolder.java | 37 ++++-------- .../DiagnoseTargetsIntegrationTest.java | 4 +- .../targets/LocusStatisticsUnitTest.java | 2 +- .../targets/SampleStatisticsUnitTest.java | 4 +- 8 files changed, 60 insertions(+), 81 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java index 959c002ad..32c0c339d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java @@ -66,12 +66,7 @@ public enum CallableStatus { BAD_MATE("the reads are not properly mated, suggesting mapping errors"), - NO_READS("there are no reads contained in the interval"), - - // - // Interval-level statuses - // - LOW_MEDIAN_DEPTH("interval has insufficient median depth across samples"); + NO_READS("there are no reads contained in the interval"); public final String description; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index 33c7e7a1b..5bdb81906 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -48,6 +48,7 @@ package 
org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -110,45 +111,71 @@ import java.util.*; @PartitionBy(PartitionType.INTERVAL) public class DiagnoseTargets extends LocusWalker { - @Output(doc = "File to which variants should be written") + @Output(doc = "File to which interval statistics should be written") private VariantContextWriter vcfWriter = null; + /** + * Only bases with quality greater than this will be considered in the coverage metrics. + */ @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false) private int minimumBaseQuality = 20; + /** + * Only reads with mapping quality greater than this will be considered in the coverage metrics. 
+ */ @Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false) private int minimumMappingQuality = 20; + /** + * If at any locus, a sample has less coverage than this, it will be reported as LOW_COVERAGE + */ @Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false) private int minimumCoverage = 5; + /** + * If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE + */ @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false) private int maximumCoverage = 700; - @Argument(fullName = "minimum_median_depth", shortName = "med", doc = "The minimum allowable median coverage, used for calling LOW_MEDIAN_DEPTH", required = false) - private int minMedianDepth = 10; - + /** + * If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be reported as BAD_MATE + */ @Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false) private int maxInsertSize = 500; - @Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed percentage of samples containing a call for the interval to adopt the call ", required = false) + /** + * The proportion of samples that must have a status for it to filter the entire interval. Example: 8 out of 10 samples have low coverage status on the interval, + * with a threshold higher than 0.2, this interval will be filtered as LOW_COVERAGE. 
+ */ + @Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed proportion of samples containing a call for the interval to adopt the call ", required = false) private double votePercentage = 0.50; - @Argument(fullName = "low_median_depth_status_threshold", shortName = "stMED", doc = "The percentage of the loci needed for calling LOW_MEDIAN_DEPTH", required = false) - private double lowMedianDepthPercentage = 0.20; - - @Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The percentage of the loci needed for calling BAD_MATE", required = false) + /** + * The proportion of reads in the loci that must have bad mates for the sample to be reported as BAD_MATE + */ + @Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The proportion of the loci needed for calling BAD_MATE", required = false) private double badMateStatusThreshold = 0.50; - @Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The percentage of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false) + /** + * The proportion of loci in a sample that must fall under the LOW_COVERAGE or COVERAGE_GAPS category for the sample to be reported as either (or both) + */ + @Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The proportion of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false) private double coverageStatusThreshold = 0.20; - @Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The percentage of the loci needed for calling EXCESSIVE_COVERAGE", required = false) + /** + * The proportion of loci in a sample that must fall under the EXCESSIVE_COVERAGE category for the sample to be reported as EXCESSIVE_COVERAGE + */ + @Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The proportion of the loci needed for calling EXCESSIVE_COVERAGE", required = false) private double 
excessiveCoverageThreshold = 0.20; - @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The percentage of the loci needed for calling POOR_QUALITY", required = false) + /** + * The proportion of loci in a sample that must fall under the LOW_QUALITY category for the sample to be reported as LOW_QUALITY + */ + @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false) private double qualityStatusThreshold = 0.50; + @Hidden @Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false) private boolean debug = false; @@ -168,10 +195,8 @@ public class DiagnoseTargets extends LocusWalker { if (getToolkit().getIntervals() == null) throw new UserException("This tool only works if you provide one or more intervals. ( Use the -L argument )"); - thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, - minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, - badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, - qualityStatusThreshold); + thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, maxInsertSize, votePercentage, + badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold); intervalMap = new HashMap(INITIAL_HASH_SIZE); intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 4fd9a20ef..0f4b33747 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -154,26 +154,6 @@ class IntervalStatistics { output.add(status); } - // get median DP of each sample - final double minMedianDepth = thresholds.getLowMedianDepthThreshold() * samples.size(); - final int nSamples = samples.size(); - int nLowMedianDepth = 0; - int samplesSeen = 0; - for (SampleStatistics sample : samples.values()) { - samplesSeen++; - final double medianDepth = sample.getQuantileDepth(0.5); - if (medianDepth > 0 && medianDepth < thresholds.getMinimumMedianDepth()) { - nLowMedianDepth++; - } - if (nLowMedianDepth > minMedianDepth) { - output.add(CallableStatus.LOW_MEDIAN_DEPTH); - break; - } - if (nSamples - samplesSeen + nLowMedianDepth < minMedianDepth) - break; - } - - return output; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java index 051369b94..afde93ea3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -293,17 +293,11 @@ class SampleStatistics { if (read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag()) return false; - // inverted - if (read.getReadNegativeStrandFlag() == - read.getAlignmentStart() < read.getMateAlignmentStart()) - return false; + // todo -- inverted ? 
- // TODO note: IGV uses a different algorithm for insert size, there should be a common util class that does this for you // mates are too far apart - if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > thresholds.getMaximumInsertSize()) - return false; + return Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) <= thresholds.getMaximumInsertSize(); - return true; } public int getnReads() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java index fc4954f3b..c2dd2f4ff 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java @@ -54,44 +54,38 @@ import java.util.Set; class ThresHolder { public static final String AVG_INTERVAL_DP_KEY = "AVG_INTERVAL_DP"; - public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5); + public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 50, 0.5, 0.5, 0.2, 0.2, 0.5); private final int minimumBaseQuality; private final int minimumMappingQuality; private final int minimumCoverage; private final int maximumCoverage; - private final int minimumMedianDepth; private final int maximumInsertSize; private final double votePercentageThreshold; - private final double lowMedianDepthThreshold; private final double badMateStatusThreshold; private final double coverageStatusThreshold; private final double excessiveCoverageThreshold; private final double qualityStatusThreshold; - public ThresHolder(int minimumBaseQuality, - int minimumMappingQuality, - int minimumCoverage, - int maximumCoverage, - int minimumMedianDepth, - int maximumInsertSize, - double votePercentageThreshold, - double lowMedianDepthThreshold, - double badMateStatusThreshold, - 
double coverageStatusThreshold, - double excessiveCoverageThreshold, - double qualityStatusThreshold) { + public ThresHolder(final int minimumBaseQuality, + final int minimumMappingQuality, + final int minimumCoverage, + final int maximumCoverage, + final int maximumInsertSize, + final double votePercentageThreshold, + final double badMateStatusThreshold, + final double coverageStatusThreshold, + final double excessiveCoverageThreshold, + final double qualityStatusThreshold) { this.minimumBaseQuality = minimumBaseQuality; this.minimumMappingQuality = minimumMappingQuality; this.minimumCoverage = minimumCoverage; this.maximumCoverage = maximumCoverage; - this.minimumMedianDepth = minimumMedianDepth; this.maximumInsertSize = maximumInsertSize; this.votePercentageThreshold = votePercentageThreshold; - this.lowMedianDepthThreshold = lowMedianDepthThreshold; this.badMateStatusThreshold = badMateStatusThreshold; this.coverageStatusThreshold = coverageStatusThreshold; this.excessiveCoverageThreshold = excessiveCoverageThreshold; @@ -106,10 +100,6 @@ class ThresHolder { return maximumCoverage; } - public int getMinimumMedianDepth() { - return minimumMedianDepth; - } - public int getMaximumInsertSize() { return maximumInsertSize; } @@ -118,10 +108,6 @@ class ThresHolder { return votePercentageThreshold; } - public double getLowMedianDepthThreshold() { - return lowMedianDepthThreshold; - } - public double getBadMateStatusThreshold() { return badMateStatusThreshold; } @@ -156,7 +142,6 @@ class ThresHolder { headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // FORMAT fields for each genotype - // todo -- find the appropriate VCF constants headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. 
Sum of the depth in a loci divided by interval size.")); headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution.")); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index a435a33ad..ef14f8386 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "3d558ec8828c269774ee45e5df086a5f"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "742c13fc092b42f9ff71fc3fff4a95cc"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "d40cf1f1daf68f2740cd411e2cf361fc"); + DTTest("testMultiSample ", "-I " + multiSample, "7083cc720a2caa02fb0fa8f49f94a826"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java index 9ab4621b9..c86acebb9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java @@ -57,7 +57,7 @@ public class LocusStatisticsUnitTest /*extends BaseTest*/ { @Test(dataProvider = "StatusTestValues") public void testCallableStatuses(int coverage, int rawCoverage, CallableStatus status) { // The min Coverage threshold is 
10, the max is 100 - ThresHolder thresholds = new ThresHolder(20, 20, 10, 100, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5); + ThresHolder thresholds = new ThresHolder(20, 20, 10, 100, 50, 0.5, 0.5, 0.2, 0.2, 0.5); Set statuses = new LocusStatistics(coverage, rawCoverage).callableStatuses(thresholds); // Check to make sure the status provides matches the actual Assert.assertTrue((status == null) ? statuses.isEmpty() : (statuses.contains(status) && statuses.size() == 1)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java index 18e4bbfc2..dd9e1d86e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java @@ -95,7 +95,7 @@ public class SampleStatisticsUnitTest/* extends BaseTest */ { GATKSAMRecord noPair = ArtificialSAMUtils.createArtificialRead(header, "test", 0, 100, 50); GATKSAMRecord good = ArtificialSAMUtils.createPair(header, "test", 30, 100, 150, true, false).get(0); GATKSAMRecord bigInsertSize = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, false).get(0); - GATKSAMRecord inverted = ArtificialSAMUtils.createPair(header, "test", 30, 151, 150, true, false).get(0); +// GATKSAMRecord inverted = ArtificialSAMUtils.createPair(header, "test", 30, 151, 150, true, false).get(0); GATKSAMRecord sameOrientation = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, true).get(0); GATKSAMRecord pairNotMapped = ArtificialSAMUtils.createPair(header, "test", 30, 100, 140, true, false).get(1); @@ -106,7 +106,7 @@ public class SampleStatisticsUnitTest/* extends BaseTest */ { new Object[]{noPair, false}, new Object[]{good, true}, new Object[]{bigInsertSize, false}, - new Object[]{inverted, false}, +// 
new Object[]{inverted, false}, new Object[]{sameOrientation, false}, new Object[]{pairNotMapped, false} }; From fdd16dc6f99eee6a866780540702b371d3abaf1d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 20 Apr 2013 21:49:50 -0400 Subject: [PATCH 198/226] DiagnoseTargets refactor A plugin enabled implementation of DiagnoseTargets Summarized Changes: ------------------- * move argument collection into Thresholder object * make thresholder object private member of all statistics classes * rework the logic of the mate pairing thresholds * update unit and integration tests to reflect the new behavior * Implements Locus Statistic plugins * Extend Locus Statistic plugins to determine sample status * Export all common plugin functionality into utility class * Update tests accordingly [fixes #48465557] --- .../diagnostics/targets/DiagnoseTargets.java | 183 ++++++------- .../targets/FindCoveredIntervals.java | 2 +- .../targets/IntervalStatistics.java | 71 +++-- .../diagnostics/targets/LocusStatistics.java | 48 ++-- .../diagnostics/targets/SampleStatistics.java | 259 ++++++------------ .../diagnostics/targets/ThresHolder.java | 152 +++++----- .../targets/statistics/Interval.java | 63 +++++ .../diagnostics/targets/statistics/Locus.java | 65 +++++ .../targets/statistics/LocusCoverageGap.java} | 88 ++---- .../statistics/LocusExcessiveCoverage.java | 79 ++++++ .../targets/statistics/LocusLowCoverage.java | 80 ++++++ .../targets/statistics/LocusPoorQuality.java | 79 ++++++ .../targets/statistics/PluginUtils.java | 66 +++++ .../targets/statistics/Sample.java | 63 +++++ .../targets/statistics/SampleBadMates.java | 76 +++++ .../targets/statistics/SampleNoReads.java | 73 +++++ .../DiagnoseTargetsIntegrationTest.java | 4 +- .../targets/LocusStatisticsUnitTest.java | 38 +-- 18 files changed, 983 insertions(+), 506 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Interval.java create mode 100644 
protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Locus.java rename protected/java/{test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java => src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusCoverageGap.java} (73%) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusExcessiveCoverage.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusLowCoverage.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusPoorQuality.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/PluginUtils.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Sample.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleBadMates.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleNoReads.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index 5bdb81906..d90f2d1fa 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -47,23 +47,26 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.ArgumentCollection; import 
org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Interval; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Locus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.variant.vcf.VCFConstants; -import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; import java.util.*; @@ -111,80 +114,19 @@ import java.util.*; @PartitionBy(PartitionType.INTERVAL) public class DiagnoseTargets extends LocusWalker { + private static final String AVG_INTERVAL_DP_KEY = "IDP"; + @Output(doc = "File to which interval statistics should be written") private VariantContextWriter vcfWriter = null; - /** - * Only bases with quality greater than this will be considered in the coverage metrics. 
- */ - @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false) - private int minimumBaseQuality = 20; - - /** - * Only reads with mapping quality greater than this will be considered in the coverage metrics. - */ - @Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false) - private int minimumMappingQuality = 20; - - /** - * If at any locus, a sample has less coverage than this, it will be reported as LOW_COVERAGE - */ - @Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false) - private int minimumCoverage = 5; - - /** - * If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE - */ - @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false) - private int maximumCoverage = 700; - - /** - * If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be reported as BAD_MATE - */ - @Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false) - private int maxInsertSize = 500; - - /** - * The proportion of samples that must have a status for it to filter the entire interval. Example: 8 out of 10 samples have low coverage status on the interval, - * with a threshold higher than 0.2, this interval will be filtered as LOW_COVERAGE. 
- */ - @Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed proportion of samples containing a call for the interval to adopt the call ", required = false) - private double votePercentage = 0.50; - - /** - * The proportion of reads in the loci that must have bad mates for the sample to be reported as BAD_MATE - */ - @Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The proportion of the loci needed for calling BAD_MATE", required = false) - private double badMateStatusThreshold = 0.50; - - /** - * The proportion of loci in a sample that must fall under the LOW_COVERAGE or COVERAGE_GAPS category for the sample to be reported as either (or both) - */ - @Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The proportion of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false) - private double coverageStatusThreshold = 0.20; - - /** - * The proportion of loci in a sample that must fall under the EXCESSIVE_COVERAGE category for the sample to be reported as EXCESSIVE_COVERAGE - */ - @Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The proportion of the loci needed for calling EXCESSIVE_COVERAGE", required = false) - private double excessiveCoverageThreshold = 0.20; - - /** - * The proportion of loci in a sample that must fall under the LOW_QUALITY category for the sample to be reported as LOW_QUALITY - */ - @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false) - private double qualityStatusThreshold = 0.50; - - @Hidden - @Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. 
Prints extra info to screen", required = false) - private boolean debug = false; + @ArgumentCollection + private ThresHolder thresholds = new ThresHolder(); private Map intervalMap = null; // maps each interval => statistics private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome private Set samples = null; // all the samples being processed private static final Allele SYMBOLIC_ALLELE = Allele.create("
        ", false); // avoid creating the symbolic allele multiple times private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times - private ThresHolder thresholds = null; // object that holds all the thresholds for Diagnose Targets (todo -- should become a plugin based system) private static final int INITIAL_HASH_SIZE = 500000; @@ -192,18 +134,18 @@ public class DiagnoseTargets extends LocusWalker { public void initialize() { super.initialize(); - if (getToolkit().getIntervals() == null) - throw new UserException("This tool only works if you provide one or more intervals. ( Use the -L argument )"); - - thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, maxInsertSize, votePercentage, - badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold); + if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty()) + throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead."); intervalMap = new HashMap(INITIAL_HASH_SIZE); intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); // get all of the unique sample names for the VCF Header samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); + vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); + + // pre load all the statistics classes because it is costly to operate on the JVM and we only want to do it once. 
+ loadAllPlugins(thresholds); } @Override @@ -217,8 +159,7 @@ public class DiagnoseTargets extends LocusWalker { // at this point, all intervals in intervalMap overlap with this locus, so update all of them for (IntervalStatistics intervalStatistics : intervalMap.values()) - intervalStatistics.addLocus(context, ref, thresholds); - + intervalStatistics.addLocus(context); return 1L; } @@ -317,37 +258,26 @@ public class DiagnoseTargets extends LocusWalker { alleles.add(SYMBOLIC_ALLELE); VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); - vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF - vcb.filters(new HashSet(statusesToStrings(stats.callableStatuses(thresholds), true))); + vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); + vcb.filters(new HashSet(statusesToStrings(stats.callableStatuses(), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); - attributes.put(ThresHolder.AVG_INTERVAL_DP_KEY, stats.averageCoverage()); + attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage()); vcb = vcb.attributes(attributes); - if (debug) { - System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage()); - } for (String sample : samples) { final GenotypeBuilder gb = new GenotypeBuilder(sample); - SampleStatistics sampleStat = stats.getSample(sample); - gb.attribute(ThresHolder.AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage()); - gb.attribute("Q1", sampleStat.getQuantileDepth(0.25)); - gb.attribute("MED", sampleStat.getQuantileDepth(0.50)); - gb.attribute("Q3", sampleStat.getQuantileDepth(0.75)); + SampleStatistics sampleStat = stats.getSampleStatics(sample); + gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage()); - if (debug) { - System.out.printf("Found %d bad mates out of %d reads %n", sampleStat.getnBadMates(), sampleStat.getnReads()); - } - 
gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds), false)); + gb.filters(statusesToStrings(stats.getSampleStatics(sample).getCallableStatuses(), false)); genotypes.add(gb.make()); } vcb = vcb.genotypes(genotypes); - vcfWriter.add(vcb.make()); - } /** @@ -356,17 +286,74 @@ public class DiagnoseTargets extends LocusWalker { * @param statuses the set of statuses to be converted * @return a matching set of strings */ - private List statusesToStrings(Set statuses, final boolean includePASS) { + private List statusesToStrings(Set statuses, final boolean isInfoField) { List output = new ArrayList(statuses.size()); for (CallableStatus status : statuses) - if ( includePASS || status != CallableStatus.PASS ) // adding pass => results in a filter for genotypes + if ( isInfoField || status != CallableStatus.PASS ) output.add(status.name()); return output; } private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { - return new IntervalStatistics(samples, interval); + return new IntervalStatistics(samples, interval, thresholds); } + + protected static void loadAllPlugins(final ThresHolder thresholds) { + for (Class stat : new PluginManager(Locus.class).getPlugins()) { + try { + final Locus stats = (Locus) stat.newInstance(); + stats.initialize(thresholds); + thresholds.locusStatisticList.add(stats); + } catch (Exception e) { + throw new DynamicClassResolutionException(stat, e); + } + } + + for (Class stat : new PluginManager(Sample.class).getPlugins()) { + try { + final Sample stats = (Sample) stat.newInstance(); + stats.initialize(thresholds); + thresholds.sampleStatisticList.add(stats); + } catch (Exception e) { + throw new DynamicClassResolutionException(stat, e); + } + } + + for (Class stat : new PluginManager(Interval.class).getPlugins()) { + try { + final Interval stats = (Interval) stat.newInstance(); + stats.initialize(thresholds); + thresholds.intervalStatisticList.add(stats); + } catch (Exception e) { + throw new 
DynamicClassResolutionException(stat, e); + } + } + } + + /** + * Gets the header lines for the VCF writer + * + * @return A set of VCF header lines + */ + private static Set getHeaderInfo() { + Set headerLines = new HashSet(); + + // INFO fields for overall data + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); + headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); + + // FORMAT fields for each genotype + headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); + headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size.")); + + // FILTER fields + for (CallableStatus stat : CallableStatus.values()) + headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description)); + + return headerLines; + } + } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index eef581160..1c9751c5b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -105,7 +105,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker { // Look to see if the region has sufficient coverage public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { - int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup()); + int depth = 
context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage(); // note the linear probability scale return new ActivityProfileState(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 0f4b33747..2e7333cc6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -47,7 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Interval; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -57,27 +57,23 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; -class IntervalStatistics { +public class IntervalStatistics { private final Map samples; private final GenomeLoc interval; - private boolean hasNref = false; + private final ThresHolder thresholds; - private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) + private int preComputedTotalCoverage = -1; - /* - private double minMedianDepth = 20.0; - private double badMedianDepthPercentage = 0.20; - private double votePercentage = 0.50; - */ - public IntervalStatistics(Set samples, GenomeLoc interval/*, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality*/) { + public IntervalStatistics(Set samples, GenomeLoc interval, 
ThresHolder thresholds) { this.interval = interval; + this.thresholds = thresholds; this.samples = new HashMap(samples.size()); for (String sample : samples) - this.samples.put(sample, new SampleStatistics(interval /*, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality*/)); + this.samples.put(sample, new SampleStatistics(interval, thresholds)); } - public SampleStatistics getSample(String sample) { + public SampleStatistics getSampleStatics(String sample) { return samples.get(sample); } @@ -85,19 +81,19 @@ class IntervalStatistics { return interval; } + public int getNSamples() { + return samples.size(); + } + /** * The function to populate data into the Statistics from the walker. * This takes the input and manages passing the data to the SampleStatistics and Locus Statistics * * @param context The alignment context given from the walker - * @param ref the reference context given from the walker - * @param thresholds the class contains the statistical threshold for making calls */ - public void addLocus(AlignmentContext context, ReferenceContext ref, ThresHolder thresholds) { + public void addLocus(AlignmentContext context) { ReadBackedPileup pileup = context.getBasePileup(); - //System.out.println(ref.getLocus().toString()); - Map samplePileups = pileup.getPileupsForSamples(samples.keySet()); for (Map.Entry entry : samplePileups.entrySet()) { @@ -108,11 +104,9 @@ class IntervalStatistics { if (sampleStatistics == null) throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); - sampleStatistics.addLocus(context.getLocation(), samplePileup, thresholds); + sampleStatistics.addLocus(context.getLocation(), samplePileup); } - if (!hasNref && ref.getBase() == 'N') - hasNref = true; } public double averageCoverage() { @@ -129,29 +123,34 @@ class IntervalStatistics { /** * Return the Callable statuses for the interval as a whole - * todo -- add 
missingness filter * - * @param thresholds the class contains the statistical threshold for making calls * @return the callable status(es) for the whole interval */ - public Set callableStatuses(ThresHolder thresholds) { - Set output = new HashSet(); + public Set callableStatuses() { + final Set output = new HashSet(); - // Initialize the Map - Map votes = new HashMap(); - for (CallableStatus status : CallableStatus.values()) - votes.put(status, 0); + // sum up all the callable status for each sample + final Map sampleStatusTally = new HashMap(CallableStatus.values().length); + for (SampleStatistics sampleStatistics : samples.values()) { + for (CallableStatus status : sampleStatistics.getCallableStatuses()) { + sampleStatusTally.put(status, !sampleStatusTally.containsKey(status) ? 1 : sampleStatusTally.get(status) + 1); + } + } - // tally up the votes - for (SampleStatistics sample : samples.values()) - for (CallableStatus status : sample.getCallableStatuses(thresholds)) - votes.put(status, votes.get(status) + 1); + // check if any of the votes pass the threshold + final int nSamples = getNSamples(); + for (Map.Entry entry : sampleStatusTally.entrySet()) { + if ((double) entry.getValue() / nSamples > thresholds.votePercentageThreshold) { + output.add(entry.getKey()); + } + } - // output tall values above the threshold - final double minVotesNeeded = thresholds.getVotePercentageThreshold() * samples.size(); - for (CallableStatus status : votes.keySet()) { - if (!status.equals((CallableStatus.PASS)) && votes.get(status) > minVotesNeeded) + // add the interval specific statitics statuses + for (Interval intervalStat : thresholds.intervalStatisticList) { + final CallableStatus status = intervalStat.status(this); + if (status != null) { output.add(status); + } } return output; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java index 5ec1a1608..e85f3d9c1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java @@ -46,21 +46,25 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Locus; + import java.util.HashSet; +import java.util.List; import java.util.Set; -class LocusStatistics { - private final int coverage; - private final int rawCoverage; +public class LocusStatistics { + private int coverage; + private int rawCoverage; + private final List locusStatisticsList; - public LocusStatistics() { - this.coverage = 0; - this.rawCoverage = 0; + public LocusStatistics(ThresHolder thresholds) { + this(0,0,thresholds); } - public LocusStatistics(int coverage, int rawCoverage) { + protected LocusStatistics(int coverage, int rawCoverage, ThresHolder thresholds) { this.coverage = coverage; this.rawCoverage = rawCoverage; + this.locusStatisticsList = thresholds.locusStatisticList; } public int getCoverage() { @@ -74,31 +78,21 @@ class LocusStatistics { /** * Generates all applicable statuses from the coverages in this locus * - * @param thresholds the class contains the statistical threshold for making calls * @return a set of all statuses that apply */ - public Set callableStatuses(ThresHolder thresholds) { + public Set callableStatuses() { Set output = new HashSet(); - - // if too much coverage - if (getCoverage() > thresholds.getMaximumCoverage()) - output.add(CallableStatus.EXCESSIVE_COVERAGE); - - // if not enough coverage - if (getCoverage() < thresholds.getMinimumCoverage()) { - // was there a lot of low Qual coverage? - if (getRawCoverage() >= thresholds.getMinimumCoverage()) - output.add(CallableStatus.POOR_QUALITY); - // no? 
- else { - // is there any coverage? - if (getRawCoverage() > 0) - output.add(CallableStatus.LOW_COVERAGE); - else - output.add(CallableStatus.COVERAGE_GAPS); + for (Locus stats : locusStatisticsList) { + CallableStatus status = stats.status(this); + if (status != null) { + output.add(status); } } - return output; } + + public void set(final int coverage, final int rawCoverage) { + this.coverage = coverage; + this.rawCoverage = rawCoverage; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java index afde93ea3..c05feebbd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Locus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -56,93 +58,100 @@ import java.util.*; /** * The statistics calculator for a specific sample given the interval */ -class SampleStatistics { +public class SampleStatistics { private final GenomeLoc interval; private final ArrayList loci; + private final ThresHolder thresholds; - private int[] preSortedDepths = null; - private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - + // avoids re-calculating these sums over loci + private int preComputedTotalCoverage = -1; + private Map locusStatusTally = null; private int nReads = -1; private int nBadMates = -1; - private 
SampleStatistics(GenomeLoc interval, ArrayList loci) { + public SampleStatistics(final GenomeLoc interval, final ThresHolder thresholds) { this.interval = interval; - this.loci = loci; + this.loci = new ArrayList(interval.size()); + this.thresholds = thresholds; nReads = 0; nBadMates = 0; - } - - public SampleStatistics(GenomeLoc interval) { - this(interval, new ArrayList(interval.size())); // Initialize every loci (this way we don't have to worry about non-existent loci in the object for (int i = 0; i < interval.size(); i++) - this.loci.add(new LocusStatistics()); + this.loci.add(new LocusStatistics(thresholds)); } + /** + * Calculates the total "good" coverage of this sample. Good means "passes the base and + * mapping quality requirements. + * + * @return the total "good" coverage across the interval for this sample + */ public long totalCoverage() { if (preComputedTotalCoverage < 0) calculateTotalCoverage(); return preComputedTotalCoverage; } + /** + * Calculates the average "good" coverage of this sample. Good means "passes the base and + * mapping quality requirements. + * + * @return the average "good" coverage + */ public double averageCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return (double) preComputedTotalCoverage / loci.size(); + return (double) totalCoverage() / loci.size(); + } + + /** + * Tally up all the callable status of all the loci in this sample. + * + * @return a map of callable status and counts + */ + public Map getLocusStatusTally() { + if (locusStatusTally == null) { + locusStatusTally = new HashMap(CallableStatus.values().length); + + // sum up all the callable statuses for each locus + for (int i = 0; i < interval.size(); i++) { + LocusStatistics locus = loci.get(i); + for (CallableStatus status : locus.callableStatuses()) { + locusStatusTally.put(status, !locusStatusTally.containsKey(status) ? 
1 : locusStatusTally.get(status) + 1); + } + } + } + return locusStatusTally; } /** * Calculates the callable statuses of the entire sample * - * @param thresholds the class contains the statistical threshold for making calls * @return the callable statuses of the entire sample */ - public Set getCallableStatuses(ThresHolder thresholds) { - // We check if reads are present to prevent div / 0 exceptions - if (nReads == 0) { - return Collections.singleton(CallableStatus.NO_READS); - } + public Set getCallableStatuses() { + final Set output = new HashSet(); - Set output = new HashSet(); - Map totals = new HashMap(CallableStatus.values().length); - - // initialize map - for (CallableStatus status : CallableStatus.values()) - totals.put(status, 0.0); - - // sum up all the callable statuses for each locus - for (int i = 0; i < interval.size(); i++) { - for (CallableStatus status : callableStatus(i, thresholds)) { - double count = totals.get(status); - - totals.put(status, count + 1); + // get the tally of all the locus callable statuses + for (Locus locusStat : thresholds.locusStatisticList) { + final CallableStatus status = locusStat.sampleStatus(this); + if (status != null) { + output.add(status); } } - double intervalSize = interval.size(); - - if (((double) nBadMates / nReads) >= thresholds.getBadMateStatusThreshold()) - output.add(CallableStatus.BAD_MATE); - - if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) >= thresholds.getCoverageStatusThreshold()) - output.add(CallableStatus.COVERAGE_GAPS); - - if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) >= thresholds.getCoverageStatusThreshold()) - output.add(CallableStatus.LOW_COVERAGE); - - if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) >= thresholds.getExcessiveCoverageThreshold()) - output.add(CallableStatus.EXCESSIVE_COVERAGE); - - if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) >= thresholds.getQualityStatusThreshold()) - 
output.add(CallableStatus.POOR_QUALITY); - - if (output.isEmpty()) { - output.add(CallableStatus.PASS); + // get the sample specific statitics statuses + for (Sample sampleStat : thresholds.sampleStatisticList) { + final CallableStatus status = sampleStat.status(this); + if (status != null) { + output.add(status); + } } + // special case, if there are no reads, then there is no sense reporting coverage gaps. + if (output.contains(CallableStatus.NO_READS) && output.contains(CallableStatus.COVERAGE_GAPS)) + output.remove(CallableStatus.COVERAGE_GAPS); + return output; } @@ -151,50 +160,37 @@ class SampleStatistics { * * @param locus The locus given as a GenomeLoc * @param pileup The pileup of that locus, this exclusively contains the sample - * @param thresholds the class contains the statistical threshold for making calls */ - public void addLocus(GenomeLoc locus, ReadBackedPileup pileup, ThresHolder thresholds) { + public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) { if (!interval.containsP(locus)) throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval)); // a null pileup means there nothing ot add if (pileup != null) { - - int locusIndex = locus.getStart() - interval.getStart(); - - int rawCoverage = pileup.depthOfCoverage(); - int coverage = thresholds.getFilteredCoverage(pileup); - - LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage); - - loci.set(locusIndex, locusData); + final int locusIndex = locus.getStart() - interval.getStart(); + final int rawCoverage = pileup.depthOfCoverage(); + final int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.minimumBaseQuality, thresholds.minimumMappingQuality).depthOfCoverage(); + final LocusStatistics locusData = loci.get(locusIndex); + locusData.set(coverage, rawCoverage); for (GATKSAMRecord read : pileup.getReads()) - processRead(read, thresholds); - } - } - - private void processRead(GATKSAMRecord read, ThresHolder thresholds) 
{ - // Was this read already processed? - if (read.getTemporaryAttribute("checkedBadMate") == null) { - nReads++; - if (!hasValidMate(read, thresholds)) - nBadMates++; - read.setTemporaryAttribute("checkedBadMate", true); + processRead(read); } } /** - * returns the callable status of a given locus without taking the reference base into account. - * - * @param locusIndex location in the genome to inquire (only one locus) - * @param thresholds the class contains the statistical threshold for making calls - * @return the callable status of a locus + * Account for the read and check it for any statistics necessary. Reads are marked in the temporary + * attribute "seen" to make sure they're not counted twice. + * + * @param read the read */ - private Set callableStatus(int locusIndex, ThresHolder thresholds) { - LocusStatistics locus = loci.get(locusIndex); - - return locus.callableStatuses(thresholds); + private void processRead(GATKSAMRecord read) { + if (read.getTemporaryAttribute("seen") == null) { + nReads++; + if (read.getReadPairedFlag() && !read.getProperPairFlag()) + nBadMates++; + read.setTemporaryAttribute("seen", true); + } } private void calculateTotalCoverage() { @@ -203,101 +199,8 @@ class SampleStatistics { preComputedTotalCoverage += locus.getCoverage(); } - public double getQuantileDepth(double percentage) { - if (preSortedDepths == null) - getDepthsAsSortedArray(); - - return getQuartile(preSortedDepths, percentage); - } - - static double getQuartile(int[] data, double percentage) { - int size = data.length; - if (size == 1) - return (double) data[0]; - - if (percentage == 0.5) { - return getMedian(data); - } - - double position = (size - 1.0) / 2; - if (percentage == 0.25) { - // if the position is a whole number - return getMedian(Arrays.copyOfRange(data, 0, (int) position + 1)); - - } - if (percentage == 0.75) { - if (position % 1 == 0) { - return getMedian(Arrays.copyOfRange(data, (int) position, size)); - } else { - return 
getMedian(Arrays.copyOfRange(data, (int) position + 1, size)); - } - } - return -1; - } - - // Assumes data is sorted - private static double getMedian(int[] data) { - double size = (double) data.length; - if (size == 1) - return (double) data[0]; - - double position = (size - 1.0) / 2; - - if (position % 1 == 0) - return (double) data[(int) position]; - - else { - double high = (double) data[(int) Math.ceil(position)]; - double low = (double) data[(int) Math.floor(position)]; - - return (high + low) / 2; - - } - - } - - private void getDepthsAsSortedArray() { - preSortedDepths = new int[loci.size()]; - - for (int i = 0; i < loci.size(); i++) - preSortedDepths[i] = loci.get(i).getCoverage(); - - Arrays.sort(preSortedDepths); - } - - boolean hasValidMate(GATKSAMRecord read, ThresHolder thresholds) { - /** Check the following - * Does it have a pair? - * reasonable insert size? - * inverted? - * same orientation? - * same contig? - * is pair mapped? - * todo - is forced mate? - * - */ - - // has NO pair - if (!read.getReadPairedFlag()) - return false; - - // different contigs - if (!read.getMateReferenceIndex().equals(read.getReferenceIndex())) - return false; - - // unmapped - if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag()) - return false; - - // same orientation - if (read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag()) - return false; - - // todo -- inverted ? 
- - // mates are too far apart - return Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) <= thresholds.getMaximumInsertSize(); - + public int getIntervalSize() { + return interval.size(); } public int getnReads() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java index c2dd2f4ff..3b7626708 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java @@ -46,29 +46,82 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; -import org.broadinstitute.variant.vcf.*; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Interval; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Locus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Sample; -import java.util.HashSet; -import java.util.Set; +import java.util.LinkedList; +import java.util.List; -class ThresHolder { - public static final String AVG_INTERVAL_DP_KEY = "AVG_INTERVAL_DP"; - public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 50, 0.5, 0.5, 0.2, 0.2, 0.5); +public class ThresHolder { - private final int minimumBaseQuality; - private final int minimumMappingQuality; + /** + * Only bases with quality greater than this will be considered in the coverage metrics. 
+ */ + @Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false) + public int minimumBaseQuality = 20; - private final int minimumCoverage; - private final int maximumCoverage; + /** + * Only reads with mapping quality greater than this will be considered in the coverage metrics. + */ + @Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false) + public int minimumMappingQuality = 20; - private final int maximumInsertSize; + /** + * If at any locus, a sample has less coverage than this, it will be reported as LOW_COVERAGE + */ + @Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false) + public int minimumCoverage = 5; - private final double votePercentageThreshold; - private final double badMateStatusThreshold; - private final double coverageStatusThreshold; - private final double excessiveCoverageThreshold; - private final double qualityStatusThreshold; + /** + * If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE + */ + @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false) + public int maximumCoverage = 700; + + /** + * If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be reported as BAD_MATE + */ + @Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false) + public int maximumInsertSize = 500; + + /** + * The proportion of samples that must have a status for it to filter the entire interval. 
Example: 8 out of 10 samples have low coverage status on the interval, + * with a threshold higher than 0.2, this interval will be filtered as LOW_COVERAGE. + */ + @Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed proportion of samples containing a call for the interval to adopt the call ", required = false) + public double votePercentageThreshold = 0.50; + + /** + * The proportion of reads in the loci that must have bad mates for the sample to be reported as BAD_MATE + */ + @Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The proportion of the loci needed for calling BAD_MATE", required = false) + public double badMateStatusThreshold = 0.50; + + /** + * The proportion of loci in a sample that must fall under the LOW_COVERAGE or COVERAGE_GAPS category for the sample to be reported as either (or both) + */ + @Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The proportion of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false) + public double coverageStatusThreshold = 0.20; + + /** + * The proportion of loci in a sample that must fall under the EXCESSIVE_COVERAGE category for the sample to be reported as EXCESSIVE_COVERAGE + */ + @Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The proportion of the loci needed for calling EXCESSIVE_COVERAGE", required = false) + public double excessiveCoverageThreshold = 0.20; + + /** + * The proportion of loci in a sample that must fall under the LOW_QUALITY category for the sample to be reported as LOW_QUALITY + */ + @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false) + public double qualityStatusThreshold = 0.50; + + public final List locusStatisticList = new LinkedList(); + public final List sampleStatisticList = new LinkedList(); + public final List intervalStatisticList = new 
LinkedList(); + + public ThresHolder() {} public ThresHolder(final int minimumBaseQuality, final int minimumMappingQuality, @@ -91,69 +144,4 @@ class ThresHolder { this.excessiveCoverageThreshold = excessiveCoverageThreshold; this.qualityStatusThreshold = qualityStatusThreshold; } - - public int getMinimumCoverage() { - return minimumCoverage; - } - - public int getMaximumCoverage() { - return maximumCoverage; - } - - public int getMaximumInsertSize() { - return maximumInsertSize; - } - - public double getVotePercentageThreshold() { - return votePercentageThreshold; - } - - public double getBadMateStatusThreshold() { - return badMateStatusThreshold; - } - - public double getCoverageStatusThreshold() { - return coverageStatusThreshold; - } - - public double getExcessiveCoverageThreshold() { - return excessiveCoverageThreshold; - } - - public double getQualityStatusThreshold() { - return qualityStatusThreshold; - } - - public int getFilteredCoverage(ReadBackedPileup pileup) { - return pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage(); - } - - /** - * Gets the header lines for the VCF writer - * - * @return A set of VCF header lines - */ - public static Set getHeaderInfo() { - Set headerLines = new HashSet(); - - // INFO fields for overall data - headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); - headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); - - // FORMAT fields for each genotype - headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); - headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. 
Sum of the depth in a loci divided by interval size.")); - headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution.")); - headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution.")); - headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution.")); - - - // FILTER fields - for (CallableStatus stat : CallableStatus.values()) - headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description)); - - return headerLines; - } - } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Interval.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Interval.java new file mode 100644 index 000000000..3e8adc978 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Interval.java @@ -0,0 +1,63 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; + +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.IntervalStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; + +/** + * Created with IntelliJ IDEA. + * User: carneiro + * Date: 4/20/13 + * Time: 11:30 PM + * To change this template use File | Settings | File Templates. + */ +public interface Interval { + public void initialize(ThresHolder thresholds); + public CallableStatus status (IntervalStatistics intervalStatistics); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Locus.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Locus.java new file mode 100644 index 000000000..aee41846b --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Locus.java @@ -0,0 +1,65 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; + +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; + +/** + * Created with IntelliJ IDEA. + * User: carneiro + * Date: 4/20/13 + * Time: 11:29 PM + * To change this template use File | Settings | File Templates. + */ +public interface Locus { + public void initialize(ThresHolder thresholds); + public CallableStatus status (LocusStatistics locusStatistics); + public CallableStatus sampleStatus (SampleStatistics sampleStatistics); +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusCoverageGap.java similarity index 73% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusCoverageGap.java index dd9e1d86e..effdd33b0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatisticsUnitTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusCoverageGap.java @@ -44,78 +44,34 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; -public class SampleStatisticsUnitTest/* extends BaseTest */ { +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +public class LocusCoverageGap implements Locus { + private double threshold; + private static final CallableStatus CALL = CallableStatus.COVERAGE_GAPS; - @DataProvider(name = "QuartileValues") - public Object[][] getQuantileValues() { - - int[] a1 = {5}; - int[] a2 = {1, 2}; - int[] a5 = {10, 20, 30, 40, 50}; - int[] a10 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - - - return new Object[][]{ - new Object[]{a1, 0.5, 5}, - new Object[]{a1, 0, 5}, - new Object[]{a1, 1, 5}, - new Object[]{a2, 0.5, 1.5}, - new Object[]{a2, 0.25, 1}, - new Object[]{a2, 0.75, 2}, - new Object[]{a5, 0.5, 30}, - new Object[]{a5, 0.25, 20}, - new Object[]{a5, 0.75, 40}, - new Object[]{a5, 0, -1}, - new Object[]{a10, 0.5, 5.5}, - new Object[]{a10, 0.25, 3}, - new Object[]{a10, 0.75, 8} - }; + @Override + public void initialize(ThresHolder thresholds) { + threshold = thresholds.coverageStatusThreshold; } - @Test(dataProvider = "QuartileValues") - public void testGetQuartile(int[] dataList, double percentage, double expected) { - Assert.assertEquals(SampleStatistics.getQuartile(dataList, 
percentage), expected); - + @Override + public CallableStatus status(LocusStatistics locusStatistics) { + return locusStatistics.getRawCoverage() == 0 ? CALL : null; } - @DataProvider(name = "ReadsAndMates") - public Object[][] getReadAndMates() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - - GATKSAMRecord noPair = ArtificialSAMUtils.createArtificialRead(header, "test", 0, 100, 50); - GATKSAMRecord good = ArtificialSAMUtils.createPair(header, "test", 30, 100, 150, true, false).get(0); - GATKSAMRecord bigInsertSize = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, false).get(0); -// GATKSAMRecord inverted = ArtificialSAMUtils.createPair(header, "test", 30, 151, 150, true, false).get(0); - GATKSAMRecord sameOrientation = ArtificialSAMUtils.createPair(header, "test", 30, 100, 151, true, true).get(0); - - GATKSAMRecord pairNotMapped = ArtificialSAMUtils.createPair(header, "test", 30, 100, 140, true, false).get(1); - pairNotMapped.setMateUnmappedFlag(true); - - // finish test - return new Object[][]{ - new Object[]{noPair, false}, - new Object[]{good, true}, - new Object[]{bigInsertSize, false}, -// new Object[]{inverted, false}, - new Object[]{sameOrientation, false}, - new Object[]{pairNotMapped, false} - }; + @Override + public CallableStatus sampleStatus(SampleStatistics sampleStatistics) { + return PluginUtils.genericSampleStatus(sampleStatistics, CALL, threshold); } - - @Test(dataProvider = "ReadsAndMates") - public void testHasValidMate(GATKSAMRecord read, boolean expected) { - //50 is out maximum insert size - Assert.assertEquals(new SampleStatistics(GenomeLoc.UNMAPPED).hasValidMate(read, ThresHolder.DEFAULTS), expected); - } - } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusExcessiveCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusExcessiveCoverage.java new file mode 100644 index 
000000000..72709a0f6 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusExcessiveCoverage.java @@ -0,0 +1,79 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; + +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +public class LocusExcessiveCoverage implements Locus { + private int excessiveCoverage; + private double threshold; + private static final CallableStatus CALL = CallableStatus.EXCESSIVE_COVERAGE ; + + @Override + public void initialize(ThresHolder thresholds) { + this.excessiveCoverage = thresholds.maximumCoverage; + this.threshold = thresholds.coverageStatusThreshold; + } + + @Override + public CallableStatus status(LocusStatistics locusStatistics) { + return locusStatistics.getCoverage() > excessiveCoverage ? 
CALL : null; + } + + @Override + public CallableStatus sampleStatus(SampleStatistics sampleStatistics) { + return PluginUtils.genericSampleStatus(sampleStatistics, CALL, threshold); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusLowCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusLowCoverage.java new file mode 100644 index 000000000..f3f181bd1 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusLowCoverage.java @@ -0,0 +1,80 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; + +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +public class LocusLowCoverage implements Locus { + private int minCoverage; + private double threshold; + private static final CallableStatus CALL = CallableStatus.LOW_COVERAGE ; + + @Override + public void initialize(ThresHolder thresholds) { + this.minCoverage = thresholds.minimumCoverage; + this.threshold = thresholds.coverageStatusThreshold; + } + + @Override + public CallableStatus status(LocusStatistics locusStatistics) { + final int raw = locusStatistics.getRawCoverage(); + return raw > 0 && raw < minCoverage ? CALL: null; + } + + @Override + public CallableStatus sampleStatus(SampleStatistics sampleStatistics) { + return PluginUtils.genericSampleStatus(sampleStatistics, CALL, threshold); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusPoorQuality.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusPoorQuality.java new file mode 100644 index 000000000..91d5f8c04 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusPoorQuality.java @@ -0,0 +1,79 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; + +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +public class LocusPoorQuality implements Locus { + private int minCoverage; + private double threshold; + private static final CallableStatus CALL = CallableStatus.POOR_QUALITY ; + + @Override + public void initialize(ThresHolder thresholds) { + this.minCoverage = thresholds.minimumCoverage; + this.threshold = thresholds.coverageStatusThreshold; + } + + @Override + public CallableStatus status(LocusStatistics locusStatistics) { + return locusStatistics.getCoverage() < minCoverage && locusStatistics.getRawCoverage() >= minCoverage ? 
CALL: null; + } + + @Override + public CallableStatus sampleStatus(SampleStatistics sampleStatistics) { + return PluginUtils.genericSampleStatus(sampleStatistics, CALL, threshold); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/PluginUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/PluginUtils.java new file mode 100644 index 000000000..0a29aea4d --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/PluginUtils.java @@ -0,0 +1,66 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; + +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; + +import java.util.Map; + +/** + * User: carneiro + * Date: 4/21/13 + * Time: 11:23 AM + */ +public class PluginUtils { + public static CallableStatus genericSampleStatus (final SampleStatistics sampleStatistics, final CallableStatus CALL, final double threshold) { + final Map totals = sampleStatistics.getLocusStatusTally(); + final int size = sampleStatistics.getIntervalSize(); + final int statusCount = totals.containsKey(CALL) ? totals.get(CALL) : 0; + return ( (double) statusCount / size) >= threshold ? CALL: null; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Sample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Sample.java new file mode 100644 index 000000000..a33c33877 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Sample.java @@ -0,0 +1,63 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; + +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; + +/** + * Created with IntelliJ IDEA. + * User: carneiro + * Date: 4/20/13 + * Time: 11:30 PM + * To change this template use File | Settings | File Templates. 
+ */ +public interface Sample { + public void initialize(ThresHolder thresholds); + public CallableStatus status (SampleStatistics sampleStatistics); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleBadMates.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleBadMates.java new file mode 100644 index 000000000..07e2cca5a --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleBadMates.java @@ -0,0 +1,76 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; + +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +public class SampleBadMates implements Sample { + private static final CallableStatus CALL = CallableStatus.NO_READS ; + + private double threshold; + private double votingThreshold; + + @Override + public void initialize(ThresHolder thresholds) { + threshold = thresholds.badMateStatusThreshold; + votingThreshold = thresholds.votePercentageThreshold; + } + + @Override + public CallableStatus status(SampleStatistics sampleStatistics) { + final int nReads = sampleStatistics.getnReads(); + return nReads > 0 && (double) sampleStatistics.getnBadMates() / nReads > threshold ? CALL : null; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleNoReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleNoReads.java new file mode 100644 index 000000000..30a6fbda3 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleNoReads.java @@ -0,0 +1,73 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; + +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; +import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; + +/** + * User: carneiro + * Date: 4/20/13 + * Time: 11:44 PM + */ +public class SampleNoReads implements Sample { + private static final CallableStatus CALL = CallableStatus.NO_READS; + + private double votingThreshold; + + @Override + public void initialize(ThresHolder thresholds) { + votingThreshold = thresholds.votePercentageThreshold; + } + + @Override + public CallableStatus status(SampleStatistics sampleStatistics) { + return sampleStatistics.getnReads() == 0 ? 
CALL : null; + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index ef14f8386..e5cea2c7b 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "742c13fc092b42f9ff71fc3fff4a95cc"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "850304909477afa8c2a8f128d6eedde9"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "7083cc720a2caa02fb0fa8f49f94a826"); + DTTest("testMultiSample ", "-I " + multiSample, "bedd19bcf21d1a779f6706c0351c9d26"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java index c86acebb9..96747619f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java @@ -47,37 +47,43 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; import org.testng.Assert; +import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.Set; -public class LocusStatisticsUnitTest /*extends BaseTest*/ { +public class LocusStatisticsUnitTest { + + ThresHolder thresholds = new 
ThresHolder(); + + @BeforeClass + public void init() { + DiagnoseTargets.loadAllPlugins(thresholds); + } @Test(dataProvider = "StatusTestValues") public void testCallableStatuses(int coverage, int rawCoverage, CallableStatus status) { - // The min Coverage threshold is 10, the max is 100 - ThresHolder thresholds = new ThresHolder(20, 20, 10, 100, 50, 0.5, 0.5, 0.2, 0.2, 0.5); - Set statuses = new LocusStatistics(coverage, rawCoverage).callableStatuses(thresholds); - // Check to make sure the status provides matches the actual + Set statuses = new LocusStatistics(coverage, rawCoverage, thresholds).callableStatuses(); Assert.assertTrue((status == null) ? statuses.isEmpty() : (statuses.contains(status) && statuses.size() == 1)); - } @DataProvider(name = "StatusTestValues") public Object[][] getStatusTestValues() { + final int max = thresholds.maximumCoverage; + final int min = thresholds.minimumCoverage; return new Object[][]{ - new Object[]{100, 100, null}, - new Object[]{100, 101, null}, - new Object[]{101, 101, CallableStatus.EXCESSIVE_COVERAGE}, - new Object[]{10, 101, null}, - new Object[]{9, 101, CallableStatus.POOR_QUALITY}, - new Object[]{9, 10, CallableStatus.POOR_QUALITY}, - new Object[]{9, 9, CallableStatus.LOW_COVERAGE}, + new Object[]{max, max, null}, + new Object[]{max, max+1, null}, + new Object[]{max+1, max+1, CallableStatus.EXCESSIVE_COVERAGE}, + new Object[]{min, max+1, null}, + new Object[]{min-1, max+1, CallableStatus.POOR_QUALITY}, + new Object[]{min-1, min, CallableStatus.POOR_QUALITY}, + new Object[]{min-1, min-1, CallableStatus.LOW_COVERAGE}, new Object[]{0, 0, CallableStatus.COVERAGE_GAPS}, - new Object[]{0, 9, CallableStatus.LOW_COVERAGE}, - new Object[]{0, 101, CallableStatus.POOR_QUALITY}, - new Object[]{10, Integer.MAX_VALUE, null}, + new Object[]{0, min-1, CallableStatus.LOW_COVERAGE}, + new Object[]{0, max+1, CallableStatus.POOR_QUALITY}, + new Object[]{min, Integer.MAX_VALUE, null}, new Object[]{Integer.MAX_VALUE, Integer.MAX_VALUE, 
CallableStatus.EXCESSIVE_COVERAGE}, }; } From cb4ec3437a50285e2bf22e293d89ed1c21c44fad Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 23 Apr 2013 13:32:06 -0400 Subject: [PATCH 199/226] After debate reverting SW parameter changes temporarily while we explore global SW plans. --- .../sting/gatk/walkers/haplotypecaller/graphs/Path.java | 2 +- .../haplotypecaller/HaplotypeCallerIntegrationTest.java | 2 +- .../walkers/haplotypecaller/graphs/KBestPathsUnitTest.java | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index f232a4ce0..47676a498 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -86,7 +86,7 @@ public class Path { // used in the bubble state machine to apply Smith-Waterman to the bubble sequence // these values were chosen via optimization against the NA12878 knowledge base - public static final Parameters NEW_SW_PARAMETERS = new Parameters(10, -20.0, -26.0, -0.1); + public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -15.0, -26.0, -1.1); private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 500db6ae9..2664f3ed0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends 
WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("cac0d88fa4471c7a0ac96533a9a6354b")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("eb5772b825120a0b8710e5add485d73a")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index 302866b55..d1bae74b2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -425,7 +425,7 @@ public class KBestPathsUnitTest extends BaseTest { logger.warn("AltPath : " + altPath + " cigar " + altPath.calculateCigar()); Assert.assertEquals(refPath.calculateCigar().toString(), "51M"); - Assert.assertEquals(altPath.calculateCigar().toString(), "3M14D2M20I32M"); + Assert.assertEquals(altPath.calculateCigar().toString(), "3M6I48M"); } // ----------------------------------------------------------------- @@ -443,7 +443,7 @@ public class KBestPathsUnitTest extends BaseTest { Arrays.asList("G", "C", "1M"), Arrays.asList("G", "", "1D"), Arrays.asList("", "C", "1I"), - Arrays.asList("AAA", "CGT", "3D3I"), + Arrays.asList("AAA", "CGT", "3M"), Arrays.asList("TAT", "CAC", "3M"), Arrays.asList("GCTG", "GTCG", "4M"), Arrays.asList("AAAAA", "", "5D"), From 38662f1d475e5bc63edbb88cb300fbb24dd62f90 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 23 Apr 2013 14:01:43 -0400 Subject: [PATCH 201/226] Limiting access to the DT classes * Make most 
classes final, others package local * Move to diagnostics.diagnosetargets package * Aggregate statistics and walker classes on the same package for simplified visibility. * Make status list a LinkedList instead of a HashSet --- .../BaseCoverageDistribution.java | 2 +- .../{targets => }/FindCoveredIntervals.java | 2 +- .../CallableStatus.java | 4 ++-- .../DiagnoseTargets.java | 13 +++++-------- .../Interval.java | 8 ++------ .../IntervalStatistics.java | 19 +++++++------------ .../statistics => diagnosetargets}/Locus.java | 9 ++------- .../LocusCoverageGap.java | 9 ++------- .../LocusExcessiveCoverage.java | 9 ++------- .../LocusLowCoverage.java | 9 ++------- .../LocusPoorQuality.java | 9 ++------- .../LocusStatistics.java | 13 +++++-------- .../PluginUtils.java | 7 ++----- .../Sample.java | 8 ++------ .../SampleBadMates.java | 8 ++------ .../SampleNoReads.java | 8 ++------ .../SampleStatistics.java | 10 ++++------ .../ThresHolder.java | 7 ++----- ...seCoverageDistributionIntegrationTest.java | 2 +- .../DiagnoseTargetsIntegrationTest.java | 12 ++++++------ .../LocusStatisticsUnitTest.java | 6 +++--- 21 files changed, 57 insertions(+), 117 deletions(-) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => }/BaseCoverageDistribution.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => }/FindCoveredIntervals.java (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => diagnosetargets}/CallableStatus.java (98%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => diagnosetargets}/DiagnoseTargets.java (96%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/Interval.java (96%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => diagnosetargets}/IntervalStatistics.java (95%) rename 
protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/Locus.java (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/LocusCoverageGap.java (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/LocusExcessiveCoverage.java (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/LocusLowCoverage.java (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/LocusPoorQuality.java (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => diagnosetargets}/LocusStatistics.java (96%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/PluginUtils.java (97%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/Sample.java (96%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/SampleBadMates.java (96%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets/statistics => diagnosetargets}/SampleNoReads.java (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => diagnosetargets}/SampleStatistics.java (97%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => diagnosetargets}/ThresHolder.java (97%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => }/BaseCoverageDistributionIntegrationTest.java (99%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => diagnosetargets}/DiagnoseTargetsIntegrationTest.java (95%) rename 
protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/{targets => diagnosetargets}/LocusStatisticsUnitTest.java (97%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java index 53b7cebaa..417da9d79 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistribution.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java index 1c9751c5b..ad6023579 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java similarity index 98% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java index 32c0c339d..d38736f4f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * Short one line description of the walker. @@ -52,7 +52,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; * @author Mauricio Carneiro * @since 2/1/12 */ -public enum CallableStatus { +enum CallableStatus { PASS("the base satisfied the min. 
depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"), diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java similarity index 96% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index d90f2d1fa..7ecbe2f21 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.commandline.ArgumentCollection; @@ -54,9 +54,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Interval; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Locus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.classloader.PluginManager; @@ -259,7 +256,7 @@ public class DiagnoseTargets extends LocusWalker { VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); - vcb.filters(new HashSet(statusesToStrings(stats.callableStatuses(), true))); + vcb.filters(new LinkedHashSet(statusToStrings(stats.callableStatuses(), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage()); @@ -268,10 +265,10 @@ public class DiagnoseTargets extends LocusWalker { for (String sample : samples) { final GenotypeBuilder gb = new GenotypeBuilder(sample); - SampleStatistics sampleStat = stats.getSampleStatics(sample); + SampleStatistics sampleStat = stats.getSampleStatistics(sample); gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage()); - gb.filters(statusesToStrings(stats.getSampleStatics(sample).getCallableStatuses(), false)); + 
gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); genotypes.add(gb.make()); } @@ -286,7 +283,7 @@ public class DiagnoseTargets extends LocusWalker { * @param statuses the set of statuses to be converted * @return a matching set of strings */ - private List statusesToStrings(Set statuses, final boolean isInfoField) { + private List statusToStrings(List statuses, final boolean isInfoField) { List output = new ArrayList(statuses.size()); for (CallableStatus status : statuses) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Interval.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java similarity index 96% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Interval.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java index 3e8adc978..75f41edf9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Interval.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java @@ -44,11 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.IntervalStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * Created with IntelliJ IDEA. 
@@ -57,7 +53,7 @@ import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; * Time: 11:30 PM * To change this template use File | Settings | File Templates. */ -public interface Interval { +interface Interval { public void initialize(ThresHolder thresholds); public CallableStatus status (IntervalStatistics intervalStatistics); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java index 2e7333cc6..30cca8c5a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java @@ -44,21 +44,16 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Interval; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -public class IntervalStatistics { +import java.util.*; +final class IntervalStatistics { private final Map samples; private final GenomeLoc interval; private final ThresHolder thresholds; @@ -73,7 +68,7 @@ public class IntervalStatistics { this.samples.put(sample, new SampleStatistics(interval, thresholds)); } - public SampleStatistics getSampleStatics(String sample) { + public SampleStatistics getSampleStatistics(String sample) { return samples.get(sample); } @@ -126,13 +121,13 @@ public class IntervalStatistics { * * @return the callable status(es) for the whole interval */ - public Set callableStatuses() { - final Set output = new HashSet(); + public List callableStatuses() { + final List output = new LinkedList(); // sum up all the callable status for each sample final Map sampleStatusTally = new HashMap(CallableStatus.values().length); for (SampleStatistics sampleStatistics : samples.values()) { - for (CallableStatus status : sampleStatistics.getCallableStatuses()) { + for (CallableStatus status : sampleStatistics.callableStatuses()) { sampleStatusTally.put(status, !sampleStatusTally.containsKey(status) ? 
1 : sampleStatusTally.get(status) + 1); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Locus.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Locus.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java index aee41846b..5e6162fb6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Locus.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java @@ -44,12 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * Created with IntelliJ IDEA. @@ -58,7 +53,7 @@ import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; * Time: 11:29 PM * To change this template use File | Settings | File Templates. 
*/ -public interface Locus { +interface Locus { public void initialize(ThresHolder thresholds); public CallableStatus status (LocusStatistics locusStatistics); public CallableStatus sampleStatus (SampleStatistics sampleStatistics); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusCoverageGap.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusCoverageGap.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java index effdd33b0..d78109a86 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusCoverageGap.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java @@ -44,19 +44,14 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * User: carneiro * Date: 4/20/13 * Time: 11:44 PM */ -public class LocusCoverageGap implements Locus { +final class LocusCoverageGap implements Locus { private double threshold; private static final CallableStatus CALL = CallableStatus.COVERAGE_GAPS; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusExcessiveCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusExcessiveCoverage.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java index 72709a0f6..3bbb6b2d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusExcessiveCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java @@ -44,19 +44,14 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * User: carneiro * Date: 4/20/13 * Time: 11:44 PM */ -public class LocusExcessiveCoverage implements Locus { +final class LocusExcessiveCoverage implements Locus { private int excessiveCoverage; private double threshold; private static final CallableStatus CALL = CallableStatus.EXCESSIVE_COVERAGE ; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusLowCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusLowCoverage.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java index f3f181bd1..0f7d481c9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusLowCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java @@ -44,19 +44,14 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * User: carneiro * Date: 4/20/13 * Time: 11:44 PM */ -public class LocusLowCoverage implements Locus { +final class LocusLowCoverage implements Locus { private int minCoverage; private double threshold; private static final CallableStatus CALL = CallableStatus.LOW_COVERAGE ; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusPoorQuality.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusPoorQuality.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java index 91d5f8c04..3caf467ec 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/LocusPoorQuality.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java @@ -44,19 +44,14 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.LocusStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * User: carneiro * Date: 4/20/13 * Time: 11:44 PM */ -public class LocusPoorQuality implements Locus { +final class LocusPoorQuality implements Locus { private int minCoverage; private double threshold; private static final CallableStatus CALL = CallableStatus.POOR_QUALITY ; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java similarity index 96% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java index e85f3d9c1..543b126b4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java @@ -44,15 +44,12 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Locus; - -import java.util.HashSet; +import java.util.LinkedList; import java.util.List; -import java.util.Set; -public class LocusStatistics { +final class LocusStatistics { private int coverage; private int rawCoverage; private final List locusStatisticsList; @@ -80,8 +77,8 @@ public class LocusStatistics { * * @return a set of all statuses that apply */ - public Set callableStatuses() { - Set output = new HashSet(); + public List callableStatuses() { + List output = new LinkedList(); for (Locus stats : locusStatisticsList) { CallableStatus status = stats.status(this); if (status != null) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/PluginUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java similarity index 97% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/PluginUtils.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java index 0a29aea4d..2343b637e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/PluginUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java @@ -44,10 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import java.util.Map; @@ -56,7 +53,7 @@ import java.util.Map; * Date: 4/21/13 * Time: 11:23 AM */ -public class PluginUtils { +final class PluginUtils { public static CallableStatus genericSampleStatus (final SampleStatistics sampleStatistics, final CallableStatus CALL, final double threshold) { final Map totals = sampleStatistics.getLocusStatusTally(); final int size = sampleStatistics.getIntervalSize(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Sample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java similarity index 96% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Sample.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java index a33c33877..3b4e55347 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/Sample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java @@ -44,11 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * Created with IntelliJ IDEA. @@ -57,7 +53,7 @@ import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; * Time: 11:30 PM * To change this template use File | Settings | File Templates. */ -public interface Sample { +interface Sample { public void initialize(ThresHolder thresholds); public CallableStatus status (SampleStatistics sampleStatistics); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleBadMates.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java similarity index 96% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleBadMates.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java index 07e2cca5a..9c56858f6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleBadMates.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java @@ -44,18 +44,14 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * User: carneiro * Date: 4/20/13 * Time: 11:44 PM */ -public class SampleBadMates implements Sample { +final class SampleBadMates implements Sample { private static final CallableStatus CALL = CallableStatus.NO_READS ; private double threshold; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleNoReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleNoReads.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java index 30a6fbda3..95d66a555 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/statistics/SampleNoReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java @@ -44,18 +44,14 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics; - -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.CallableStatus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.SampleStatistics; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.ThresHolder; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; /** * User: carneiro * Date: 4/20/13 * Time: 11:44 PM */ -public class SampleNoReads implements Sample { +final class SampleNoReads implements Sample { private static final CallableStatus CALL = CallableStatus.NO_READS; private double votingThreshold; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java similarity index 97% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java index c05feebbd..6c8481b0e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java @@ -44,10 +44,8 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Locus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -58,7 +56,7 @@ import java.util.*; /** * The statistics calculator for a specific sample given the interval */ -public class SampleStatistics { +final class SampleStatistics { private final GenomeLoc interval; private final ArrayList loci; private final ThresHolder thresholds; @@ -129,8 +127,8 @@ public class SampleStatistics { * * @return the callable statuses of the entire sample */ - public Set getCallableStatuses() { - final Set output = new HashSet(); + public List callableStatuses() { + final List output = new LinkedList(); // get the tally of all the locus callable statuses for (Locus locusStat : thresholds.locusStatisticList) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java similarity index 97% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java index 3b7626708..c45c2d9ff 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java @@ -44,17 +44,14 @@ * 7.7 Governing Law. 
This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Interval; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Locus; -import org.broadinstitute.sting.gatk.walkers.diagnostics.targets.statistics.Sample; import java.util.LinkedList; import java.util.List; -public class ThresHolder { +final class ThresHolder { /** * Only bases with quality greater than this will be considered in the coverage metrics. diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistributionIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java similarity index 99% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistributionIntegrationTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java index 53153c100..27f140337 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/BaseCoverageDistributionIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistributionIntegrationTest.java @@ -44,7 +44,7 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics; import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.DataProvider; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java similarity index 95% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java index e5cea2c7b..bac09f30d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java @@ -44,23 +44,23 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; import java.util.Arrays; public class DiagnoseTargetsIntegrationTest extends WalkerTest { - final static String REF = b37KGReference; - final String singleSample = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; - final String multiSample = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; - final String L = validationDataLocation + "DT-itest.interval_list"; + final static String REF = BaseTest.b37KGReference; + final String singleSample = BaseTest.validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; + final String multiSample = BaseTest.validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + final String L = BaseTest.validationDataLocation + "DT-itest.interval_list"; private void DTTest(String testName, String args, String md5) { String base = String.format("-T DiagnoseTargets --no_cmdline_in_header -R %s -L %s", REF, L) + " -o %s "; WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5)); - //spec.disableShadowBCF(); executeTest(testName, spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java similarity index 97% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java index 96747619f..d784c2a9e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatisticsUnitTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java @@ -44,14 +44,14 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.Set; +import java.util.List; public class LocusStatisticsUnitTest { @@ -64,7 +64,7 @@ public class LocusStatisticsUnitTest { @Test(dataProvider = "StatusTestValues") public void testCallableStatuses(int coverage, int rawCoverage, CallableStatus status) { - Set statuses = new LocusStatistics(coverage, rawCoverage, thresholds).callableStatuses(); + List statuses = new LocusStatistics(coverage, rawCoverage, thresholds).callableStatuses(); Assert.assertTrue((status == null) ? statuses.isEmpty() : (statuses.contains(status) && statuses.size() == 1)); } From 75184614c66682be87c50101814be66366e8149f Mon Sep 17 00:00:00 2001 From: Jacob Silterra Date: Tue, 23 Apr 2013 14:03:48 -0400 Subject: [PATCH 202/226] Add additional necessary class files to na12878kb.jar target --- build.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build.xml b/build.xml index c70f85a28..12ebfa18f 100644 --- a/build.xml +++ b/build.xml @@ -674,8 +674,9 @@ - + + From 8f8f339e4bd4106f95552d86c6b0368d228729b8 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 23 Apr 2013 18:02:27 -0400 Subject: [PATCH 203/226] Abstract class for the statistics Addressing the code duplication issue raised by Mark. 
--- .../diagnosetargets/AbstractStatistics.java | 150 ++++++++++++++++++ .../diagnosetargets/DiagnoseTargets.java | 8 +- .../diagnostics/diagnosetargets/Interval.java | 4 +- .../diagnosetargets/IntervalStatistics.java | 53 ++----- .../diagnostics/diagnosetargets/Locus.java | 4 +- .../diagnosetargets/LocusCoverageGap.java | 3 +- .../LocusExcessiveCoverage.java | 3 +- .../diagnosetargets/LocusLowCoverage.java | 5 +- .../diagnosetargets/LocusPoorQuality.java | 3 +- .../diagnosetargets/LocusStatistics.java | 28 ++-- .../diagnosetargets/PluginUtils.java | 2 +- .../diagnostics/diagnosetargets/Sample.java | 4 +- .../diagnosetargets/SampleBadMates.java | 3 +- .../diagnosetargets/SampleNoReads.java | 9 +- .../diagnosetargets/SampleStatistics.java | 132 +++++---------- .../diagnosetargets/Statistic.java | 57 +++++++ .../diagnosetargets/ThresHolder.java | 6 +- 17 files changed, 304 insertions(+), 170 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStatistics.java create mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Statistic.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStatistics.java new file mode 100644 index 000000000..0ac083bb6 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStatistics.java @@ -0,0 +1,150 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +/** + * Generic code for Diagnose Target Statistics + * + * @author Mauricio Carneiro + * @since 4/23/13 + */ +abstract class AbstractStatistics { + + private long preComputedTotalCoverage = -1; + private Map statusTally = null; + protected ThresHolder thresholds; + + /** + * Calculates the average "good" coverage of this sample. Good means "passes the base and + * mapping quality requirements. + * + * @return the average "good" coverage + */ + public double averageCoverage(final int size) { + if (preComputedTotalCoverage < 0) + preComputedTotalCoverage = calculateTotalCoverage(getElements()); + return (double) preComputedTotalCoverage / size; + } + + /** + * Calculates the total "good" coverage of this sample. Good means "passes the base and + * mapping quality requirements. 
+ * + * @return the total "good" coverage across the interval for this sample + */ + public long getCoverage() { + if (preComputedTotalCoverage < 0) + preComputedTotalCoverage = calculateTotalCoverage(getElements()); + return preComputedTotalCoverage; + } + + + /** + * This is how the extending class will calculate it's own total coverage + * + * @return the total coverage + */ + private long calculateTotalCoverage(Iterable elements) { + long cov = 0; + for (AbstractStatistics element : elements) { + cov += element.getCoverage(); + } + return cov; + } + + /** + * What are the list of elements in your class? For example: + * + * IntervalStatistics => List + * SampleStatistics => List + * + * @return the corresponding list of elements of the extending class + */ + public abstract Iterable getElements(); + + /** + * Calculates the Callable statuses for the statistic as a whole (interval, sample or locus) + * + * @return the callable status(es) for the whole object + */ + public abstract Iterable callableStatuses(); + + + /** + * Tally up all the callable status of all the loci in this sample. + * + * @return a map of callable status and counts + */ + public Map getStatusTally() { + if (statusTally == null) { + statusTally = new HashMap(CallableStatus.values().length); + for (AbstractStatistics stats : getElements()) { + for (CallableStatus status : stats.callableStatuses()) { + statusTally.put(status, !statusTally.containsKey(status) ? 
1 : statusTally.get(status) + 1); + } + } + } + return statusTally; + } + + public static List queryStatus(List statList, AbstractStatistics stratification) { + List output = new LinkedList(); + for (Statistic stat : statList) { + final CallableStatus status = stat.status(stratification); + if (status != null) { + output.add(status); + } + } + return output; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index 7ecbe2f21..78a30a7bd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -259,14 +259,14 @@ public class DiagnoseTargets extends LocusWalker { vcb.filters(new LinkedHashSet(statusToStrings(stats.callableStatuses(), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); - attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage()); + attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size())); vcb = vcb.attributes(attributes); for (String sample : samples) { final GenotypeBuilder gb = new GenotypeBuilder(sample); SampleStatistics sampleStat = stats.getSampleStatistics(sample); - gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage()); + gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size())); gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); @@ -283,8 +283,8 @@ public class DiagnoseTargets extends LocusWalker { * @param statuses the set of statuses to be converted * @return a matching set of strings */ - private List statusToStrings(List statuses, final boolean isInfoField) { - List output = new ArrayList(statuses.size()); + private List statusToStrings(Iterable statuses, final boolean isInfoField) 
{ + List output = new LinkedList(); for (CallableStatus status : statuses) if ( isInfoField || status != CallableStatus.PASS ) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java index 75f41edf9..bd8307f89 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java @@ -53,7 +53,5 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Time: 11:30 PM * To change this template use File | Settings | File Templates. */ -interface Interval { - public void initialize(ThresHolder thresholds); - public CallableStatus status (IntervalStatistics intervalStatistics); +interface Interval extends Statistic { } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java index 30cca8c5a..1580ce9ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java @@ -53,23 +53,21 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.*; -final class IntervalStatistics { - private final Map samples; +final class IntervalStatistics extends AbstractStatistics{ + private final Map samples; private final GenomeLoc interval; private final ThresHolder thresholds; - private int preComputedTotalCoverage = -1; - public IntervalStatistics(Set samples, GenomeLoc interval, ThresHolder thresholds) { this.interval = interval; this.thresholds = thresholds; - this.samples = new 
HashMap(samples.size()); + this.samples = new HashMap(samples.size()); for (String sample : samples) this.samples.put(sample, new SampleStatistics(interval, thresholds)); } public SampleStatistics getSampleStatistics(String sample) { - return samples.get(sample); + return (SampleStatistics) samples.get(sample); } public GenomeLoc getInterval() { @@ -94,7 +92,7 @@ final class IntervalStatistics { for (Map.Entry entry : samplePileups.entrySet()) { String sample = entry.getKey(); ReadBackedPileup samplePileup = entry.getValue(); - SampleStatistics sampleStatistics = samples.get(sample); + SampleStatistics sampleStatistics = (SampleStatistics) samples.get(sample); if (sampleStatistics == null) throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); @@ -104,49 +102,30 @@ final class IntervalStatistics { } - public double averageCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return (double) preComputedTotalCoverage / interval.size(); - } - - private void calculateTotalCoverage() { - preComputedTotalCoverage = 0; - for (SampleStatistics sample : samples.values()) - preComputedTotalCoverage += sample.totalCoverage(); + /** + * {@inheritDoc} + */ + @Override + public Iterable getElements() { + return samples.values(); } /** - * Return the Callable statuses for the interval as a whole - * - * @return the callable status(es) for the whole interval + * {@inheritDoc} */ - public List callableStatuses() { + @Override + public Iterable callableStatuses() { final List output = new LinkedList(); - // sum up all the callable status for each sample - final Map sampleStatusTally = new HashMap(CallableStatus.values().length); - for (SampleStatistics sampleStatistics : samples.values()) { - for (CallableStatus status : sampleStatistics.callableStatuses()) { - sampleStatusTally.put(status, !sampleStatusTally.containsKey(status) ? 
1 : sampleStatusTally.get(status) + 1); - } - } - // check if any of the votes pass the threshold final int nSamples = getNSamples(); - for (Map.Entry entry : sampleStatusTally.entrySet()) { + for (Map.Entry entry : getStatusTally().entrySet()) { if ((double) entry.getValue() / nSamples > thresholds.votePercentageThreshold) { output.add(entry.getKey()); } } - // add the interval specific statitics statuses - for (Interval intervalStat : thresholds.intervalStatisticList) { - final CallableStatus status = intervalStat.status(this); - if (status != null) { - output.add(status); - } - } + output.addAll(queryStatus(thresholds.intervalStatisticList, this)); return output; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java index 5e6162fb6..f04a093fd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java @@ -53,8 +53,6 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Time: 11:29 PM * To change this template use File | Settings | File Templates. 
*/ -interface Locus { - public void initialize(ThresHolder thresholds); - public CallableStatus status (LocusStatistics locusStatistics); +interface Locus extends Statistic { public CallableStatus sampleStatus (SampleStatistics sampleStatistics); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java index d78109a86..d40816a34 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java @@ -61,7 +61,8 @@ final class LocusCoverageGap implements Locus { } @Override - public CallableStatus status(LocusStatistics locusStatistics) { + public CallableStatus status(AbstractStatistics statistics) { + final LocusStatistics locusStatistics = (LocusStatistics) statistics; return locusStatistics.getRawCoverage() == 0 ? CALL : null; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java index 3bbb6b2d8..ef1d7ffde 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java @@ -63,7 +63,8 @@ final class LocusExcessiveCoverage implements Locus { } @Override - public CallableStatus status(LocusStatistics locusStatistics) { + public CallableStatus status(AbstractStatistics statistics) { + final LocusStatistics locusStatistics = (LocusStatistics) statistics; return locusStatistics.getCoverage() > excessiveCoverage ? 
CALL : null; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java index 0f7d481c9..f421f5142 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java @@ -63,8 +63,9 @@ final class LocusLowCoverage implements Locus { } @Override - public CallableStatus status(LocusStatistics locusStatistics) { - final int raw = locusStatistics.getRawCoverage(); + public CallableStatus status(AbstractStatistics statistics) { + final LocusStatistics locusStatistics = (LocusStatistics) statistics; + final long raw = locusStatistics.getRawCoverage(); return raw > 0 && raw < minCoverage ? CALL: null; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java index 3caf467ec..042ebcbf8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java @@ -63,7 +63,8 @@ final class LocusPoorQuality implements Locus { } @Override - public CallableStatus status(LocusStatistics locusStatistics) { + public CallableStatus status(AbstractStatistics statistics) { + final LocusStatistics locusStatistics = (LocusStatistics) statistics; return locusStatistics.getCoverage() < minCoverage && locusStatistics.getRawCoverage() >= minCoverage ? 
CALL: null; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java index 543b126b4..b5a9373d5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java @@ -49,10 +49,10 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import java.util.LinkedList; import java.util.List; -final class LocusStatistics { - private int coverage; - private int rawCoverage; - private final List locusStatisticsList; +final class LocusStatistics extends AbstractStatistics{ + private long coverage; + private long rawCoverage; + private final List locusStatisticsList; public LocusStatistics(ThresHolder thresholds) { this(0,0,thresholds); @@ -64,12 +64,13 @@ final class LocusStatistics { this.locusStatisticsList = thresholds.locusStatisticList; } - public int getCoverage() { - return coverage; - } + @Override + public long getCoverage() {return coverage;} + public long getRawCoverage() {return rawCoverage;} - public int getRawCoverage() { - return rawCoverage; + public void addLocus(final int coverage, final int rawCoverage) { + this.coverage = coverage; + this.rawCoverage = rawCoverage; } /** @@ -79,7 +80,7 @@ final class LocusStatistics { */ public List callableStatuses() { List output = new LinkedList(); - for (Locus stats : locusStatisticsList) { + for (Statistic stats : locusStatisticsList) { CallableStatus status = stats.status(this); if (status != null) { output.add(status); @@ -88,8 +89,9 @@ final class LocusStatistics { return output; } - public void set(final int coverage, final int rawCoverage) { - this.coverage = coverage; - this.rawCoverage = rawCoverage; + @Override + public Iterable getElements() { + return 
null; } + } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java index 2343b637e..cb28e0ac5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java @@ -55,7 +55,7 @@ import java.util.Map; */ final class PluginUtils { public static CallableStatus genericSampleStatus (final SampleStatistics sampleStatistics, final CallableStatus CALL, final double threshold) { - final Map totals = sampleStatistics.getLocusStatusTally(); + final Map totals = sampleStatistics.getStatusTally(); final int size = sampleStatistics.getIntervalSize(); final int statusCount = totals.containsKey(CALL) ? totals.get(CALL) : 0; return ( (double) statusCount / size) >= threshold ? CALL: null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java index 3b4e55347..52b24520e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java @@ -53,7 +53,5 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Time: 11:30 PM * To change this template use File | Settings | File Templates. 
*/ -interface Sample { - public void initialize(ThresHolder thresholds); - public CallableStatus status (SampleStatistics sampleStatistics); +interface Sample extends Statistic { } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java index 9c56858f6..483f63c06 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java @@ -64,7 +64,8 @@ final class SampleBadMates implements Sample { } @Override - public CallableStatus status(SampleStatistics sampleStatistics) { + public CallableStatus status(AbstractStatistics statistics) { + final SampleStatistics sampleStatistics = (SampleStatistics) statistics; final int nReads = sampleStatistics.getnReads(); return nReads > 0 && (double) sampleStatistics.getnBadMates() / nReads > threshold ? 
CALL : null; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java index 95d66a555..1c6d3deb7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java @@ -53,16 +53,13 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; */ final class SampleNoReads implements Sample { private static final CallableStatus CALL = CallableStatus.NO_READS; - - private double votingThreshold; - - @Override +@Override public void initialize(ThresHolder thresholds) { - votingThreshold = thresholds.votePercentageThreshold; } @Override - public CallableStatus status(SampleStatistics sampleStatistics) { + public CallableStatus status(AbstractStatistics statistics) { + final SampleStatistics sampleStatistics = (SampleStatistics) statistics; return sampleStatistics.getnReads() == 0 ? 
CALL : null; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java index 6c8481b0e..4d41fea16 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java @@ -56,20 +56,18 @@ import java.util.*; /** * The statistics calculator for a specific sample given the interval */ -final class SampleStatistics { +final class SampleStatistics extends AbstractStatistics { private final GenomeLoc interval; - private final ArrayList loci; + private final ArrayList loci; private final ThresHolder thresholds; - // avoids re-calculating these sums over loci - private int preComputedTotalCoverage = -1; private Map locusStatusTally = null; private int nReads = -1; private int nBadMates = -1; public SampleStatistics(final GenomeLoc interval, final ThresHolder thresholds) { this.interval = interval; - this.loci = new ArrayList(interval.size()); + this.loci = new ArrayList(interval.size()); this.thresholds = thresholds; nReads = 0; nBadMates = 0; @@ -77,69 +75,61 @@ final class SampleStatistics { // Initialize every loci (this way we don't have to worry about non-existent loci in the object for (int i = 0; i < interval.size(); i++) this.loci.add(new LocusStatistics(thresholds)); - } /** - * Calculates the total "good" coverage of this sample. Good means "passes the base and - * mapping quality requirements. 
- * - * @return the total "good" coverage across the interval for this sample + * Simple Getters */ - public long totalCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return preComputedTotalCoverage; - } + public int getIntervalSize() {return interval.size();} + public int getnReads() {return nReads;} + public int getnBadMates() {return nBadMates;} /** - * Calculates the average "good" coverage of this sample. Good means "passes the base and - * mapping quality requirements. - * - * @return the average "good" coverage - */ - public double averageCoverage() { - return (double) totalCoverage() / loci.size(); - } - - /** - * Tally up all the callable status of all the loci in this sample. - * - * @return a map of callable status and counts - */ - public Map getLocusStatusTally() { - if (locusStatusTally == null) { - locusStatusTally = new HashMap(CallableStatus.values().length); - - // sum up all the callable statuses for each locus - for (int i = 0; i < interval.size(); i++) { - LocusStatistics locus = loci.get(i); - for (CallableStatus status : locus.callableStatuses()) { - locusStatusTally.put(status, !locusStatusTally.containsKey(status) ? 
1 : locusStatusTally.get(status) + 1); - } - } - } - return locusStatusTally; - } - - /** - * Calculates the callable statuses of the entire sample + * Adds a locus to the interval wide stats * - * @return the callable statuses of the entire sample + * @param locus The locus given as a GenomeLoc + * @param pileup The pileup of that locus, this exclusively contains the sample */ - public List callableStatuses() { + public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) { + if (!interval.containsP(locus)) + throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval)); + + // a null pileup means there nothing to add + if (pileup != null) { + final int locusIndex = locus.getStart() - interval.getStart(); + final int rawCoverage = pileup.depthOfCoverage(); + final int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.minimumBaseQuality, thresholds.minimumMappingQuality).depthOfCoverage(); + final LocusStatistics locusData = (LocusStatistics) loci.get(locusIndex); + locusData.addLocus(coverage, rawCoverage); + + // process all the reads in this pileup (tallying number of reads and bad mates) + for (GATKSAMRecord read : pileup.getReads()) + processRead(read); + } + } + + @Override + public Iterable getElements() { + return loci; + } + + /** + * {@inheritDoc} + */ + @Override + public Iterable callableStatuses() { final List output = new LinkedList(); // get the tally of all the locus callable statuses - for (Locus locusStat : thresholds.locusStatisticList) { - final CallableStatus status = locusStat.sampleStatus(this); + for (Statistic locusStat : thresholds.locusStatisticList) { + final CallableStatus status = ((Locus) locusStat).sampleStatus(this); if (status != null) { output.add(status); } } // get the sample specific statitics statuses - for (Sample sampleStat : thresholds.sampleStatisticList) { + for (Statistic sampleStat : thresholds.sampleStatisticList) { final CallableStatus status = 
sampleStat.status(this); if (status != null) { output.add(status); @@ -153,28 +143,6 @@ final class SampleStatistics { return output; } - /** - * Adds a locus to the interval wide stats - * - * @param locus The locus given as a GenomeLoc - * @param pileup The pileup of that locus, this exclusively contains the sample - */ - public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) { - if (!interval.containsP(locus)) - throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval)); - - // a null pileup means there nothing ot add - if (pileup != null) { - final int locusIndex = locus.getStart() - interval.getStart(); - final int rawCoverage = pileup.depthOfCoverage(); - final int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.minimumBaseQuality, thresholds.minimumMappingQuality).depthOfCoverage(); - final LocusStatistics locusData = loci.get(locusIndex); - locusData.set(coverage, rawCoverage); - - for (GATKSAMRecord read : pileup.getReads()) - processRead(read); - } - } /** * Account for the read and check it for any statistics necessary. 
Reads are marked in the temporary @@ -190,22 +158,4 @@ final class SampleStatistics { read.setTemporaryAttribute("seen", true); } } - - private void calculateTotalCoverage() { - preComputedTotalCoverage = 0; - for (LocusStatistics locus : loci) - preComputedTotalCoverage += locus.getCoverage(); - } - - public int getIntervalSize() { - return interval.size(); - } - - public int getnReads() { - return nReads; - } - - public int getnBadMates() { - return nBadMates; - } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Statistic.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Statistic.java new file mode 100644 index 000000000..c43b00a65 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Statistic.java @@ -0,0 +1,57 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; + +/** + * + * @author Mauricio Carneiro + * @since 4/23/13 + */ +interface Statistic { + public void initialize(ThresHolder thresholds); + public CallableStatus status (AbstractStatistics statistic); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java index c45c2d9ff..42c09dda1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java @@ -114,9 +114,9 @@ final class ThresHolder { @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false) public double qualityStatusThreshold = 0.50; - public final List locusStatisticList = new LinkedList(); - public final List sampleStatisticList = new LinkedList(); - public final List intervalStatisticList = new LinkedList(); + public final List locusStatisticList = new LinkedList(); + public final List sampleStatisticList = new LinkedList(); + public final List intervalStatisticList = new LinkedList(); public ThresHolder() {} From 2ab270cf3f5dcd785af8e4e16143a4cb7b8d97d4 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 23 Apr 2013 17:51:46 -0400 Subject: [PATCH 204/226] Corner case fix to General Ploidy SNP likelihood model. -- In case there are no informative bases in a pileup but pileup isn't empty (like when all bases have Q < min base quality) the GLs were still computed (but were all zeros) and fed to the exact model. 
Now, mimic case of diploid Gl computation where GLs are only added if # good bases > 0 -- I believe general case where only non-informative GLs are fed into AF calc model is broken and yields bogus QUAL, will investigate separately. --- .../GeneralPloidySNPGenotypeLikelihoods.java | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java index 14bffbc34..f19057f29 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java @@ -227,7 +227,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi * @param capBaseQualsAtMappingQual Cap base at mapping qual * @param minBaseQual Minimum base quality to consider * @param errorModel Site error model - * @return Number of bases added + * @return Number of bases added - only good bases actually added to GLs are counted. 
*/ private int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual, ErrorModel errorModel) { // Number of [A C G T]'s in pileup, in that order @@ -235,28 +235,29 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi for (byte b: BaseUtils.BASES) numSeenBases.add(0); - if (hasReferenceSampleData) { - // count number of elements in pileup - for (PileupElement elt : pileup) { - byte obsBase = elt.getBase(); - byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - if ( qual == 0 ) - continue; - - int idx = 0; - - for (byte base:BaseUtils.BASES) { - int cnt = numSeenBases.get(idx); - numSeenBases.set(idx++,cnt + (base == obsBase?1:0)); - - } - + int nGoodBases = 0; + // count number of elements in pileup + for (PileupElement elt : pileup) { + byte obsBase = elt.getBase(); + byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + if ( qual == 0 ) + continue; + + int idx = 0; + + for (byte base:BaseUtils.BASES) { + int cnt = numSeenBases.get(idx); + numSeenBases.set(idx++,cnt + (base == obsBase?1:0)); + } - if (VERBOSE) - System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3)); + nGoodBases++; } + + if (VERBOSE) + System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3)); + computeLikelihoods(errorModel, myAlleles, numSeenBases, pileup); - return pileup.getNumberOfElements(); + return nGoodBases; } /** @@ -281,7 +282,8 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi double p1 = 0.0; if (!hasReferenceSampleData) { - // no error model: loop throught pileup to compute likalihoods just on base qualities + // no error model: loop through pileup to compute likelihoods just on base qualities + // In this case, vector numObservations is not used directly for GL 
computation for (final PileupElement elt : pileup) { final byte obsBase = elt.getBase(); final byte qual = qualToUse(elt, true, true, mbq); From df90597bfc28fbbf978f586ec229987fc60bd6cf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Apr 2013 17:31:59 -0400 Subject: [PATCH 205/226] Performance optimizations and caliper benchmarking code for consolidateCigar -- Now that this function is used in the core of LIBS it needed some basic optimizations, which are now complete, pass all unit tests. -- Added caliper benchmark for AlignmentUtils to assess performance (showing new version is 3x-10x faster) -- Remove unused import in ReadStateManager --- .../utils/locusiterator/ReadStateManager.java | 1 - .../sting/utils/sam/AlignmentUtils.java | 29 +++++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java index 8fbd302a8..9728bdb1c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils.locusiterator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; -import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 2208302fb..e48d1ca4c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -581,8 +581,11 @@ public final class AlignmentUtils { */ @Ensures({"result != null"}) public static Cigar 
consolidateCigar( final Cigar c ) { - if( c == null ) { throw new IllegalArgumentException("Cigar cannot be null"); } - if( c.isEmpty() ) { return c; } + if ( c == null ) { throw new IllegalArgumentException("Cigar cannot be null"); } + + // fast check to determine if there's anything worth doing before we create new Cigar and actually do some work + if ( ! needsConsolidation(c) ) + return c; final Cigar returnCigar = new Cigar(); int sumLength = 0; @@ -601,13 +604,33 @@ public final class AlignmentUtils { lastElement = cur; } - if( sumLength > 0 ) { + if ( sumLength > 0 ) { returnCigar.add(new CigarElement(sumLength, lastElement.getOperator())); } return returnCigar; } + /** + * Does the cigar C need to be consolidated? + * + * @param c a non-null cigar + * @return true if so + */ + private static boolean needsConsolidation(final Cigar c) { + if ( c.numCigarElements() <= 1 ) + return false; // fast path for empty or single cigar + + CigarOperator lastOp = null; + for( final CigarElement cur : c.getCigarElements() ) { + if ( cur.getLength() == 0 || lastOp == cur.getOperator() ) + return true; + lastOp = cur.getOperator(); + } + + return false; + } + /** * Takes the alignment of the read sequence readSeq to the reference sequence refSeq * starting at 0-based position refIndex on the refSeq and specified by its cigar. From 80131ac99641c5d60869392e2bff8625e59334d7 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 24 Apr 2013 11:41:32 -0400 Subject: [PATCH 206/226] Adding the 1000G_phase1.snps.high_confidence callset to the GATK resource bundle for use in the April 2013 updated best practices. 
--- .../gatk/walkers/haplotypecaller/DeBruijnAssembler.java | 8 ++++---- .../gatk/walkers/haplotypecaller/HaplotypeCaller.java | 2 +- .../walkers/variantrecalibration/VariantRecalibrator.java | 2 +- .../sting/queue/qscripts/GATKResourcesBundle.scala | 3 +++ 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 5ce65e13f..0e1d49d81 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -81,7 +81,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers - // TODO -- this number is very low, and limits our ability to explore low-frequnecy variants. It should + // TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25; @@ -187,10 +187,10 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { // TODO -- we need to come up with a consistent pruning algorithm. 
The current pruning algorithm // TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect - // TODO -- to anything from one that's actuall has good support along the chain but just happens + // TODO -- to anything from one that's actually has good support along the chain but just happens // TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately // TODO -- the pruning algorithm really should be an error correction algorithm that knows more - // TODO -- about the structure of the data and can differeniate between an infrequent path but + // TODO -- about the structure of the data and can differentiate between an infrequent path but // TODO -- without evidence against it (such as occurs when a region is hard to get any reads through) // TODO -- from a error with lots of weight going along another similar path // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive @@ -216,7 +216,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { seqGraph.removePathsNotConnectedToRef(); seqGraph.simplifyGraph(); if ( seqGraph.vertexSet().size() == 1 ) { - // we've prefectly assembled into a single reference haplotype, add a empty seq vertex to stop + // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop // the code from blowing up. 
// TODO -- ref properties should really be on the vertices, not the graph itself final SeqVertex complete = seqGraph.vertexSet().iterator().next(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index a17e25f41..6ea543f25 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -308,7 +308,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected boolean useLowQualityBasesForAssembly = false; @Hidden - @Argument(fullName="dontTrimActiveRegions", shortName="donTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) + @Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false) protected boolean dontTrimActiveRegions = false; @Hidden diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index bee695e2a..824ef1f6e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -173,7 +173,7 @@ public class VariantRecalibrator extends RodWalker Date: Wed, 24 Apr 2013 14:15:49 -0400 Subject: [PATCH 207/226] Split class names into stratification and metrics Calling everything statistics was very confusing. 
Diagnose Targets stratifies the data three ways: Interval, Sample and Locus. Each stratification then has it's own set of metrics (plugin system) to calculate -- LocusMetric, SampleMetric, IntervalMetric. Metrics are generalized by the Metric interface. (for generic access) Stratifications are generalized by the AbstractStratification abstract class. (to aggressively limit code duplication) --- ...stics.java => AbstractStratification.java} | 14 ++++---- .../diagnosetargets/DiagnoseTargets.java | 36 +++++++++---------- .../{Sample.java => IntervalMetric.java} | 2 +- ...stics.java => IntervalStratification.java} | 24 ++++++------- .../{Locus.java => LocusMetric.java} | 4 +-- ...geGap.java => LocusMetricCoverageGap.java} | 12 +++---- ...java => LocusMetricExcessiveCoverage.java} | 12 +++---- ...erage.java => LocusMetricLowCoverage.java} | 12 +++---- ...ality.java => LocusMetricPoorQuality.java} | 12 +++---- ...atistics.java => LocusStratification.java} | 14 ++++---- .../{Statistic.java => Metric.java} | 4 +-- .../diagnosetargets/PluginUtils.java | 6 ++-- .../{Interval.java => SampleMetric.java} | 2 +- ...adMates.java => SampleMetricBadMates.java} | 10 +++--- ...eNoReads.java => SampleMetricNoReads.java} | 8 ++--- ...tistics.java => SampleStratification.java} | 25 ++++++------- .../diagnosetargets/ThresHolder.java | 6 ++-- .../LocusStatisticsUnitTest.java | 2 +- 18 files changed, 103 insertions(+), 102 deletions(-) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{AbstractStatistics.java => AbstractStratification.java} (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{Sample.java => IntervalMetric.java} (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{IntervalStatistics.java => IntervalStratification.java} (92%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{Locus.java => 
LocusMetric.java} (98%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{LocusCoverageGap.java => LocusMetricCoverageGap.java} (94%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{LocusExcessiveCoverage.java => LocusMetricExcessiveCoverage.java} (94%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{LocusLowCoverage.java => LocusMetricLowCoverage.java} (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{LocusPoorQuality.java => LocusMetricPoorQuality.java} (94%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{LocusStatistics.java => LocusStratification.java} (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{Statistic.java => Metric.java} (98%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{Interval.java => SampleMetric.java} (99%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{SampleBadMates.java => SampleMetricBadMates.java} (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{SampleNoReads.java => SampleMetricNoReads.java} (96%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/{SampleStatistics.java => SampleStratification.java} (93%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStatistics.java rename to 
protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java index 0ac083bb6..dca83af44 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java @@ -57,7 +57,7 @@ import java.util.Map; * @author Mauricio Carneiro * @since 4/23/13 */ -abstract class AbstractStatistics { +abstract class AbstractStratification { private long preComputedTotalCoverage = -1; private Map statusTally = null; @@ -93,9 +93,9 @@ abstract class AbstractStatistics { * * @return the total coverage */ - private long calculateTotalCoverage(Iterable elements) { + private long calculateTotalCoverage(Iterable elements) { long cov = 0; - for (AbstractStatistics element : elements) { + for (AbstractStratification element : elements) { cov += element.getCoverage(); } return cov; @@ -109,7 +109,7 @@ abstract class AbstractStatistics { * * @return the corresponding list of elements of the extending class */ - public abstract Iterable getElements(); + public abstract Iterable getElements(); /** * Calculates the Callable statuses for the statistic as a whole (interval, sample or locus) @@ -127,7 +127,7 @@ abstract class AbstractStatistics { public Map getStatusTally() { if (statusTally == null) { statusTally = new HashMap(CallableStatus.values().length); - for (AbstractStatistics stats : getElements()) { + for (AbstractStratification stats : getElements()) { for (CallableStatus status : stats.callableStatuses()) { statusTally.put(status, !statusTally.containsKey(status) ? 
1 : statusTally.get(status) + 1); } @@ -136,9 +136,9 @@ abstract class AbstractStatistics { return statusTally; } - public static List queryStatus(List statList, AbstractStatistics stratification) { + public static List queryStatus(List statList, AbstractStratification stratification) { List output = new LinkedList(); - for (Statistic stat : statList) { + for (Metric stat : statList) { final CallableStatus status = stat.status(stratification); if (status != null) { output.add(status); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index 78a30a7bd..32f87b973 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -119,7 +119,7 @@ public class DiagnoseTargets extends LocusWalker { @ArgumentCollection private ThresHolder thresholds = new ThresHolder(); - private Map intervalMap = null; // maps each interval => statistics + private Map intervalMap = null; // maps each interval => statistics private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome private Set samples = null; // all the samples being processed private static final Allele SYMBOLIC_ALLELE = Allele.create("
        ", false); // avoid creating the symbolic allele multiple times @@ -134,7 +134,7 @@ public class DiagnoseTargets extends LocusWalker { if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty()) throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead."); - intervalMap = new HashMap(INITIAL_HASH_SIZE); + intervalMap = new HashMap(INITIAL_HASH_SIZE); intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); // get all of the unique sample names for the VCF Header @@ -155,8 +155,8 @@ public class DiagnoseTargets extends LocusWalker { addNewOverlappingIntervals(refLocus); // at this point, all intervals in intervalMap overlap with this locus, so update all of them - for (IntervalStatistics intervalStatistics : intervalMap.values()) - intervalStatistics.addLocus(context); + for (IntervalStratification intervalStratification : intervalMap.values()) + intervalStratification.addLocus(context); return 1L; } @@ -207,7 +207,7 @@ public class DiagnoseTargets extends LocusWalker { // output empty statistics for uncovered intervals while (interval != null && interval.isBefore(refLocus)) { - final IntervalStatistics stats = intervalMap.get(interval); + final IntervalStratification stats = intervalMap.get(interval); outputStatsToVCF(stats != null ? 
stats : createIntervalStatistic(interval), UNCOVERED_ALLELE); if (stats != null) intervalMap.remove(interval); intervalListIterator.next(); @@ -243,7 +243,7 @@ public class DiagnoseTargets extends LocusWalker { * @param stats The statistics of the interval * @param refAllele the reference allele */ - private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) { + private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) { GenomeLoc interval = stats.getInterval(); @@ -265,7 +265,7 @@ public class DiagnoseTargets extends LocusWalker { for (String sample : samples) { final GenotypeBuilder gb = new GenotypeBuilder(sample); - SampleStatistics sampleStat = stats.getSampleStatistics(sample); + SampleStratification sampleStat = stats.getSampleStatistics(sample); gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size())); gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); @@ -293,36 +293,36 @@ public class DiagnoseTargets extends LocusWalker { return output; } - private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { - return new IntervalStatistics(samples, interval, thresholds); + private IntervalStratification createIntervalStatistic(GenomeLoc interval) { + return new IntervalStratification(samples, interval, thresholds); } protected static void loadAllPlugins(final ThresHolder thresholds) { - for (Class stat : new PluginManager(Locus.class).getPlugins()) { + for (Class stat : new PluginManager(LocusMetric.class).getPlugins()) { try { - final Locus stats = (Locus) stat.newInstance(); + final LocusMetric stats = (LocusMetric) stat.newInstance(); stats.initialize(thresholds); - thresholds.locusStatisticList.add(stats); + thresholds.locusMetricList.add(stats); } catch (Exception e) { throw new DynamicClassResolutionException(stat, e); } } - for (Class stat : new PluginManager(Sample.class).getPlugins()) { + for (Class stat : new 
PluginManager(SampleMetric.class).getPlugins()) { try { - final Sample stats = (Sample) stat.newInstance(); + final SampleMetric stats = (SampleMetric) stat.newInstance(); stats.initialize(thresholds); - thresholds.sampleStatisticList.add(stats); + thresholds.sampleMetricList.add(stats); } catch (Exception e) { throw new DynamicClassResolutionException(stat, e); } } - for (Class stat : new PluginManager(Interval.class).getPlugins()) { + for (Class stat : new PluginManager(IntervalMetric.class).getPlugins()) { try { - final Interval stats = (Interval) stat.newInstance(); + final IntervalMetric stats = (IntervalMetric) stat.newInstance(); stats.initialize(thresholds); - thresholds.intervalStatisticList.add(stats); + thresholds.intervalMetricList.add(stats); } catch (Exception e) { throw new DynamicClassResolutionException(stat, e); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java index 52b24520e..50470a744 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Sample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java @@ -53,5 +53,5 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Time: 11:30 PM * To change this template use File | Settings | File Templates. 
*/ -interface Sample extends Statistic { +interface IntervalMetric extends Metric { } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java similarity index 92% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java index 1580ce9ac..6c20403d1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java @@ -53,21 +53,21 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.*; -final class IntervalStatistics extends AbstractStatistics{ - private final Map samples; +final class IntervalStratification extends AbstractStratification { + private final Map samples; private final GenomeLoc interval; private final ThresHolder thresholds; - public IntervalStatistics(Set samples, GenomeLoc interval, ThresHolder thresholds) { + public IntervalStratification(Set samples, GenomeLoc interval, ThresHolder thresholds) { this.interval = interval; this.thresholds = thresholds; - this.samples = new HashMap(samples.size()); + this.samples = new HashMap(samples.size()); for (String sample : samples) - this.samples.put(sample, new SampleStatistics(interval, thresholds)); + this.samples.put(sample, new SampleStratification(interval, thresholds)); } - public SampleStatistics getSampleStatistics(String sample) { - return (SampleStatistics) samples.get(sample); + public SampleStratification getSampleStatistics(String sample) { + return (SampleStratification) samples.get(sample); } public GenomeLoc getInterval() { @@ 
-92,12 +92,12 @@ final class IntervalStatistics extends AbstractStatistics{ for (Map.Entry entry : samplePileups.entrySet()) { String sample = entry.getKey(); ReadBackedPileup samplePileup = entry.getValue(); - SampleStatistics sampleStatistics = (SampleStatistics) samples.get(sample); + SampleStratification sampleStratification = (SampleStratification) samples.get(sample); - if (sampleStatistics == null) + if (sampleStratification == null) throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); - sampleStatistics.addLocus(context.getLocation(), samplePileup); + sampleStratification.addLocus(context.getLocation(), samplePileup); } } @@ -106,7 +106,7 @@ final class IntervalStatistics extends AbstractStatistics{ * {@inheritDoc} */ @Override - public Iterable getElements() { + public Iterable getElements() { return samples.values(); } @@ -125,7 +125,7 @@ final class IntervalStatistics extends AbstractStatistics{ } } - output.addAll(queryStatus(thresholds.intervalStatisticList, this)); + output.addAll(queryStatus(thresholds.intervalMetricList, this)); return output; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java similarity index 98% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java index f04a093fd..9950b4e2d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Locus.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java @@ -53,6 +53,6 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Time: 11:29 PM * To change this 
template use File | Settings | File Templates. */ -interface Locus extends Statistic { - public CallableStatus sampleStatus (SampleStatistics sampleStatistics); +interface LocusMetric extends Metric { + public CallableStatus sampleStatus (SampleStratification sampleStratification); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java similarity index 94% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java index d40816a34..0973fef1e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusCoverageGap.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java @@ -51,7 +51,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Date: 4/20/13 * Time: 11:44 PM */ -final class LocusCoverageGap implements Locus { +final class LocusMetricCoverageGap implements LocusMetric { private double threshold; private static final CallableStatus CALL = CallableStatus.COVERAGE_GAPS; @@ -61,13 +61,13 @@ final class LocusCoverageGap implements Locus { } @Override - public CallableStatus status(AbstractStatistics statistics) { - final LocusStatistics locusStatistics = (LocusStatistics) statistics; - return locusStatistics.getRawCoverage() == 0 ? CALL : null; + public CallableStatus status(AbstractStratification statistics) { + final LocusStratification locusStratification = (LocusStratification) statistics; + return locusStratification.getRawCoverage() == 0 ? 
CALL : null; } @Override - public CallableStatus sampleStatus(SampleStatistics sampleStatistics) { - return PluginUtils.genericSampleStatus(sampleStatistics, CALL, threshold); + public CallableStatus sampleStatus(SampleStratification sampleStratification) { + return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java similarity index 94% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java index ef1d7ffde..fbedc5404 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusExcessiveCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java @@ -51,7 +51,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Date: 4/20/13 * Time: 11:44 PM */ -final class LocusExcessiveCoverage implements Locus { +final class LocusMetricExcessiveCoverage implements LocusMetric { private int excessiveCoverage; private double threshold; private static final CallableStatus CALL = CallableStatus.EXCESSIVE_COVERAGE ; @@ -63,13 +63,13 @@ final class LocusExcessiveCoverage implements Locus { } @Override - public CallableStatus status(AbstractStatistics statistics) { - final LocusStatistics locusStatistics = (LocusStatistics) statistics; - return locusStatistics.getCoverage() > excessiveCoverage ? 
CALL : null; + public CallableStatus status(AbstractStratification statistics) { + final LocusStratification locusStratification = (LocusStratification) statistics; + return locusStratification.getCoverage() > excessiveCoverage ? CALL : null; } @Override - public CallableStatus sampleStatus(SampleStatistics sampleStatistics) { - return PluginUtils.genericSampleStatus(sampleStatistics, CALL, threshold); + public CallableStatus sampleStatus(SampleStratification sampleStratification) { + return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java index f421f5142..5b5015beb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusLowCoverage.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java @@ -51,7 +51,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Date: 4/20/13 * Time: 11:44 PM */ -final class LocusLowCoverage implements Locus { +final class LocusMetricLowCoverage implements LocusMetric { private int minCoverage; private double threshold; private static final CallableStatus CALL = CallableStatus.LOW_COVERAGE ; @@ -63,14 +63,14 @@ final class LocusLowCoverage implements Locus { } @Override - public CallableStatus status(AbstractStatistics statistics) { - final LocusStatistics locusStatistics = (LocusStatistics) statistics; - final long raw = locusStatistics.getRawCoverage(); + public CallableStatus 
status(AbstractStratification statistics) { + final LocusStratification locusStratification = (LocusStratification) statistics; + final long raw = locusStratification.getRawCoverage(); return raw > 0 && raw < minCoverage ? CALL: null; } @Override - public CallableStatus sampleStatus(SampleStatistics sampleStatistics) { - return PluginUtils.genericSampleStatus(sampleStatistics, CALL, threshold); + public CallableStatus sampleStatus(SampleStratification sampleStratification) { + return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java similarity index 94% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java index 042ebcbf8..53c07d421 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusPoorQuality.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java @@ -51,7 +51,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Date: 4/20/13 * Time: 11:44 PM */ -final class LocusPoorQuality implements Locus { +final class LocusMetricPoorQuality implements LocusMetric { private int minCoverage; private double threshold; private static final CallableStatus CALL = CallableStatus.POOR_QUALITY ; @@ -63,13 +63,13 @@ final class LocusPoorQuality implements Locus { } @Override - public CallableStatus status(AbstractStatistics statistics) { - final LocusStatistics locusStatistics = (LocusStatistics) statistics; - return locusStatistics.getCoverage() < minCoverage && locusStatistics.getRawCoverage() >= 
minCoverage ? CALL: null; + public CallableStatus status(AbstractStratification statistics) { + final LocusStratification locusStratification = (LocusStratification) statistics; + return locusStratification.getCoverage() < minCoverage && locusStratification.getRawCoverage() >= minCoverage ? CALL: null; } @Override - public CallableStatus sampleStatus(SampleStatistics sampleStatistics) { - return PluginUtils.genericSampleStatus(sampleStatistics, CALL, threshold); + public CallableStatus sampleStatus(SampleStratification sampleStratification) { + return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java index b5a9373d5..d6acaf850 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java @@ -49,19 +49,19 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import java.util.LinkedList; import java.util.List; -final class LocusStatistics extends AbstractStatistics{ +final class LocusStratification extends AbstractStratification { private long coverage; private long rawCoverage; - private final List locusStatisticsList; + private final List locusStatisticsList; - public LocusStatistics(ThresHolder thresholds) { + public LocusStratification(ThresHolder thresholds) { this(0,0,thresholds); } - protected LocusStatistics(int coverage, int rawCoverage, ThresHolder 
thresholds) { + protected LocusStratification(int coverage, int rawCoverage, ThresHolder thresholds) { this.coverage = coverage; this.rawCoverage = rawCoverage; - this.locusStatisticsList = thresholds.locusStatisticList; + this.locusStatisticsList = thresholds.locusMetricList; } @Override @@ -80,7 +80,7 @@ final class LocusStatistics extends AbstractStatistics{ */ public List callableStatuses() { List output = new LinkedList(); - for (Statistic stats : locusStatisticsList) { + for (Metric stats : locusStatisticsList) { CallableStatus status = stats.status(this); if (status != null) { output.add(status); @@ -90,7 +90,7 @@ final class LocusStatistics extends AbstractStatistics{ } @Override - public Iterable getElements() { + public Iterable getElements() { return null; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Statistic.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java similarity index 98% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Statistic.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java index c43b00a65..6f13b9cac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Statistic.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java @@ -51,7 +51,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * @author Mauricio Carneiro * @since 4/23/13 */ -interface Statistic { +interface Metric { public void initialize(ThresHolder thresholds); - public CallableStatus status (AbstractStatistics statistic); + public CallableStatus status (AbstractStratification statistic); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java index cb28e0ac5..1085e8cac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java @@ -54,9 +54,9 @@ import java.util.Map; * Time: 11:23 AM */ final class PluginUtils { - public static CallableStatus genericSampleStatus (final SampleStatistics sampleStatistics, final CallableStatus CALL, final double threshold) { - final Map totals = sampleStatistics.getStatusTally(); - final int size = sampleStatistics.getIntervalSize(); + public static CallableStatus genericSampleStatus (final SampleStratification sampleStratification, final CallableStatus CALL, final double threshold) { + final Map totals = sampleStratification.getStatusTally(); + final int size = sampleStratification.getIntervalSize(); final int statusCount = totals.containsKey(CALL) ? totals.get(CALL) : 0; return ( (double) statusCount / size) >= threshold ? CALL: null; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java similarity index 99% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java index bd8307f89..8de33b269 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Interval.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java @@ -53,5 +53,5 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Time: 11:30 PM * To change this template use File | Settings | File Templates. 
*/ -interface Interval extends Statistic { +interface SampleMetric extends Metric { } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java index 483f63c06..cf5aac4a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleBadMates.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java @@ -51,7 +51,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Date: 4/20/13 * Time: 11:44 PM */ -final class SampleBadMates implements Sample { +final class SampleMetricBadMates implements SampleMetric { private static final CallableStatus CALL = CallableStatus.NO_READS ; private double threshold; @@ -64,10 +64,10 @@ final class SampleBadMates implements Sample { } @Override - public CallableStatus status(AbstractStatistics statistics) { - final SampleStatistics sampleStatistics = (SampleStatistics) statistics; - final int nReads = sampleStatistics.getnReads(); - return nReads > 0 && (double) sampleStatistics.getnBadMates() / nReads > threshold ? CALL : null; + public CallableStatus status(AbstractStratification statistics) { + final SampleStratification sampleStratification = (SampleStratification) statistics; + final int nReads = sampleStratification.getnReads(); + return nReads > 0 && (double) sampleStratification.getnBadMates() / nReads > threshold ? 
CALL : null; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java similarity index 96% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java index 1c6d3deb7..bf9e7420d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleNoReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java @@ -51,16 +51,16 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; * Date: 4/20/13 * Time: 11:44 PM */ -final class SampleNoReads implements Sample { +final class SampleMetricNoReads implements SampleMetric { private static final CallableStatus CALL = CallableStatus.NO_READS; @Override public void initialize(ThresHolder thresholds) { } @Override - public CallableStatus status(AbstractStatistics statistics) { - final SampleStatistics sampleStatistics = (SampleStatistics) statistics; - return sampleStatistics.getnReads() == 0 ? CALL : null; + public CallableStatus status(AbstractStratification statistics) { + final SampleStratification sampleStratification = (SampleStratification) statistics; + return sampleStratification.getnReads() == 0 ? 
CALL : null; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java similarity index 93% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java index 4d41fea16..b9ae1f3cf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStatistics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java @@ -51,30 +51,31 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.*; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; /** * The statistics calculator for a specific sample given the interval */ -final class SampleStatistics extends AbstractStatistics { +final class SampleStratification extends AbstractStratification { private final GenomeLoc interval; - private final ArrayList loci; + private final ArrayList loci; private final ThresHolder thresholds; - private Map locusStatusTally = null; private int nReads = -1; private int nBadMates = -1; - public SampleStatistics(final GenomeLoc interval, final ThresHolder thresholds) { + public SampleStratification(final GenomeLoc interval, final ThresHolder thresholds) { this.interval = interval; - this.loci = new ArrayList(interval.size()); + this.loci = new ArrayList(interval.size()); this.thresholds = thresholds; nReads = 0; nBadMates = 0; // Initialize every loci (this way we don't have to worry about non-existent loci in the object for (int i = 0; i < 
interval.size(); i++) - this.loci.add(new LocusStatistics(thresholds)); + this.loci.add(new LocusStratification(thresholds)); } /** @@ -99,7 +100,7 @@ final class SampleStatistics extends AbstractStatistics { final int locusIndex = locus.getStart() - interval.getStart(); final int rawCoverage = pileup.depthOfCoverage(); final int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.minimumBaseQuality, thresholds.minimumMappingQuality).depthOfCoverage(); - final LocusStatistics locusData = (LocusStatistics) loci.get(locusIndex); + final LocusStratification locusData = (LocusStratification) loci.get(locusIndex); locusData.addLocus(coverage, rawCoverage); // process all the reads in this pileup (tallying number of reads and bad mates) @@ -109,7 +110,7 @@ final class SampleStatistics extends AbstractStatistics { } @Override - public Iterable getElements() { + public Iterable getElements() { return loci; } @@ -121,15 +122,15 @@ final class SampleStatistics extends AbstractStatistics { final List output = new LinkedList(); // get the tally of all the locus callable statuses - for (Statistic locusStat : thresholds.locusStatisticList) { - final CallableStatus status = ((Locus) locusStat).sampleStatus(this); + for (Metric locusStat : thresholds.locusMetricList) { + final CallableStatus status = ((LocusMetric) locusStat).sampleStatus(this); if (status != null) { output.add(status); } } // get the sample specific statitics statuses - for (Statistic sampleStat : thresholds.sampleStatisticList) { + for (Metric sampleStat : thresholds.sampleMetricList) { final CallableStatus status = sampleStat.status(this); if (status != null) { output.add(status); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java index 42c09dda1..b0c999460 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java @@ -114,9 +114,9 @@ final class ThresHolder { @Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false) public double qualityStatusThreshold = 0.50; - public final List locusStatisticList = new LinkedList(); - public final List sampleStatisticList = new LinkedList(); - public final List intervalStatisticList = new LinkedList(); + public final List locusMetricList = new LinkedList(); + public final List sampleMetricList = new LinkedList(); + public final List intervalMetricList = new LinkedList(); public ThresHolder() {} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java index d784c2a9e..fe3010e02 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStatisticsUnitTest.java @@ -64,7 +64,7 @@ public class LocusStatisticsUnitTest { @Test(dataProvider = "StatusTestValues") public void testCallableStatuses(int coverage, int rawCoverage, CallableStatus status) { - List statuses = new LocusStatistics(coverage, rawCoverage, thresholds).callableStatuses(); + List statuses = new LocusStratification(coverage, rawCoverage, thresholds).callableStatuses(); Assert.assertTrue((status == null) ? 
statuses.isEmpty() : (statuses.contains(status) && statuses.size() == 1)); } From 379a9841ce8bad91313c974139de1ef00fe2d90b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 17 Apr 2013 12:45:09 -0400 Subject: [PATCH 208/226] Various bug fixes for recent Reduce Reads additions plus solution implemented for low MQ reads. 1. Using cumulative binomial probability was not working at high coverage sites (because p-values quickly got out of hand) so instead we use a hybrid system for determining significance: at low coverage sites use binomial prob and at high coverage sites revert to using the old base proportions. Then we get the best of both worlds. As a note, coverage refers to just the individual base counts and not the entire pileup. 2. Reads were getting lost because of the comparator being used in the SlidingWindow. When read pairs had the same alignment end position the 2nd one encountered would get dropped (but added to the header!). We now use a PriorityQueue instead of a TreeSet to allow for such cases. 3. Each consensus keeps track of its own number of softclipped bases. There was no reason that that number should be shared between them. 4. We output consensus filtered (i.e. low MQ) reads whenever they are present for now. Don't lose that information. Maybe we'll decide to change this in the future, but for now we are conservative. 5. Also implemented various small performance optimizations based on profiling. Added unit tests to cover these changes; systematic assessment now tests against low MQ reads too. 
--- .../reducereads/BaseAndQualsCounts.java | 36 +- .../compression/reducereads/BaseCounts.java | 78 +++- .../compression/reducereads/BaseIndex.java | 4 +- .../reducereads/HeaderElement.java | 78 ++-- .../reducereads/MultiSampleCompressor.java | 3 +- .../compression/reducereads/ReduceReads.java | 13 +- .../reducereads/SingleSampleCompressor.java | 7 +- .../reducereads/SlidingWindow.java | 397 ++++++++++-------- .../gatk/walkers/qc/AssessReducedQuals.java | 5 +- .../reducereads/BaseCountsUnitTest.java | 2 +- .../reducereads/HeaderElementUnitTest.java | 15 +- .../ReduceReadsIntegrationTest.java | 38 +- .../reducereads/SlidingWindowUnitTest.java | 152 +++++-- .../broadinstitute/sting/utils/MathUtils.java | 2 +- 14 files changed, 527 insertions(+), 303 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java index 416f66ec6..28a48c212 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -80,6 +80,21 @@ public class BaseAndQualsCounts extends BaseCounts { * @param isLowQualBase true if the base is low quality */ public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { + incr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false); + } + + /* + * Increments the count + * + * @param base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + * @param isSoftClip true if is soft-clipped + */ + public void incr(final byte base, final 
byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) { // if we already have high quality bases, ignore low quality ones if ( isLowQualBase && !isLowQuality() ) return; @@ -92,7 +107,7 @@ public class BaseAndQualsCounts extends BaseCounts { } final BaseIndex i = BaseIndex.byteToBase(base); - super.incr(i, baseQual, baseMappingQual); + super.incr(i, baseQual, baseMappingQual, isSoftClip); switch (i) { case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break; case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break; @@ -114,13 +129,28 @@ public class BaseAndQualsCounts extends BaseCounts { * @param baseMappingQual the mapping quality * @param isLowQualBase true if the base is low quality */ - public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { + public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { + decr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false); + } + + /* + * Decrements the count + * + * @param base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + * @param isSoftClip true if is soft-clipped + */ + public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) { // if this is not the right type of base, ignore it if ( isLowQualBase != isLowQuality() ) return; final BaseIndex i = BaseIndex.byteToBase(base); - super.decr(i, baseQual, baseMappingQual); + super.decr(i, baseQual, baseMappingQual, isSoftClip); switch (i) { case A: 
sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break; case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index afcaf1510..e1329db3b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -80,6 +80,7 @@ import org.broadinstitute.sting.utils.MathUtils; private int count_N = 0; private int sumQual_N = 0; private int totalCount = 0; // keeps track of total count since this is requested so often + private int nSoftClippedBases = 0; private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise @@ -104,6 +105,7 @@ import org.broadinstitute.sting.utils.MathUtils; this.count_I += other.count_I; this.count_N += other.count_N; this.totalCount += other.totalCount; + this.nSoftClippedBases = other.nSoftClippedBases; this.mappingQualities.addAll(other.mappingQualities); } @@ -117,6 +119,7 @@ import org.broadinstitute.sting.utils.MathUtils; this.count_I -= other.count_I; this.count_N -= other.count_N; this.totalCount -= other.totalCount; + this.nSoftClippedBases -= other.nSoftClippedBases; this.mappingQualities.removeAll(other.mappingQualities); } @@ -126,7 +129,7 @@ import org.broadinstitute.sting.utils.MathUtils; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(final BaseIndex base, final byte qual, final int mappingQuality) { + public void incr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) { 
switch (base) { case A: ++count_A; sumQual_A += qual; break; case C: ++count_C; sumQual_C += qual; break; @@ -137,6 +140,7 @@ import org.broadinstitute.sting.utils.MathUtils; case N: ++count_N; sumQual_N += qual; break; } ++totalCount; + nSoftClippedBases += isSoftclip ? 1 : 0; mappingQualities.add(mappingQuality); } @@ -159,7 +163,7 @@ import org.broadinstitute.sting.utils.MathUtils; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(final BaseIndex base, final byte qual, final int mappingQuality) { + public void decr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) { switch (base) { case A: --count_A; sumQual_A -= qual; break; case C: --count_C; sumQual_C -= qual; break; @@ -170,6 +174,7 @@ import org.broadinstitute.sting.utils.MathUtils; case N: --count_N; sumQual_N -= qual; break; } --totalCount; + nSoftClippedBases -= isSoftclip ? 1 : 0; mappingQualities.remove((Integer) mappingQuality); } @@ -231,6 +236,10 @@ import org.broadinstitute.sting.utils.MathUtils; return (byte) (sumQualsOfBase(base) / countOfBase(base)); } + @Ensures("result >= 0") + public int nSoftclips() { + return nSoftClippedBases; + } @Ensures("result >= 0") public int totalCount() { @@ -281,22 +290,42 @@ import org.broadinstitute.sting.utils.MathUtils; return baseIndexWithMostCounts().getByte(); } + /** + * @return the base index for which the count is highest, including indel indexes + */ @Ensures("result != null") public BaseIndex baseIndexWithMostCounts() { - BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (final BaseIndex i : BaseIndex.values()) { - if (countOfBase(i) > countOfBase(maxI)) - maxI = i; - } - return maxI; + return baseIndexWithMostCounts(true); } + /** + * @return the base index for which the count is highest, excluding indel indexes + */ @Ensures("result != null") public BaseIndex baseIndexWithMostCountsWithoutIndels() { + return baseIndexWithMostCounts(false); + } + + 
/** + * Finds the base index with the most counts + * + * @param allowIndels should we allow base indexes representing indels? + * @return non-null base index + */ + @Ensures("result != null") + protected BaseIndex baseIndexWithMostCounts(final boolean allowIndels) { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + int maxCount = countOfBase(maxI); + for (final BaseIndex i : BaseIndex.values()) { - if (i.isNucleotide() && countOfBase(i) > countOfBase(maxI)) + if ( !allowIndels && !i.isNucleotide() ) + continue; + + final int myCount = countOfBase(i); + if (myCount > maxCount) { maxI = i; + maxCount = myCount; + } } return maxI; } @@ -307,22 +336,36 @@ import org.broadinstitute.sting.utils.MathUtils; @Ensures("result != null") public BaseIndex baseIndexWithMostProbability() { - BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (final BaseIndex i : BaseIndex.values()) { - if (getSumQuals(i) > getSumQuals(maxI)) - maxI = i; - } - return (getSumQuals(maxI) > 0L ? maxI : baseIndexWithMostCounts()); + return baseIndexWithMostProbability(true); } @Ensures("result != null") public BaseIndex baseIndexWithMostProbabilityWithoutIndels() { + return baseIndexWithMostProbability(false); + } + + /** + * Finds the base index with the most probability + * + * @param allowIndels should we allow base indexes representing indels? + * @return non-null base index + */ + @Ensures("result != null") + public BaseIndex baseIndexWithMostProbability(final boolean allowIndels) { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + long maxSum = getSumQuals(maxI); + for (final BaseIndex i : BaseIndex.values()) { - if (i.isNucleotide() && getSumQuals(i) > getSumQuals(maxI)) + if ( !allowIndels && !i.isNucleotide() ) + continue; + + final long mySum = getSumQuals(i); + if (mySum > maxSum) { maxI = i; + maxSum = mySum; + } } - return (getSumQuals(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels()); + return (maxSum > 0L ? 
maxI : baseIndexWithMostCounts(allowIndels)); } @Ensures("result >=0") @@ -362,6 +405,7 @@ import org.broadinstitute.sting.utils.MathUtils; count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0; sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0; totalCount = 0; + nSoftClippedBases = 0; mappingQualities.clear(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java index e41878a0b..665e3e7ce 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java @@ -121,7 +121,7 @@ public enum BaseIndex { * * @return whether or not it is a nucleotide, given the definition above */ - public boolean isNucleotide() { + public final boolean isNucleotide() { return !isIndel(); } @@ -130,7 +130,7 @@ public enum BaseIndex { * * @return true for I or D, false otherwise */ - public boolean isIndel() { + public final boolean isIndel() { return this == D || this == I; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index dec323213..38b9e957b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -62,9 +62,10 @@ public class HeaderElement { private BaseAndQualsCounts consensusBaseCounts; // How many A,C,G,T (and D's) are in this site. private BaseAndQualsCounts filteredBaseCounts; // How many A,C,G,T (and D's) were filtered out in this site. 
private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right - private int nSoftClippedBases; // How many bases in this site came from soft clipped bases private int location; // Genome location of this site (the sliding window knows which contig we're at + protected static final int MIN_COUNT_FOR_USING_PVALUE = 2; + public int getLocation() { return location; } @@ -84,7 +85,7 @@ public class HeaderElement { * @param location the reference location for the new element */ public HeaderElement(final int location) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, location); } /** @@ -94,7 +95,7 @@ public class HeaderElement { * @param location the reference location for the new element */ public HeaderElement(final int location, final int insertionsToTheRight) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, location); } /** @@ -103,15 +104,13 @@ public class HeaderElement { * @param consensusBaseCounts the BaseCounts object for the running consensus synthetic read * @param filteredBaseCounts the BaseCounts object for the filtered data synthetic read * @param insertionsToTheRight number of insertions to the right of this HeaderElement - * @param nSoftClippedBases number of softclipped bases of this HeaderElement * @param location the reference location of this reference element * HeaderElement */ - public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location) { + public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int location) { this.consensusBaseCounts = consensusBaseCounts; this.filteredBaseCounts = filteredBaseCounts; 
this.insertionsToTheRight = insertionsToTheRight; - this.nSoftClippedBases = nSoftClippedBases; this.location = location; } @@ -119,10 +118,13 @@ public class HeaderElement { * Whether or not the site represented by this HeaderElement is variant according to the definitions of variant * by insertion, deletion and mismatches. * + * @param minVariantPvalue min p-value for deciding that a position is or is not variable due to mismatches + * @param minVariantProportion min proportion for deciding that a position is or is not variable due to mismatches + * @param minIndelProportion min proportion for deciding that a position is or is not variable due to indels * @return true if site is variant by any definition. False otherwise. */ - public boolean isVariant(double minVariantPvalue, double minIndelProportion) { - return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); + public boolean isVariant(final double minVariantPvalue, final double minVariantProportion, final double minIndelProportion) { + return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue, minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); } /** @@ -140,11 +142,9 @@ public class HeaderElement { public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts if ( baseMappingQuality >= minMappingQual ) - consensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); + consensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); else 
filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - - nSoftClippedBases += isSoftClipped ? 1 : 0; } /** @@ -162,11 +162,9 @@ public class HeaderElement { public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts if ( baseMappingQuality >= minMappingQual ) - consensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); + consensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); else filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - - nSoftClippedBases -= isSoftClipped ? 1 : 0; } /** * Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions @@ -246,15 +244,15 @@ public class HeaderElement { /** * Whether or not the HeaderElement is variant due to excess mismatches * - * @param minVariantPvalue the minimum pvalue to call a site variant. + * @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage). + * @param minVariantProportion the minimum proportion to call a site variant (used with high coverage). * @return whether or not the HeaderElement is variant due to excess mismatches */ - protected boolean isVariantFromMismatches(double minVariantPvalue) { + protected boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion) { final int totalCount = consensusBaseCounts.totalCountWithoutIndels(); final BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); final int countOfOtherBases = totalCount - consensusBaseCounts.countOfBase(mostCommon); - final double pvalue = countOfOtherBases == 0 ? 
0.0 : MathUtils.binomialCumulativeProbability(totalCount, 0, countOfOtherBases); - return pvalue > minVariantPvalue; + return hasSignificantCount(countOfOtherBases, totalCount, minVariantPvalue, minVariantProportion); } /** @@ -264,6 +262,7 @@ public class HeaderElement { * @return true if we had more soft clipped bases contributing to this site than matches/mismatches. */ protected boolean isVariantFromSoftClips() { + final int nSoftClippedBases = consensusBaseCounts.nSoftclips(); return nSoftClippedBases > 0 && nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases); } @@ -271,10 +270,11 @@ public class HeaderElement { * Calculates the number of alleles necessary to represent this site. * * @param minVariantPvalue the minimum pvalue to call a site variant. + * @param minVariantProportion the minimum proportion to call a site variant. * @return the number of alleles necessary to represent this site or -1 if there are too many indels */ - public int getNumberOfBaseAlleles(final double minVariantPvalue) { - final ObjectArrayList alleles = getAlleles(minVariantPvalue); + public int getNumberOfBaseAlleles(final double minVariantPvalue, final double minVariantProportion) { + final ObjectArrayList alleles = getAlleles(minVariantPvalue, minVariantProportion); return alleles == null ? -1 : alleles.size(); } @@ -282,16 +282,18 @@ public class HeaderElement { * Calculates the alleles necessary to represent this site. * * @param minVariantPvalue the minimum pvalue to call a site variant. + * @param minVariantProportion the minimum proportion to call a site variant. 
* @return the list of alleles necessary to represent this site or null if there are too many indels */ - public ObjectArrayList getAlleles(final double minVariantPvalue) { + public ObjectArrayList getAlleles(final double minVariantPvalue, final double minVariantProportion) { // make sure we have bases at all final int totalBaseCount = consensusBaseCounts.totalCount(); if ( totalBaseCount == 0 ) return new ObjectArrayList(0); - // next, check for insertions - if ( hasSignificantCount(insertionsToTheRight, minVariantPvalue) ) + // next, check for insertions; technically, the insertion count can be greater than totalBaseCount + // (because of the way insertions are counted), so we need to account for that + if ( hasSignificantCount(Math.min(totalBaseCount, insertionsToTheRight), totalBaseCount, minVariantPvalue, minVariantProportion) ) return null; // finally, check for the bases themselves (including deletions) @@ -301,9 +303,7 @@ public class HeaderElement { if ( baseCount == 0 ) continue; - final double pvalue = MathUtils.binomialCumulativeProbability(totalBaseCount, 0, baseCount); - - if ( pvalue > minVariantPvalue ) { + if ( hasSignificantCount(baseCount, totalBaseCount, minVariantPvalue, minVariantProportion) ) { if ( base == BaseIndex.D ) return null; alleles.add(base); @@ -316,26 +316,34 @@ public class HeaderElement { * Checks whether there are a significant number of softclips. * * @param minVariantPvalue the minimum pvalue to call a site variant. + * @param minVariantProportion the minimum proportion to call a site variant. 
* @return true if there are significant softclips, false otherwise */ - public boolean hasSignificantSoftclips(final double minVariantPvalue) { - return hasSignificantCount(nSoftClippedBases, minVariantPvalue); + public boolean hasSignificantSoftclips(final double minVariantPvalue, final double minVariantProportion) { + return hasSignificantCount(consensusBaseCounts.nSoftclips(), consensusBaseCounts.totalCount(), minVariantPvalue, minVariantProportion); } /* * Checks whether there are a significant number of count. * - * @param count the count to test against + * @param count the count (k) to test against + * @param total the total (n) to test against * @param minVariantPvalue the minimum pvalue to call a site variant. + * @param minVariantProportion the minimum proportion to call a site variant. * @return true if there is a significant count given the provided pvalue, false otherwise */ - private boolean hasSignificantCount(final int count, final double minVariantPvalue) { - final int totalBaseCount = consensusBaseCounts.totalCount(); - if ( count == 0 || totalBaseCount == 0 ) + private boolean hasSignificantCount(final int count, final int total, final double minVariantPvalue, final double minVariantProportion) { + if ( count == 0 || total == 0 ) return false; - // technically, count can be greater than totalBaseCount (because of the way insertions are counted) so we need to account for that - final double pvalue = MathUtils.binomialCumulativeProbability(totalBaseCount, 0, Math.min(count, totalBaseCount)); - return pvalue > minVariantPvalue; + // use p-values for low counts of k + if ( count <= MIN_COUNT_FOR_USING_PVALUE ) { + final double pvalue = MathUtils.binomialCumulativeProbability(total, 0, count); + return pvalue > minVariantPvalue; + } + + // otherwise, use straight proportions + final int minBaseCountForSignificance = (int)(minVariantProportion * total); + return count >= minBaseCountForSignificance; } } \ No newline at end of file diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 85aee9fc9..bdd407fba 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -97,13 +97,14 @@ public class MultiSampleCompressor { final int downsampleCoverage, final int minMappingQuality, final double minAltPValueToTriggerVariant, + final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 4d90a83be..82a02ca55 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -205,15 +205,17 @@ public class ReduceReads extends ReadWalker, Redu /** * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be - * considered consensus. 
+ considered consensus and reduced (otherwise we will try to trigger polyploid compression). Note that + this value is used only in regions with high coverage. */ - @Deprecated + @Advanced @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) public double minAltProportionToTriggerVariant = 0.05; /** * Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region. - Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to trigger polyploid compression). + Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to + trigger polyploid compression). Note that this value is used only in regions with low coverage. */ @Advanced @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false) @@ -288,6 +290,9 @@ public class ReduceReads extends ReadWalker, Redu if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 ) throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); + if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 ) + throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); + if ( known.isEmpty() ) knownSnpPositions = null; else @@ -412,7 +417,7 @@ public class ReduceReads extends ReadWalker, Redu */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage,
minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index ec041386c..61c34b6a0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -63,6 +63,7 @@ public class SingleSampleCompressor { final private int downsampleCoverage; final private int minMappingQuality; final private double minAltPValueToTriggerVariant; + final private double minAltProportionToTriggerVariant; final private double minIndelProportionToTriggerVariant; final private int minBaseQual; final private ReduceReads.DownsampleStrategy downsampleStrategy; @@ -76,6 +77,7 @@ public class SingleSampleCompressor { final int downsampleCoverage, final int minMappingQuality, final double minAltPValueToTriggerVariant, + final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy) { @@ -84,6 +86,7 @@ public class SingleSampleCompressor { this.minMappingQuality = minMappingQuality; this.slidingWindowCounter = 0; this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant; + this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant; this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; this.minBaseQual = minBaseQual; this.downsampleStrategy = downsampleStrategy; @@ -114,7 +117,9 @@ public class SingleSampleCompressor { } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), 
read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), + slidingWindowCounter, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, + minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); slidingWindowCounter++; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 5fd7724cb..d3ca037be 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -60,7 +60,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -78,8 +77,8 @@ import java.util.*; public class SlidingWindow { // Sliding Window data - final private ObjectAVLTreeSet readsInWindow; - final private LinkedList windowHeader; + final protected PriorityQueue readsInWindow; + final protected LinkedList windowHeader; protected int contextSize; // the largest context size (between mismatches and 
indels) protected String contig; protected int contigIndex; @@ -99,6 +98,7 @@ public class SlidingWindow { // Additional parameters protected double MIN_ALT_PVALUE_TO_TRIGGER_VARIANT; // pvalue has to be greater than this value to trigger variant region due to mismatches + protected double MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value protected int MIN_MAPPING_QUALITY; @@ -146,28 +146,33 @@ public class SlidingWindow { this.windowHeader = new LinkedList(); windowHeader.addFirst(new HeaderElement(startLocation)); - this.readsInWindow = new ObjectAVLTreeSet(); + this.readsInWindow = new PriorityQueue(100, new Comparator() { + @Override + public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { + return read1.getSoftEnd() - read2.getSoftEnd(); + } + }); } public SlidingWindow(final String contig, final int contigIndex, final int contextSize, final SAMFileHeader samHeader, final GATKSAMReadGroupRecord readGroupAttribute, final int windowNumber, - final double minAltPValueToTriggerVariant, final double minIndelProportionToTriggerVariant, + final double minAltPValueToTriggerVariant, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, final int minMappingQuality, final int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, final boolean hasIndelQualities) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.MIN_ALT_PVALUE_TO_TRIGGER_VARIANT = minAltPValueToTriggerVariant; + this.MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant; this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = 
minIndelProportionToTriggerVariant; this.MIN_BASE_QUAL_TO_COUNT = minBaseQual; this.MIN_MAPPING_QUALITY = minMappingQuality; this.windowHeader = new LinkedList(); - this.readsInWindow = new ObjectAVLTreeSet(new Comparator() { + this.readsInWindow = new PriorityQueue(1000, new Comparator() { @Override public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { - final int difference = read1.getSoftEnd() - read2.getSoftEnd(); - return difference != 0 ? difference : read1.getReadName().compareTo(read2.getReadName()); + return read1.getSoftEnd() - read2.getSoftEnd(); } }); @@ -290,8 +295,8 @@ public class SlidingWindow { regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); } - while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { - readsInWindow.remove(readsInWindow.first()); + while (!readsInWindow.isEmpty() && readsInWindow.peek().getSoftEnd() < windowHeaderStartLocation) { + readsInWindow.poll(); } return regions; @@ -353,7 +358,7 @@ public class SlidingWindow { /** * returns an array marked with variant and non-variant regions (it uses markVariantRegion to make the marks) * - * @param stop check the window from start to stop (not-inclusive) + * @param stop check the window from start to stop (not-inclusive); given in global coordinates */ protected void markSites(final int stop) { @@ -363,21 +368,16 @@ public class SlidingWindow { // copy over as many bits as we can from the previous calculation. Note that we can't trust the // last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there. 
final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1; - final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize); + final int locationToProcess = Math.max(windowHeaderStartLocation, Math.min(lastPositionMarked, stop - contextSize)); - // update the iterator to the correct position - Iterator headerElementIterator = windowHeader.iterator(); - for (int i = windowHeaderStartLocation; i < locationToProcess; i++) { - if (headerElementIterator.hasNext()) - headerElementIterator.next(); - } + final ListIterator headerElementIterator = windowHeader.listIterator(locationToProcess - windowHeaderStartLocation); // process a contextSize worth of region from scratch in case there's a variant there for (int i = locationToProcess; i < stop; i++) { if (headerElementIterator.hasNext()) { HeaderElement headerElement = headerElementIterator.next(); - if (headerElement.isVariant(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) + if (headerElement.isVariant(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) markVariantRegion(i - windowHeaderStartLocation); } else @@ -409,7 +409,7 @@ public class SlidingWindow { } /** - * Adds bases to the running consensus or filtered data accordingly + * Adds bases to the running consensus * * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus * @@ -422,9 +422,10 @@ public class SlidingWindow { @Requires({"start >= 0 && (end >= start || end == 0)"}) @Ensures("result != null") protected ObjectArrayList addToSyntheticReads(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { - ObjectArrayList reads = new ObjectArrayList(); - if (start < end) { - ListIterator headerElementIterator = header.listIterator(start); + final ObjectArrayList reads = new 
ObjectArrayList(); + + if ( start < end ) { + final ListIterator headerElementIterator = header.listIterator(start); if (!headerElementIterator.hasNext()) throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, header.size(), end)); @@ -432,37 +433,29 @@ public class SlidingWindow { HeaderElement headerElement = headerElementIterator.next(); if (headerElement.hasConsensusData()) { - reads.addAll(finalizeAndAdd(ConsensusType.FILTERED)); - - int endOfConsensus = findNextNonConsensusElement(header, start, end); - addToRunningConsensus(header, start, endOfConsensus, strandType); + // find the end of the consecutive consensus data in the window + final int endOfConsensus = findNextNonConsensusElement(header, start, end); if (endOfConsensus <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start)); + // add to running consensus and recurse + addToRunningConsensus(header, start, endOfConsensus, strandType); reads.addAll(addToSyntheticReads(header, endOfConsensus, end, strandType)); - } else if (headerElement.hasFilteredData()) { + + } else { + + // add any outstanding consensus data reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS)); - int endOfFilteredData = findNextNonFilteredDataElement(header, start, end); - reads.addAll(addToFilteredData(header, start, endOfFilteredData, strandType)); - - if (endOfFilteredData <= start) - throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start)); - - reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, strandType)); - } else if (headerElement.isEmpty()) { - reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); - - int endOfEmptyData = findNextNonEmptyElement(header, start, end); - + // find the end of the consecutive empty data in the window + final int endOfEmptyData = 
findNextConsensusElement(header, start, end); if (endOfEmptyData <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start)); + // recurse out of the empty region reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, strandType)); - } else - throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start)); - + } } return reads; @@ -474,24 +467,21 @@ public class SlidingWindow { * @param type the synthetic reads you want to close * @return a possibly null list of GATKSAMRecords generated by finalizing the synthetic reads */ - private ObjectArrayList finalizeAndAdd(ConsensusType type) { - GATKSAMRecord read = null; - ObjectArrayList list = new ObjectArrayList(); + private ObjectArrayList finalizeAndAdd(final ConsensusType type) { - switch (type) { - case CONSENSUS: - read = finalizeRunningConsensus(); - break; - case FILTERED: - read = finalizeFilteredDataConsensus(); - break; - case BOTH: - read = finalizeRunningConsensus(); - if (read != null) list.add(read); - read = finalizeFilteredDataConsensus(); + final ObjectArrayList list = new ObjectArrayList(); + + if ( type == ConsensusType.CONSENSUS || type == ConsensusType.BOTH ) { + final GATKSAMRecord read = finalizeRunningConsensus(); + if ( read != null ) + list.add(read); + } + + if ( type == ConsensusType.FILTERED || type == ConsensusType.BOTH ) { + final GATKSAMRecord read = finalizeFilteredDataConsensus(); + if ( read != null ) + list.add(read); } - if (read != null) - list.add(read); return list; } @@ -499,19 +489,145 @@ public class SlidingWindow { /** * Looks for the next position without consensus data * - * @param start beginning of the filtered region - * @param upTo limit to search for another consensus element + * @param header the header to check + * @param start beginning of the filtered region + * @param upTo limit to search for another consensus element * 
@return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position */ - private int findNextNonConsensusElement(LinkedList header, int start, int upTo) { - Iterator headerElementIterator = header.listIterator(start); + private int findNextNonConsensusElement(final LinkedList header, final int start, final int upTo) { + final Iterator headerElementIterator = header.listIterator(start); int index = start; while (index < upTo) { if (!headerElementIterator.hasNext()) throw new ReviewedStingException("There are no more header elements in this window"); - HeaderElement headerElement = headerElementIterator.next(); + if (!headerElementIterator.next().hasConsensusData()) + break; + index++; + } + return index; + } + + /** + * Looks for the next position with consensus data + * + * @param header the header to check + * @param start beginning of the filtered region + * @param upTo limit to search for another consensus element + * @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position + */ + private int findNextConsensusElement(final LinkedList header, final int start, final int upTo) { + final Iterator headerElementIterator = header.listIterator(start); + int index = start; + while (index < upTo) { + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException("There are no more header elements in this window"); + + if (headerElementIterator.next().hasConsensusData()) + break; + index++; + } + return index; + } + + /** + * Adds bases to the filtered data synthetic read. + * + * Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData + * bases.
+ * + * @param header the window header + * @param start the first header index to add to consensus + * @param end the first header index NOT TO add to consensus + * @param strandType the strandedness that the synthetic read should be represented as having + */ + @Requires({"start >= 0 && (end >= start || end == 0)"}) + private void addToRunningConsensus(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { + if (runningConsensus == null) + runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType); + + final Iterator headerElementIterator = header.listIterator(start); + + for (int index = start; index < end; index++) { + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist"); + + final HeaderElement headerElement = headerElementIterator.next(); if (!headerElement.hasConsensusData()) + throw new ReviewedStingException("No CONSENSUS data in " + index); + + genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts()); + } + } + + /** + * Adds bases to the running filtered data accordingly + * + * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus + * + * @param header the window header + * @param start the first header index to add to consensus + * @param end the first header index NOT TO add to consensus + * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated. 
+ */ + @Requires({"start >= 0 && (end >= start || end == 0)"}) + @Ensures("result != null") + protected ObjectArrayList addToFilteredReads(final LinkedList header, final int start, final int end) { + final ObjectArrayList reads = new ObjectArrayList(); + + if ( start < end ) { + final ListIterator headerElementIterator = header.listIterator(start); + + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, header.size(), end)); + + HeaderElement headerElement = headerElementIterator.next(); + + if (headerElement.hasFilteredData()) { + + // find the end of the consecutive filtered data in the window + final int endOfFiltered = findNextNonFilteredElement(header, start, end); + if (endOfFiltered <= start) + throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFiltered, start)); + + // add to running filtered consensus and recurse + addToFilteredData(header, start, endOfFiltered); + reads.addAll(addToFilteredReads(header, endOfFiltered, end)); + + } else { + + // add any outstanding filtered data + reads.addAll(finalizeAndAdd(ConsensusType.FILTERED)); + + // find the end of the consecutive empty data in the window + final int endOfEmptyData = findNextFilteredElement(header, start, end); + if (endOfEmptyData <= start) + throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start)); + + // recurse out of the empty region + reads.addAll(addToFilteredReads(header, endOfEmptyData, end)); + } + } + + return reads; + } + + /** + * Looks for the next position without consensus data + * + * @param header the header to check + * @param start beginning of the filtered region + * @param upTo limit to search for another consensus element + * @return next position in local coordinates (relative to the windowHeader) with consensus data; 
otherwise, the start position + */ + private int findNextNonFilteredElement(final LinkedList header, final int start, final int upTo) { + final Iterator headerElementIterator = header.listIterator(start); + int index = start; + while (index < upTo) { + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException("There are no more header elements in this window"); + + if (!headerElementIterator.next().hasFilteredData()) break; index++; } @@ -519,43 +635,21 @@ } /** - * Looks for the next position without filtered data + * Looks for the next position with filtered data * - * @param start beginning of the region - * @param upTo limit to search for - * @return next position in local coordinates (relative to the windowHeader) with no filtered data; otherwise, the start position + * @param header the header to check + * @param start beginning of the filtered region + * @param upTo limit to search for another consensus element + * @return next position in local coordinates (relative to the windowHeader) with filtered data; otherwise, the start position */ - private int findNextNonFilteredDataElement(LinkedList header, int start, int upTo) { - Iterator headerElementIterator = header.listIterator(start); + private int findNextFilteredElement(final LinkedList header, final int start, final int upTo) { + final Iterator headerElementIterator = header.listIterator(start); int index = start; while (index < upTo) { if (!headerElementIterator.hasNext()) throw new ReviewedStingException("There are no more header elements in this window"); - HeaderElement headerElement = headerElementIterator.next(); - if (!headerElement.hasFilteredData() || headerElement.hasConsensusData()) - break; - index++; - } - return index; - } - - /** - * Looks for the next non-empty header element - * - * @param start beginning of the region - * @param upTo limit to search for - * @return next position in local coordinates (relative to the windowHeader) with
non-empty element; otherwise, the start position - */ - private int findNextNonEmptyElement(LinkedList header, int start, int upTo) { - ListIterator headerElementIterator = header.listIterator(start); - int index = start; - while (index < upTo) { - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException("There are no more header elements in this window"); - - HeaderElement headerElement = headerElementIterator.next(); - if (!headerElement.isEmpty()) + if (headerElementIterator.next().hasFilteredData()) break; index++; } @@ -571,67 +665,26 @@ public class SlidingWindow { * @param header the window header * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus - * @param strandType the strandedness that the synthetic read should be represented as having - * @return a non-null list of GATKSAMRecords representing finalized filtered consensus data. Empty list if no consensus was generated. */ @Requires({"start >= 0 && (end >= start || end == 0)"}) @Ensures("result != null") - private ObjectArrayList addToFilteredData(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { - ObjectArrayList result = new ObjectArrayList(); + private void addToFilteredData(final LinkedList header, final int start, final int end) { if (filteredDataConsensus == null) - filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType); + filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, SyntheticRead.StrandType.STRANDLESS); ListIterator headerElementIterator = header.listIterator(start); for (int index = start; index < end; index++) { if (!headerElementIterator.hasNext()) throw 
new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist"); - HeaderElement headerElement = headerElementIterator.next(); - if (headerElement.hasConsensusData()) - throw new ReviewedStingException("Found consensus data inside region to add to filtered data."); + final HeaderElement headerElement = headerElementIterator.next(); if (!headerElement.hasFilteredData()) throw new ReviewedStingException("No filtered data in " + index); - if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) { - result.add(finalizeFilteredDataConsensus()); - filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, strandType); - } - genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts()); } - - return result; - } - - /** - * Adds bases to the filtered data synthetic read. - * - * Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData - * bases. 
- * - * @param header the window header - * @param start the first header index to add to consensus - * @param end the first header index NOT TO add to consensus - * @param strandType the strandedness that the synthetic read should be represented as having - */ - @Requires({"start >= 0 && (end >= start || end == 0)"}) - private void addToRunningConsensus(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { - if (runningConsensus == null) - runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType); - - Iterator headerElementIterator = header.listIterator(start); - for (int index = start; index < end; index++) { - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist"); - - HeaderElement headerElement = headerElementIterator.next(); - if (!headerElement.hasConsensusData()) - throw new ReviewedStingException("No CONSENSUS data in " + index); - - genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts()); - } } /** @@ -726,7 +779,7 @@ public class SlidingWindow { for ( int i = start; i <= stop; i++ ) { - final int nAlleles = windowHeader.get(i).getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT); + final int nAlleles = windowHeader.get(i).getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT); // we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios if ( nAlleles > 2 || nAlleles == -1 ) @@ -760,8 +813,8 @@ public class SlidingWindow { if ( headerElement.getLocation() == positionToSkip ) continue; - if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) || - 
headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) > 1 ) + if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) || + headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) > 1 ) return true; } @@ -784,6 +837,7 @@ public class SlidingWindow { final CloseVariantRegionResult result = new CloseVariantRegionResult(allReads.stopPerformed); result.reads.addAll(downsampleCoverage > 0 ? downsampleVariantRegion(allReads.reads) : allReads.reads); result.reads.addAll(addToSyntheticReads(windowHeader, 0, allReads.stopPerformed + 1, SyntheticRead.StrandType.STRANDLESS)); + result.reads.addAll(addToFilteredReads(windowHeader, 0, allReads.stopPerformed + 1)); result.reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); return result; // finalized reads will be downsampled if necessary @@ -914,6 +968,7 @@ public class SlidingWindow { if (!windowHeader.isEmpty()) { finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS)); + finalizedReads.addAll(addToFilteredReads(windowHeader, 0, windowHeader.size())); finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up } } @@ -983,7 +1038,7 @@ public class SlidingWindow { // initialize the mapping from base (allele) to header final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2); - for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) ) { + for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) ) { final int currentIndex = alleleHeaderMap.size(); if ( currentIndex > 1 ) throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus"); @@ -997,7 +1052,7 @@ public class SlidingWindow { if ( 
alleleHeaderMap.size() != 2 ) throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size()); - final ObjectList readsToRemoveFromHeader = new ObjectArrayList(); + final ObjectList readsToRemove = new ObjectArrayList(); for ( final GATKSAMRecord read : readsInWindow ) { @@ -1006,38 +1061,36 @@ public class SlidingWindow { continue; // remove all other reads from the read cache since we're going to use them here - readsInWindow.remove(read); + readsToRemove.add(read); - // if the read falls before the het position, we don't need to look at it - if ( read.getSoftEnd() < globalHetRefPosition ) + // if the read falls before the het position or has low MQ, we don't need to look at it + if ( read.getSoftEnd() < globalHetRefPosition || read.getMappingQuality() < MIN_MAPPING_QUALITY) continue; // remove all spanning reads from the consensus header since we're going to incorporate them into a consensus here instead removeFromHeader(windowHeader, read); - // make sure it meets the minimum mapping quality requirement (if not, we won't use it for the consensus) - if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) { + // where on the read is the het position? + final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); - // where on the read is the het position? 
- final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); + // this is safe because indels are not supported + final byte base = read.getReadBases()[readPosOfHet]; - // this is safe because indels are not supported - final byte base = read.getReadBases()[readPosOfHet]; - final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet]; + // check which allele this read represents + final Integer allele = alleleHeaderMap.get(base); - // check which allele this read represents - final Integer allele = alleleHeaderMap.get(base); - - // ignore the read if it represents a base that's not part of the consensus - if ( allele != null ) { - // add to the appropriate polyploid header - final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; - header.reads.add(read); - addToHeader(header.consensus, read); - } + // ignore the read if it represents a base that's not part of the consensus + if ( allele != null ) { + // add to the appropriate polyploid header + final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? 
headersNegStrand[allele] : headersPosStrand[allele]; + header.reads.add(read); + addToHeader(header.consensus, read); } } + for ( final GATKSAMRecord read : readsToRemove ) + readsInWindow.remove(read); + // create the polyploid synthetic reads if we can final ObjectList hetReads = new ObjectArrayList(); @@ -1171,15 +1224,15 @@ public class SlidingWindow { int readBaseIndex = 0; HeaderElement headerElement; - for ( CigarElement cigarElement : read.getCigar().getCigarElements() ) { + for ( final CigarElement cigarElement : read.getCigar().getCigarElements() ) { switch ( cigarElement.getOperator() ) { case H: break; case I: readBaseIndex += cigarElement.getLength(); - // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it. - if ( removeRead && locationIndex == 0 ) + // special case, if we don't have the previous header element anymore, don't worry about it. + if ( locationIndex == 0 ) break; // insertions are added to the base to the left (previous element) @@ -1200,9 +1253,8 @@ public class SlidingWindow { headerElement.removeBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); else headerElement.addBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); - - locationIndex++; } + locationIndex += nDeletionBases; break; case S: case M: @@ -1211,6 +1263,8 @@ public class SlidingWindow { case X: final int nBasesToAdd = cigarElement.getLength(); final boolean isSoftClip = cigarElement.getOperator() == CigarOperator.S; + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); final boolean readHasIndelQuals = read.hasBaseIndelQualities(); final byte[] insertionQuals = readHasIndelQuals ? 
read.getBaseInsertionQualities() : null; final byte[] deletionQuals = readHasIndelQuals ? read.getBaseDeletionQualities() : null; @@ -1219,14 +1273,15 @@ public class SlidingWindow { headerElement = headerElementIterator.next(); final byte insertionQuality = readHasIndelQuals ? insertionQuals[readBaseIndex] : -1; final byte deletionQuality = readHasIndelQuals ? deletionQuals[readBaseIndex] : -1; + if ( removeRead ) - headerElement.removeBase(read.getReadBases()[readBaseIndex], read.getBaseQualities()[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip); + headerElement.removeBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip); else - headerElement.addBase(read.getReadBases()[readBaseIndex], read.getBaseQualities()[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip); + headerElement.addBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip); readBaseIndex++; - locationIndex++; } + locationIndex += nBasesToAdd; break; default: break; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java index 4e5652c45..a3bdc6691 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java @@ -102,6 +102,9 @@ public class AssessReducedQuals extends LocusWalker implem @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > (epsilon * Quals_original_bam) we output this interval", required = false) public double 
qual_epsilon = 0.10; + @Argument(fullName = "exclude_low_mq", shortName = "excludeMQ", doc = "ignore reads with mapping quality below this number", required = false) + public int excludeMQ = 0; + @Output protected PrintStream out; @@ -146,7 +149,7 @@ public class AssessReducedQuals extends LocusWalker implem } private boolean isGoodRead(final PileupElement p) { - return !p.isDeletion() && (int)p.getQual() >= 15 && p.getMappingQual() >= 20; + return !p.isDeletion() && (int)p.getQual() >= 15 && p.getMappingQual() >= excludeMQ; } private int getTagIndex(final List tags) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java index 5ae6e86df..f988471a0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java @@ -179,7 +179,7 @@ public class BaseCountsUnitTest extends BaseTest { BaseCounts counts = new BaseCounts(); for ( int qual : test.quals ) - counts.incr(BaseIndex.A, (byte)qual, 20); + counts.incr(BaseIndex.A, (byte)qual, 20, false); final int actualSum = (int)counts.getSumQuals((byte)'A'); final int expectedSum = qualSum(test.quals); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java index d73a71855..32791dd97 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java @@ -128,8 +128,8 @@ public class HeaderElementUnitTest extends BaseTest { 
Assert.assertEquals(headerElement.hasConsensusData(), test.MQ >= minMappingQual); Assert.assertEquals(headerElement.hasFilteredData(), test.MQ < minMappingQual); Assert.assertEquals(headerElement.hasConsensusData() ? headerElement.getConsensusBaseCounts().getRMS() : headerElement.getFilteredBaseCounts().getRMS(), (double)test.MQ); - Assert.assertFalse(headerElement.isVariantFromMismatches(0.05)); - Assert.assertEquals(headerElement.isVariant(0.05, 0.05), test.isClip); + Assert.assertFalse(headerElement.isVariantFromMismatches(0.05, 0.05)); + Assert.assertEquals(headerElement.isVariant(0.05, 0.05, 0.05), test.isClip); } @@ -177,7 +177,7 @@ public class HeaderElementUnitTest extends BaseTest { headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false); } - final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue); + final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue, test.pvalue); final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.pvalue); Assert.assertEquals(nAllelesSeen, nAllelesExpected); @@ -195,9 +195,14 @@ public class HeaderElementUnitTest extends BaseTest { if ( count == 0 ) continue; - final double pvalue = MathUtils.binomialCumulativeProbability(total, 0, count); + final boolean isSignificant; + if ( count <= HeaderElement.MIN_COUNT_FOR_USING_PVALUE ) { + isSignificant = MathUtils.binomialCumulativeProbability(total, 0, count) > targetPvalue; + } else { + isSignificant = (count >= targetPvalue * total); + } - if ( pvalue > targetPvalue ) { + if ( isSignificant ) { if ( index == BaseIndex.D.index ) return -1; result++; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 1ab001147..b5963498a 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -157,46 +157,44 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "62f8cdb85a424e42e9c56f36302d1dba", false); + RRTest("testDefaultCompression ", L, "fa1cffc4539e0c20b818a11da5dba5b9", false); } @Test(enabled = true) public void testDefaultCompressionWithKnowns() { - RRTest("testDefaultCompressionWithKnowns ", L, "874c0e0a54c3db67f5e9d7c0d45b7844", true); + RRTest("testDefaultCompressionWithKnowns ", L, "d1b5fbc402810d9cdc020bb3503f1325", true); } private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; @Test(enabled = true) public void testMultipleIntervals() { - RRTest("testMultipleIntervals ", intervals, "2e849f8324b27af36bae8cb9b01722e6", false); + RRTest("testMultipleIntervals ", intervals, "7e9dcd157ad742d4ebae7e56bc4af663", false); } @Test(enabled = true) public void testMultipleIntervalsWithKnowns() { - RRTest("testMultipleIntervalsWithKnowns ", intervals, "71bc2167cc6916288bd34dcf099feebc", true); + RRTest("testMultipleIntervalsWithKnowns ", intervals, "dbb1e95e1bcad956701142afac763717", true); } - final String highCompressionMD5 = "c83256fa2d6785d5188f50dd45c77e0f"; - @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -mindel 0.3 " + L, highCompressionMD5, false); + RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "8f8fd1a53fa0789116f45e4cf2625906", false); } @Test(enabled = true) public void testHighCompressionWithKnowns() { - 
RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -mindel 0.3 " + L, highCompressionMD5, true); + RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "52fd2a77802a4677b604abb18e15d96a", true); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a903558ef284381d74b0ad837deb19f6", false); + RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "79c6543d5ce84ebc2ca74404498edbd1", false); } @Test(enabled = true) public void testLowCompressionWithKnowns() { - RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a4c5aa158c6ebbc703134cbe2d48619c", true); + RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "271aec358b309603291a974b5ba3bd60", true); } @Test(enabled = true) @@ -208,7 +206,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testIndelCompression() { - final String md5 = "56154baed62be07008d3684a0a4c0996"; + final String md5 = "d20e6012300898a0315c795cab7583d8"; RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false); RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true); } @@ -216,25 +214,27 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("d7655de41d90aecb716f79e32d53b2d1"))); + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), 
Arrays.asList("e5da09662708f562c0c617ba73cf4763")), "4f916da29d91852077f0a2fdbdd2c7f6"); } + private static final String COREDUCTION_QUALS_TEST_MD5 = "26d84a2bd549a01a63fcebf8847a1b7d"; + @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fa549ba96ca0ce5fbf3553ba173167e8"))); + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5f4d2c1d9c010dfd6865aeba7d0336fe")), COREDUCTION_QUALS_TEST_MD5); } @Test(enabled = true) public void testCoReductionWithKnowns() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; - executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9edcf09b21a4ae8d9fc25222bcb0486b"))); + executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("ca48dd972bf57595c691972c0f887cb4")), COREDUCTION_QUALS_TEST_MD5); } @Test(enabled = true) public void testInsertionsAtEdgeOfConsensus() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; - executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("27cc8f1a336b2d0a29855ceb8fc988b0"))); + executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("760500a5b036b987f84099f45f26a804"))); } /** @@ -248,7 +248,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - 
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("34baf99904b676d5f132d3791030ed0a")), "3eab32c215ba68e75efd5ab7e9f7a2e7"); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("67f8a3a647f8ec5212104bdaafd8c862")), "3eab32c215ba68e75efd5ab7e9f7a2e7"); } /** @@ -259,7 +259,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; // we expect to lose coverage due to the downsampling so don't run the systematic tests - executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("985c4f15a1d45267abb2f6790267930d"))); + executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("1663f35802f82333c5e15653e437ce2d"))); } /** @@ -269,7 +269,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testReadOffContig() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("388ef48791965d637e4bdb45d5d7cf01"))); + executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("0ce693b4ff925998867664e4099f3248"))); } /** @@ -279,7 +279,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testPairedReadsInVariantRegion() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + " -o %s --downsample_coverage 250 -dcov 50 "; - executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("cfa2588f5edf74c5ddf3d190f5ac6f2d"))); + 
executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7e7b358443827ca239db3b98f299aec6")), "2af063d1bd3c322b03405dbb3ecf59a9"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java index 4bf67f5a2..56ad02084 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -251,14 +251,15 @@ public class SlidingWindowUnitTest extends BaseTest { } private class ConsensusCreationTest { - public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression; + public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage; public final List myReads = new ArrayList(20); public final String description; - private ConsensusCreationTest(final List locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) { + private ConsensusCreationTest(final List locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int expectedNumberOfReadsAtDeepCoverage) { this.expectedNumberOfReads = expectedNumberOfReads; this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; - this.description = String.format("%d %d", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression); + this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage; + this.description = String.format("%d %d %d", expectedNumberOfReads, 
expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage); // first, add the basic reads to the collection myReads.addAll(basicReads); @@ -268,10 +269,11 @@ public class SlidingWindowUnitTest extends BaseTest { myReads.add(createVariantRead(loc, readsShouldBeLowQuality, variantBaseShouldBeLowQuality, CigarOperator.M)); } - private ConsensusCreationTest(final List locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) { + private ConsensusCreationTest(final List locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int expectedNumberOfReadsAtDeepCoverage) { this.expectedNumberOfReads = expectedNumberOfReads; this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; - this.description = String.format("%s %d %d", operator.toString(), expectedNumberOfReads, expectedNumberOfReadsWithHetCompression); + this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage; + this.description = String.format("%s %d %d %d", operator.toString(), expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage); // first, add the basic reads to the collection myReads.addAll(basicReads); @@ -319,46 +321,50 @@ public class SlidingWindowUnitTest extends BaseTest { private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295); private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309); private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310); + private static final GenomeLoc loc320 = new UnvalidatingGenomeLoc("1", 0, 1000320, 1000320); private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100); + private static final int DEEP_COVERAGE_ITERATIONS = 100; + @DataProvider(name = "ConsensusCreation") public Object[][] 
createConsensusCreationTestData() { List tests = new ArrayList(); // test high quality reads and bases - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 9, 6)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 10, 10)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 10, 10)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 11, 11)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 1, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 9, 6, 5 + DEEP_COVERAGE_ITERATIONS)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 11, 11, 2 + (9 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc320), false, false, 11, 10, 4 + (6 * DEEP_COVERAGE_ITERATIONS))}); // test low quality reads - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), true, false, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), true, false, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), true, false, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), true, false, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), 
true, false, 1, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), true, false, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), true, false, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), true, false, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), true, false, 2, 2, 2)}); // test low quality bases - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, true, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, true, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, true, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, true, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, true, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, true, 1, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, true, 1, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, true, 1, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, true, 1, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, true, 1, 1, 1)}); // test mixture - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), true, false, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), true, false, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 1, 1, 1)}); // test I/D operators - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), 
CigarOperator.D, 9, 9)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 10, 10)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 10, 10)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 11, 11)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.I, 9, 9)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.I, 10, 10)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.I, 10, 10)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.I, 11, 11)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 9, 2 + (7 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 11, 11, 2 + (9 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.I, 9, 9, 2 + (7 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.I, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.I, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.I, 11, 11, 2 + (9 * DEEP_COVERAGE_ITERATIONS))}); return tests.toArray(new Object[][]{}); } @@ -368,14 +374,14 @@ 
public class SlidingWindowUnitTest extends BaseTest { final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); // test WITHOUT het compression - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads); // test WITH het compression at KNOWN sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); for ( int i = 0; i < 1200; i++ ) @@ -384,11 +390,28 @@ public class SlidingWindowUnitTest extends BaseTest { Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); // test WITH het compression at ALL sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); result = slidingWindow.close(null); Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); 
+ + // test with deep coverage + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 0, ReduceReads.DownsampleStrategy.Normal, false); + for ( int i = 0; i < DEEP_COVERAGE_ITERATIONS; i++ ) { + for ( final GATKSAMRecord read : test.myReads ) { + final GATKSAMRecord copy = ArtificialSAMUtils.createArtificialRead(header, read.getReadName() + "_" + (i+1), 0, read.getAlignmentStart(), readLength); + copy.setReadBases(read.getReadBases()); + copy.setBaseQualities(read.getBaseQualities()); + copy.setMappingQuality(read.getMappingQuality()); + copy.setReadNegativeStrandFlag(read.getReadNegativeStrandFlag()); + if ( read.getCigar() != null ) + copy.setCigar(read.getCigar()); + slidingWindow.addRead(copy); + } + } + result = slidingWindow.close(null); + Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsAtDeepCoverage); } @Test @@ -412,14 +435,14 @@ public class SlidingWindowUnitTest extends BaseTest { final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); // test WITHOUT het compression - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : myReads ) slidingWindow.addRead(read); Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all // test WITH het compression at KNOWN sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 
0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : myReads ) slidingWindow.addRead(read); for ( int i = 0; i < readLength; i++ ) @@ -428,13 +451,59 @@ public class SlidingWindowUnitTest extends BaseTest { Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all // test WITH het compression at ALL sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : myReads ) slidingWindow.addRead(read); result = slidingWindow.close(knownSNPs); Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all } + @Test + public void testAddingReadPairWithSameCoordinates() { + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10); + + final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1); + read1.setReadBases(new byte[]{(byte)'A'}); + read1.setBaseQualities(new byte[]{(byte)'A'}); + read1.setMappingQuality(30); + read1.setReadNegativeStrandFlag(false); + slidingWindow.addRead(read1); + + final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1); + read2.setReadBases(new byte[]{(byte)'A'}); + read2.setBaseQualities(new byte[]{(byte)'A'}); + read2.setMappingQuality(30); + read2.setReadNegativeStrandFlag(true); + slidingWindow.addRead(read2); + + Assert.assertEquals(slidingWindow.readsInWindow.size(), 2); + } + + @Test + public void testOnlySpanningReadHasLowQual() { + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, 
ReduceReads.DownsampleStrategy.Normal, false); + + final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead1", 0, globalStartPosition, 100); + final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead2", 0, globalStartPosition + 50, 100); + + final byte[] bases = Utils.dupBytes((byte) 'A', readLength); + read1.setReadBases(bases); + read2.setReadBases(bases); + + final byte[] baseQuals = Utils.dupBytes((byte) 30, readLength); + baseQuals[80] = (byte)10; + read1.setBaseQualities(baseQuals); + read2.setBaseQualities(baseQuals); + + read1.setMappingQuality(30); + read2.setMappingQuality(30); + + slidingWindow.addRead(read1); + slidingWindow.addRead(read2); + + Assert.assertEquals(slidingWindow.close(null).getFirst().size(), 1); + } + /////////////////////////////////////////////////////////// //// This section tests the downsampling functionality //// @@ -452,7 +521,7 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(dataProvider = "Downsampling", enabled = true) public void testDownsamplingTest(final int dcov) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); final ObjectList result = slidingWindow.downsampleVariantRegion(basicReads); Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size())); @@ -500,7 +569,7 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(dataProvider = "ConsensusQuals", enabled = true) public void testConsensusQualsTest(QualsTest test) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, 
ReduceReads.DownsampleStrategy.Normal, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); final Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet()); @@ -569,7 +638,7 @@ public class SlidingWindowUnitTest extends BaseTest { read.setBaseQualities(Utils.dupBytes((byte) 30, readLength)); read.setMappingQuality(30); - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); int newIndex = slidingWindow.createNewHeaderElements(windowHeader, read, start); Assert.assertEquals(newIndex, start > 0 ? 
start : 0); @@ -613,7 +682,7 @@ public class SlidingWindowUnitTest extends BaseTest { read.setMappingQuality(30); // add the read - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start); for ( int i = 0; i < start; i++ ) Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); @@ -628,7 +697,6 @@ public class SlidingWindowUnitTest extends BaseTest { Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); } - ////////////////////////////////////////////////////////////////////////////////// //// This section tests functionality related to polyploid consensus creation //// ////////////////////////////////////////////////////////////////////////////////// @@ -691,7 +759,7 @@ public class SlidingWindowUnitTest extends BaseTest { read.setMappingQuality(30); // add the read - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0); // set up and add a soft-clipped read if requested diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index f4644036f..1cc798e36 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -383,7 +383,7 @@ public class MathUtils { for (int hits = k_start; hits <= k_end; hits++) { prevProb = cumProb; - double probability = binomialProbability(n, hits); + final double probability = binomialProbability(n, hits); cumProb += probability; if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision probCache = probCache.add(new BigDecimal(prevProb)); From 4d561421635a3db1e13c5668e9badb031081f455 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 11 Apr 2013 13:04:29 -0400 Subject: [PATCH 211/226] Detect stuck lock-acquisition calls, and disable file locking for tests -Acquire file locks in a background thread with a timeout of 30 seconds, and throw a UserException if a lock acquisition call times out * should solve the locking issue for most people provided they RETRY failed farm jobs * since we use NON-BLOCKING lock acquisition calls, any call that takes longer than a second or two indicates a problem with the underlying OS file lock support * use daemon threads so that stuck lock acquisition tasks don't prevent the JVM from exiting -Disable both auto-index creation and file locking for integration tests via a hidden GATK argument --disable_auto_index_creation_and_locking_when_reading_rods * argument not safe for general use, since it allows reading from an index file without first acquiring a lock * this is fine for the test suite, since all index files already exist for test files (or if they don't, they should!) -Added missing indices for files in private/testdata -Had to delete most of RMDTrackBuilderUnitTest, since it mostly tested auto-index creation, which we can't test with locking disabled, but I replaced the deleted tests with some tests of my own. 
-Unit test for FSLockWithShared to test the timeout feature --- .gitignore | 1 - .../sting/gatk/GenomeAnalysisEngine.java | 3 +- .../arguments/GATKArgumentCollection.java | 11 + .../gatk/refdata/tracks/RMDTrackBuilder.java | 118 ++++--- .../walkers/coverage/DepthOfCoverage.java | 3 +- .../walkers/variantutils/VariantsToVCF.java | 5 +- .../sting/utils/exceptions/UserException.java | 17 + .../sting/utils/file/FSLockWithShared.java | 309 ++++++++++++------ .../FileSystemInabilityToLockException.java | 47 --- .../org/broadinstitute/sting/WalkerTest.java | 3 + .../ReferenceOrderedViewUnitTest.java | 3 +- .../rmd/ReferenceOrderedDataPoolUnitTest.java | 7 +- .../tracks/RMDTrackBuilderUnitTest.java | 171 ++++------ .../refdata/utils/TestRMDTrackBuilder.java | 3 +- .../utils/file/FSLockWithSharedUnitTest.java | 60 ++++ 15 files changed, 451 insertions(+), 310 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java diff --git a/.gitignore b/.gitignore index 9a20b68ca..65f111587 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ /*.bam /*.bai /*.bed -*.idx *~ /*.vcf /*.txt diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index fed33c1cb..82bee7826 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -865,7 +865,8 @@ public class GenomeAnalysisEngine { SAMSequenceDictionary sequenceDictionary, GenomeLocParser genomeLocParser, ValidationExclusion.TYPE validationExclusionType) { - final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType); + final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, 
validationExclusionType, + getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); final List dataSources = new ArrayList(); for (RMDTriplet fileDescriptor : referenceMetaDataFiles) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index a9016708b..e98dcfe9e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -274,6 +274,17 @@ public class GATKArgumentCollection { @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; + @Hidden + @Advanced + @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", + doc = "UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking " + + "when reading VCFs and other rods and an index isn't present or is out-of-date. 
The file locking necessary for auto index " + + "generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it " + + "for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general " + + "because it allows reading from index files without first acquiring a lock.", + required = false) + public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; + // -------------------------------------------------------------------------------------------------------------- // // Multi-threading arguments diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index c5f87d625..4c50cfaae 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -44,7 +44,6 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.file.FSLockWithShared; -import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import org.broadinstitute.sting.utils.instrumentation.Sizeof; import java.io.File; @@ -83,6 +82,10 @@ public class RMDTrackBuilder { // extends PluginManager { private final FeatureManager featureManager; + // If true, do not attempt to create index files if they don't exist or are outdated, and don't + // make any file lock acquisition calls on the index files. + private final boolean disableAutoIndexCreation; + /** * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact. This is generally * used when walkers want to directly manage the ROD system for whatever reason. 
Before using this constructor, @@ -90,14 +93,19 @@ public class RMDTrackBuilder { // extends PluginManager { * @param dict Sequence dictionary to use. * @param genomeLocParser Location parser to use. * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. + * @param disableAutoIndexCreation Do not auto-create index files, and do not use file locking when accessing index files. + * UNSAFE in general (because it causes us not to lock index files before reading them) -- + * suitable only for test suite use. */ public RMDTrackBuilder(final SAMSequenceDictionary dict, final GenomeLocParser genomeLocParser, - ValidationExclusion.TYPE validationExclusionType) { + final ValidationExclusion.TYPE validationExclusionType, + final boolean disableAutoIndexCreation) { this.dict = dict; this.validationExclusionType = validationExclusionType; this.genomeLocParser = genomeLocParser; this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); + this.disableAutoIndexCreation = disableAutoIndexCreation; } /** @@ -208,12 +216,15 @@ public class RMDTrackBuilder { // extends PluginManager { // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match if (sequenceDictionary.size() == 0 && dict != null) { - File indexFile = Tribble.indexFile(inputFile); validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); - try { // re-write the index - writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); - } catch (IOException e) { - logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); + + if ( ! 
disableAutoIndexCreation ) { + File indexFile = Tribble.indexFile(inputFile); + try { // re-write the index + writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); + } catch (IOException e) { + logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not affect your run of the GATK"); + } } sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); @@ -225,7 +236,7 @@ public class RMDTrackBuilder { // extends PluginManager { throw new UserException(e.getMessage()); } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(inputFile, "unable to write Tribble index", e); + throw new UserException("I/O error loading or writing tribble index file for " + inputFile.getAbsolutePath(), e); } } else { @@ -242,25 +253,36 @@ public class RMDTrackBuilder { // extends PluginManager { * @return a linear index for the specified type * @throws IOException if we cannot write the index file */ - public synchronized Index loadIndex(File inputFile, FeatureCodec codec) throws IOException { - // create the index file name, locking on the index file name - File indexFile = Tribble.indexFile(inputFile); - FSLockWithShared lock = new FSLockWithShared(indexFile); - - // acquire a lock on the file + public synchronized Index loadIndex( final File inputFile, final FeatureCodec codec) throws IOException { + final File indexFile = Tribble.indexFile(inputFile); + final FSLockWithShared lock = new FSLockWithShared(indexFile); Index idx = null; - if (indexFile.canRead()) - idx = attemptIndexFromDisk(inputFile, codec, indexFile, lock); - // if we managed to make an index, return + // If the index file exists and is readable, attempt to load it from disk. We'll get null back + // if a problem was discovered with the index file when it was inspected, and we'll get an + // in-memory index back in the case where the index file could not be locked. 
+ if (indexFile.canRead()) { + idx = disableAutoIndexCreation ? loadFromDisk(inputFile, indexFile) // load without locking if we're in disableAutoIndexCreation mode + : attemptToLockAndLoadIndexFromDisk(inputFile, codec, indexFile, lock); + } + + // If we have an index, it means we either loaded it from disk without issue or we created an in-memory + // index due to not being able to acquire a lock. if (idx != null) return idx; - // we couldn't read the file, or we fell out of the conditions above, continue on to making a new index - return writeIndexToDisk(createIndexInMemory(inputFile, codec), indexFile, lock); + // We couldn't read the file, or we discovered a problem with the index file, so continue on to making a new index + idx = createIndexInMemory(inputFile, codec); + if ( ! disableAutoIndexCreation ) { + writeIndexToDisk(idx, indexFile, lock); + } + return idx; } /** - * attempt to read the index from disk + * Attempt to acquire a shared lock and then load the index from disk. Returns an in-memory index if + * a lock could not be obtained. Returns null if a problem was discovered with the index file when it + * was examined (eg., it was out-of-date). 
+ * * @param inputFile the input file * @param codec the codec to read from * @param indexFile the index file itself @@ -268,20 +290,21 @@ public class RMDTrackBuilder { // extends PluginManager { * @return an index, or null if we couldn't load one * @throws IOException if we fail for FS issues */ - protected Index attemptIndexFromDisk(File inputFile, FeatureCodec codec, File indexFile, FSLockWithShared lock) throws IOException { - boolean locked; + protected Index attemptToLockAndLoadIndexFromDisk( final File inputFile, final FeatureCodec codec, final File indexFile, final FSLockWithShared lock ) throws IOException { + boolean locked = false; + Index idx = null; + try { locked = lock.sharedLock(); - } - catch(FileSystemInabilityToLockException ex) { - throw new UserException.MissortedFile(inputFile, "Unexpected inability to lock exception", ex); - } - Index idx; - try { - if (!locked) // can't lock file + + if ( ! locked ) { // can't lock file + logger.info(String.format("Could not acquire a shared lock on index file %s, falling back to using an in-memory index for this GATK run.", + indexFile.getAbsolutePath())); idx = createIndexInMemory(inputFile, codec); - else + } + else { idx = loadFromDisk(inputFile, indexFile); + } } finally { if (locked) lock.unlock(); } @@ -294,7 +317,7 @@ public class RMDTrackBuilder { // extends PluginManager { * @param indexFile the input file, plus the index extension * @return an Index, or null if we're unable to load */ - public static Index loadFromDisk(File inputFile, File indexFile) { + protected Index loadFromDisk( final File inputFile, final File indexFile ) { logger.info("Loading Tribble index from disk for file " + inputFile); Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); @@ -302,14 +325,17 @@ public class RMDTrackBuilder { // extends PluginManager { if (index.isCurrentVersion() && indexFile.lastModified() >= inputFile.lastModified()) return index; else if (indexFile.lastModified() < 
inputFile.lastModified()) - logger.warn("Index file " + indexFile + " is out of date (index older than input file), deleting and updating the index file"); + logger.warn("Index file " + indexFile + " is out of date (index older than input file), " + + (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable - logger.warn("Index file " + indexFile + " is out of date (old version), deleting and updating the index file"); + logger.warn("Index file " + indexFile + " is out of date (old version), " + + (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); - // however we got here, remove the index and return null - boolean deleted = indexFile.delete(); + if ( ! disableAutoIndexCreation ) { + boolean deleted = indexFile.delete(); + if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); + } - if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); return null; } @@ -319,13 +345,18 @@ public class RMDTrackBuilder { // extends PluginManager { * @param index the index to write to disk * @param indexFile the index file location * @param lock the locking object - * @return the index object * @throws IOException when unable to create the new index */ - private static Index writeIndexToDisk(Index index, File indexFile, FSLockWithShared lock) throws IOException { - boolean locked = false; // could we exclusive lock the file? 
+ private void writeIndexToDisk( final Index index, final File indexFile, final FSLockWithShared lock ) throws IOException { + if ( disableAutoIndexCreation ) { + return; + } + + boolean locked = false; + try { - locked = lock.exclusiveLock(); // handle the case where we aren't locking anything + locked = lock.exclusiveLock(); + if (locked) { logger.info("Writing Tribble index to disk for file " + indexFile); LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); @@ -337,11 +368,6 @@ public class RMDTrackBuilder { // extends PluginManager { try { logger.info(String.format(" Index for %s has size in bytes %d", indexFile, Sizeof.getObjectGraphSize(index))); } catch ( ReviewedStingException e) { } - - return index; - } - catch(FileSystemInabilityToLockException ex) { - throw new UserException.CouldNotCreateOutputFile(indexFile,"Unexpected inability to lock exception", ex); } finally { if (locked) lock.unlock(); @@ -356,7 +382,7 @@ public class RMDTrackBuilder { // extends PluginManager { * @return a LinearIndex, given the file location * @throws IOException when unable to create the index in memory */ - private Index createIndexInMemory(File inputFile, FeatureCodec codec) { + protected Index createIndexInMemory(File inputFile, FeatureCodec codec) { // this can take a while, let them know what we're doing logger.info("Creating Tribble index in memory for file " + inputFile); Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 29016af43..c4ef4d23b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -577,7 +577,8 @@ public class DepthOfCoverage 
extends LocusWalker { if ( dbsnp == null ) throw new UserException.BadInput("No dbSNP rod was provided, but one is needed to decipher the correct indel alleles from the HapMap records"); - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe); + RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(), + getToolkit().getArguments().unsafe, + getToolkit().getArguments().disableAutoIndexCreationAndLockingWhenReadingRods); dbsnpIterator = builder.createInstanceOfTrack(VCFCodec.class, new File(dbsnp.dbsnp.getSource())).getIterator(); // Note that we should really use some sort of seekable iterator here so that the search doesn't take forever // (but it's complicated because the hapmap location doesn't match the dbsnp location, so we don't know where to seek to) diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 83400cc73..3abe5a7f4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -440,4 +440,21 @@ public class UserException extends ReviewedStingException { f.getAbsolutePath(), PHONE_HOME_DOCS_URL)); } } + + /** + * A special exception that happens only in the case where + * the filesystem, by design or configuration, is completely unable + * to handle locking. This exception will specifically NOT be thrown + * in the case where the filesystem handles locking but is unable to + * acquire a lock due to concurrency. 
+ */ + public static class FileSystemInabilityToLockException extends UserException { + public FileSystemInabilityToLockException( String message ) { + super(message); + } + + public FileSystemInabilityToLockException( String message, Exception innerException ) { + super(message,innerException); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java b/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java index 3813cfc85..87e89e0f1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java +++ b/public/java/src/org/broadinstitute/sting/utils/file/FSLockWithShared.java @@ -26,15 +26,13 @@ package org.broadinstitute.sting.utils.file; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; -import java.nio.channels.ClosedChannelException; -import java.nio.channels.FileChannel; -import java.nio.channels.FileLock; -import java.nio.channels.OverlappingFileLockException; +import java.nio.channels.*; +import java.util.concurrent.*; /** * a quick implementation of a file based lock, using the Java NIO classes @@ -52,125 +50,244 @@ public class FSLockWithShared { // the file channel we open private FileChannel channel = null; - /** - * A bit of experimental code for Siva at Partners. Conditionally throw an - * exception in the case where an unknown failure occurs, in an effort to stave - * off disabled nfs file locks. - */ - private boolean throwExceptionOnUnknownFailure = false; + // Timeout (in milliseconds) before we give up during non-blocking lock-acquisition calls. + // Necessary because these "non-blocking" calls can hang if there's a problem with the + // OS file locking support. 
+ private int lockAcquisitionTimeout; + + // Default value for lockAcquisitionTimeout when none is explicitly provided + public static final int DEFAULT_LOCK_ACQUISITION_TIMEOUT_IN_MILLISECONDS = 30 * 1000; + + // Amount of time to wait when trying to shut down the lock-acquisition thread before giving up + public static final int THREAD_TERMINATION_TIMEOUT_IN_MILLISECONDS = 30 * 1000; /** - * create a file system, given a base file to which a lock string gets appended. - * @param baseFile File descriptor of file to lock + * Create a lock associated with the specified File. Use the default lock + * acquisition timeout of 30 seconds. + * + * @param file file to lock */ - public FSLockWithShared(File baseFile) { - file = baseFile; - } - - public FSLockWithShared(File baseFile,boolean throwExceptionOnUnknownFailure) { - this(baseFile); - this.throwExceptionOnUnknownFailure = throwExceptionOnUnknownFailure; + public FSLockWithShared( final File file ) { + this.file = file; + lockAcquisitionTimeout = DEFAULT_LOCK_ACQUISITION_TIMEOUT_IN_MILLISECONDS; } /** - * Get a shared (read) lock on a file - * Cannot get shared lock if it does not exist - * @return boolean true if we obtained a lock - * @throws FileSystemInabilityToLockException in cases of unexpected failure to capture lock. + * Create a lock associated with the specified File, and set a custom lock + * acquisition timeout. + * + * @param file file to lock + * @param lockAcquisitionTimeout maximum number of milliseconds to wait during non-blocking + * lock acquisition calls before concluding that there's a + * problem with the OS file locking support and throwing an error. */ - public boolean sharedLock() throws FileSystemInabilityToLockException { + public FSLockWithShared( final File file, final int lockAcquisitionTimeout ) { + this.file = file; + this.lockAcquisitionTimeout = lockAcquisitionTimeout; + } + + /** + * Get a shared (read) lock on a file. 
Does not block, and returns immediately + * under normal conditions with the result of the lock acquisition attempt. Will + * throw an exception if there's a problem with the OS file locking support. + * + * @return boolean true if we obtained a lock, false if we failed to obtain one + */ + public boolean sharedLock() { + return acquireLockWithTimeout(true); + } + + /** + * Get an exclusive (read-write) lock on a file. Does not block, and returns immediately + * under normal conditions with the result of the lock acquisition attempt. Will + * throw an exception if there's a problem with the OS file locking support. + * + * @return boolean true if we obtained a lock, false if we failed to obtain one + */ + public boolean exclusiveLock() { + return acquireLockWithTimeout(false); + } + + /** + * Attempt to acquire a lock of the specified type on the file in a background thread. + * Uses non-blocking lock-acquisition calls that should return immediately, but may + * get stuck if there's a problem with the OS file locking support. If the call gets + * stuck and the timeout elapses, throws a UserException, since it's not safe to + * proceed with a stuck lock acquisition thread (and there's no way to reliably + * interrupt it once the underlying system call hangs). 
+ * + * @param acquireSharedLock if true, request a shared lock rather than an exclusive lock + * @return true if a lock was acquired, false if we failed + */ + private boolean acquireLockWithTimeout( final boolean acquireSharedLock ) { + // Use daemon threads so that hopelessly stuck lock acquisition threads won't prevent the JVM from exiting + final ExecutorService executor = Executors.newSingleThreadExecutor(new ThreadFactory() { + public Thread newThread( Runnable r ) { + Thread lockAcquisitionThread = new Thread(r); + lockAcquisitionThread.setDaemon(true); + return lockAcquisitionThread; + } + }); + final FutureTask lockAcquisitionTask = new FutureTask(new LockAcquisitionTask(acquireSharedLock)); + boolean lockAcquired = false; - // get read-only file channel try { - channel = new RandomAccessFile(file, "r").getChannel(); + executor.execute(lockAcquisitionTask); + + // Wait at most lockAcquisitionTimeout milliseconds for the lock acquisition task to finish. + lockAcquired = lockAcquisitionTask.get(lockAcquisitionTimeout, TimeUnit.MILLISECONDS); } - catch (IOException e) { - logger.warn(String.format("WARNING: Unable to lock file %s (could not open read only file channel)",file.getAbsolutePath())); - return false; + // Lock acquisition timeout elapsed. Since we're using NON-BLOCKING lock-acquisition calls, + // this implies that there's a problem with the OS locking daemon, or locks are not supported. + // Since it's not safe to proceed with a potentially stuck lock acquisition thread, we need to + // shut down the JVM in order to kill it. + catch ( TimeoutException e ) { + throw new UserException.FileSystemInabilityToLockException( + String.format("Timeout of %d milliseconds was reached while trying to acquire a lock on file %s. 
" + + "Since the GATK uses non-blocking lock acquisition calls that are not supposed to wait, " + + "this implies a problem with the file locking support in your operating system.", + lockAcquisitionTimeout, file.getAbsolutePath())); } - // get shared lock (third argument is true) + // Lock acquisition thread threw an exception. Need to unpack it via e.getCause() + catch ( ExecutionException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because exception %s occurred with error message %s", + file.getAbsolutePath(), + e.getCause() != null ? e.getCause().getClass().getSimpleName() : "unknown", + e.getCause() != null ? e.getCause().getMessage() : "none")); + lockAcquired = false; + } + // Interrupted while waiting for the lock acquisition thread -- not likely to happen + catch ( InterruptedException e ) { + logger.warn(String.format("WARNING: interrupted while attempting to acquire a lock for file %s", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( Exception e ) { + logger.warn(String.format("WARNING: error while attempting to acquire a lock for file %s. Error message: %s", + file.getAbsolutePath(), e.getMessage())); + lockAcquired = false; + } + + shutdownLockAcquisitionTask(executor); + + // Upon failure to acquire a lock, we always call unlock() to close the FileChannel if it was opened + // and to deal with very hypothetical edge cases where a lock might actually have been acquired despite the + // lock acquisition thread returning false. + if ( ! lockAcquired ) { + unlock(); + } + + return lockAcquired; + } + + /** + * Ensures that the lock acquisition task running in the provided executor has cleanly terminated. + * Throws a UserException if unable to shut it down within the period defined by the THREAD_TERMINATION_TIMEOUT. 
+ * + * @param executor ExecutorService executing the lock-acquisition thread + */ + private void shutdownLockAcquisitionTask( final ExecutorService executor ) { + boolean shutdownAttemptSucceeded; + try { - lock = channel.tryLock(0, Long.MAX_VALUE, true); - if (lock == null) { - logger.warn(String.format("WARNING: Unable to lock file %s because there is already a lock active.",file.getAbsolutePath())); + executor.shutdownNow(); + shutdownAttemptSucceeded = executor.awaitTermination(THREAD_TERMINATION_TIMEOUT_IN_MILLISECONDS, TimeUnit.MILLISECONDS); + } + catch ( InterruptedException e ) { + shutdownAttemptSucceeded = false; + } + + if ( ! shutdownAttemptSucceeded ) { + throw new UserException(String.format("Failed to terminate lock acquisition thread while trying to lock file %s. " + + "Exiting because it's not safe to proceed with this run of the GATK.", + file.getAbsolutePath())); + } + } + + /** + * Background task that attempts to acquire a lock of the specified type, and returns a boolean + * indicating success/failure. Uses a non-blocking tryLock() call that should return immediately + * (but may get stuck if there's a problem with the OS locking daemon). + */ + private class LockAcquisitionTask implements Callable { + private final boolean acquireSharedLock; + + public LockAcquisitionTask( final boolean acquireSharedLock ) { + this.acquireSharedLock = acquireSharedLock; + } + + public Boolean call() { + // Get a read-only or read-write file channel, depending on the type of lock + try { + channel = new RandomAccessFile(file, acquireSharedLock ? 
"r" : "rw").getChannel(); + } + catch ( IOException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because we could not open a file channel", file.getAbsolutePath())); return false; } - } - catch (ClosedChannelException e) { - logger.warn(String.format("WARNING: Unable to lock file %s because the file channel is closed.",file.getAbsolutePath())); - return false; - } - catch (OverlappingFileLockException e) { - logger.warn(String.format("WARNING: Unable to lock file %s because you already have a lock on this file.",file.getAbsolutePath())); - return false; - } - catch (IOException e) { - logger.warn(String.format("WARNING: Unable to lock file %s: %s.",file.getAbsolutePath(),e.getMessage())); - if(throwExceptionOnUnknownFailure) - throw new FileSystemInabilityToLockException(e.getMessage(),e); - else - return false; - } - return true; - } - /** - * Get an exclusive lock on a file - * @return boolean true if we obtained a lock - * @throws FileSystemInabilityToLockException in cases of unexpected failure to capture lock. - */ - public boolean exclusiveLock() throws FileSystemInabilityToLockException { + boolean lockAcquired = false; - // read/write file channel is necessary for exclusive lock - try { - channel = new RandomAccessFile(file, "rw").getChannel(); - } - catch (Exception e) { - logger.warn(String.format("WARNING: Unable to lock file %s (could not open read/write file channel)",file.getAbsolutePath())); - // do we need to worry about deleting file here? Does RandomAccessFile will only create file if successful? - return false; - } - - // get exclusive lock (third argument is false) - try { - lock = channel.tryLock(0, Long.MAX_VALUE, false); - if (lock == null) { - logger.warn(String.format("WARNING: Unable to lock file %s because there is already a lock active.",file.getAbsolutePath())); - return false; + try { + // Non-blocking lock-acquisition call, should return right away. 
If it doesn't return immediately + // due to problems with the OS locking daemon, it will potentially be timed-out and interrupted. + lock = channel.tryLock(0, Long.MAX_VALUE, acquireSharedLock); + lockAcquired = lock != null; } - else return true; - } - catch (ClosedChannelException e) { - logger.warn(String.format("WARNING: Unable to lock file %s because the file channel is closed.",file.getAbsolutePath())); - return false; - } - catch (OverlappingFileLockException e) { - logger.warn(String.format("WARNING: Unable to lock file %s because you already have a lock on this file.",file.getAbsolutePath())); - return false; - } - catch (IOException e) { - logger.warn(String.format("WARNING: Unable to lock file %s: %s.",file.getAbsolutePath(),e.getMessage())); - if(throwExceptionOnUnknownFailure) - throw new FileSystemInabilityToLockException(e.getMessage(),e); - else - return false; + catch ( AsynchronousCloseException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because the file channel was closed by another thread", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( ClosedChannelException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because the file channel is closed.", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( OverlappingFileLockException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because you already have a lock on this file.", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( FileLockInterruptionException e ) { + logger.warn(String.format("WARNING: Interrupted while attempting to lock file %s", file.getAbsolutePath())); + lockAcquired = false; + } + catch ( IOException e ) { + logger.warn(String.format("WARNING: Unable to lock file %s because an IOException occurred with message: %s.", file.getAbsolutePath(), e.getMessage())); + lockAcquired = false; + } + + return lockAcquired; } } - + /** - * unlock the file + * Unlock the file * * note: this allows 
unlocking a file that failed to lock (no required user checks on null locks). */ public void unlock() { + releaseLock(); + closeChannel(); + } + + private void releaseLock() { try { - if (lock != null) + if ( lock != null ) lock.release(); - if (channel != null) + } + catch ( ClosedChannelException e ) { + // if the channel was already closed we don't have to worry + } + catch ( IOException e ) { + throw new UserException(String.format("An error occurred while releasing the lock for file %s", file.getAbsolutePath()), e); + } + } + + private void closeChannel() { + try { + if ( channel != null ) channel.close(); } - catch (Exception e) { - throw new ReviewedStingException("An error occurred while unlocking file", e); + catch ( IOException e ) { + throw new UserException(String.format("An error occurred while closing channel for file %s", file.getAbsolutePath()), e); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java b/public/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java deleted file mode 100644 index a17dc612b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/file/FileSystemInabilityToLockException.java +++ /dev/null @@ -1,47 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.sting.utils.file; - -/** - * A special checked exception that happens only in the case where - * the filesystem, by design or configuration, is completely unable - * to handle locking. This exception will specifically NOT be thrown - * in the case where the filesystem handles locking but is unable to - * acquire a lock due to concurrency. - * - * @author hanna - * @version 0.1 - */ -public class FileSystemInabilityToLockException extends Exception { - /** - * Force user to create this exception with a nested inner stack trace. - * @param message Exception message. - * @param innerException Caused-by exception. 
- */ - public FileSystemInabilityToLockException(String message,Exception innerException) { - super(message,innerException); - } -} diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 155d44ecd..dd5a2b0a7 100644 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -53,6 +53,7 @@ public class WalkerTest extends BaseTest { private static final boolean GENERATE_SHADOW_BCF = true; private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false; private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false; + private static final boolean ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS = false; private static MD5DB md5DB = new MD5DB(); @@ -209,6 +210,8 @@ public class WalkerTest extends BaseTest { String.format(" -et %s -K %s ", GATKRunReport.PhoneHomeOption.NO_ET, gatkKeyFile)); if ( includeShadowBCF && GENERATE_SHADOW_BCF ) args = args + " --generateShadowBCF "; + if ( ! 
ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS ) + args = args + " --disable_auto_index_creation_and_locking_when_reading_rods "; } return args; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index 3cd059333..fad632cfd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -84,7 +84,8 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { // sequence seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); genomeLocParser = new GenomeLocParser(seq); - builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true); } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java index 2144cd09b..4a6d14d32 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java @@ -95,12 +95,9 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { public void setUp() { String fileName = privateTestDir + "TabularDataTest.dat"; - // check to see if we have an index, if so, delete it - File indexFileName = new File(privateTestDir + "TabularDataTest.dat.idx"); - if (indexFileName.exists()) indexFileName.delete(); - triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); - builder = new 
RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java index e30ab6e5d..4904428d0 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -27,17 +27,15 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; -import org.broadinstitute.variant.vcf.VCF3Codec; +import org.broad.tribble.util.LittleEndianOutputStream; import org.broadinstitute.variant.vcf.VCFCodec; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.file.FSLockWithShared; import org.testng.annotations.BeforeMethod; @@ -61,7 +59,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { @BeforeMethod public void setup() { - File referenceFile = new File(b36KGReference); + File referenceFile = new File(b37KGReference); try { seq = new CachingIndexedFastaSequenceFile(referenceFile); } @@ -69,7 +67,11 @@ public class RMDTrackBuilderUnitTest extends BaseTest { throw new UserException.CouldNotReadInputFile(referenceFile,ex); } genomeLocParser = new GenomeLocParser(seq); - builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); + + // We have to 
disable auto-index creation/locking in the RMDTrackBuilder for tests, + // as the lock acquisition calls were intermittently hanging on our farm. This unfortunately + // means that we can't include tests for the auto-index creation feature. + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true); } @Test @@ -78,134 +80,83 @@ public class RMDTrackBuilderUnitTest extends BaseTest { } @Test - // in this test, the index exists, but is out of date. - public void testBuilderIndexUnwriteable() { - File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/relic.vcf"); - try { - builder.loadIndex(vcfFile, new VCF3Codec()); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("IO exception unexpected" + e.getMessage()); - } - // make sure we didn't write the file (check that it's timestamp is within bounds) - //System.err.println(new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()); - Assert.assertTrue(Math.abs(1279591752000l - Tribble.indexFile(vcfFile).lastModified()) < 100); + public void testDisableAutoIndexGeneration() throws IOException { + final File unindexedVCF = new File(privateTestDir + "unindexed.vcf"); + final File unindexedVCFIndex = Tribble.indexFile(unindexedVCF); + Index index = builder.loadIndex(unindexedVCF, new VCFCodec()); + + Assert.assertFalse(unindexedVCFIndex.exists()); + Assert.assertNotNull(index); } - // we have a good index file, in a read-only dir. 
This would cause the previous version to remake the index; make - // sure we don't do this @Test - public void testDirIsLockedIndexFromDisk() { - File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/good_index.vcf"); - File vcfFileIndex = Tribble.indexFile(vcfFile); - Index ind = null; - try { - ind = builder.attemptIndexFromDisk(vcfFile,new VCFCodec(),vcfFileIndex,new FSLockWithShared(vcfFile)); - } catch (IOException e) { - Assert.fail("We weren't expecting an exception -> " + e.getMessage()); - } - // make sure we get back a null index; i.e. we can't load the index from disk - Assert.assertTrue(ind == null); + public void testLoadOnDiskIndex() { + final File originalVCF = new File(privateTestDir + "vcf4.1.example.vcf"); + final File tempVCFWithCorrectIndex = createTempVCFFileAndIndex(originalVCF, false); + final File tempVCFIndexFile = Tribble.indexFile(tempVCFWithCorrectIndex); + + final Index index = builder.loadFromDisk(tempVCFWithCorrectIndex, tempVCFIndexFile); + + Assert.assertNotNull(index); + Assert.assertTrue(tempVCFIndexFile.exists()); + + final Index inMemoryIndex = builder.createIndexInMemory(tempVCFWithCorrectIndex, new VCFCodec()); + Assert.assertTrue(index.equalsIgnoreProperties(inMemoryIndex)); } - - @Test - public void testBuilderIndexDirectoryUnwritable() { - File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/no_index.vcf"); - File vcfFileIndex = Tribble.indexFile(vcfFile); + public void testLoadOnDiskOutdatedIndex() { + final File originalVCF = new File(privateTestDir + "vcf4.1.example.vcf"); + final File tempVCFWithOutdatedIndex = createTempVCFFileAndIndex(originalVCF, true); + final File tempVCFIndexFile = Tribble.indexFile(tempVCFWithOutdatedIndex); - Index ind = null; - try { - ind = builder.loadIndex(vcfFile, new VCF3Codec()); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("IO exception unexpected" + e.getMessage()); - } - // make sure we didn't write the file (check that 
it's timestamp is within bounds) - Assert.assertTrue(!vcfFileIndex.exists()); - Assert.assertTrue(ind != null); + final Index index = builder.loadFromDisk(tempVCFWithOutdatedIndex, tempVCFIndexFile); - } - - - @Test - public void testGenerateIndexForUnindexedFile() { - File vcfFile = new File(privateTestDir + "always_reindex.vcf"); - File vcfFileIndex = Tribble.indexFile(vcfFile); - - // if we can't write to the directory, don't fault the tester, just pass - if (!vcfFileIndex.getParentFile().canWrite()) { - logger.warn("Unable to run test testGenerateIndexForUnindexedFile: unable to write to dir " + vcfFileIndex.getParentFile()); - return; - } - // clean-up our test, and previous tests that may have written the file - vcfFileIndex.deleteOnExit(); - if (vcfFileIndex.exists()) - vcfFileIndex.delete(); - - try { - builder.loadIndex(vcfFile, new VCFCodec()); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("IO exception unexpected" + e.getMessage()); - } - // make sure we wrote the file - Assert.assertTrue(vcfFileIndex.exists()); - } - - - // test to make sure we get a full sequence dictionary from the VCF (when we set the dictionary in the builder) - @Test - public void testBuilderIndexSequenceDictionary() { - File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf")); - Long indexTimeStamp = Tribble.indexFile(vcfFile).lastModified(); - try { - Index idx = builder.loadIndex(vcfFile, new VCFCodec()); - // catch any exception; this call should pass correctly - SAMSequenceDictionary dict = IndexDictionaryUtils.getSequenceDictionaryFromProperties(idx); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("IO exception unexpected" + e.getMessage()); - } - - // make sure that we removed and updated the index - Assert.assertTrue(Tribble.indexFile(vcfFile).lastModified() >= indexTimeStamp,"Fail: index file was modified"); + // loadFromDisk() should return null to indicate that the index is 
outdated and should not be used, + // but should not delete the index since our builder has disableAutoIndexCreation set to true + Assert.assertNull(index); + Assert.assertTrue(tempVCFIndexFile.exists()); } /** - * create a temporary file and an associated out of date index file + * Create a temporary vcf file and an associated index file, which may be set to be out-of-date + * relative to the vcf * - * @param tribbleFile the tribble file - * @return a file pointing to the new tmp location, with out of date index + * @param vcfFile the vcf file + * @param createOutOfDateIndex if true, ensure that the temporary vcf file is modified after the index + * @return a file pointing to the new tmp location, with accompanying index */ - private File createCorrectDateIndexFile(File tribbleFile) { + private File createTempVCFFileAndIndex( final File vcfFile, final boolean createOutOfDateIndex ) { try { - // first copy the tribble file to a temperary file - File tmpFile = File.createTempFile("TribbleUnitTestFile", ""); + final File tmpFile = File.createTempFile("RMDTrackBuilderUnitTest", ""); + final File tmpIndex = Tribble.indexFile(tmpFile); tmpFile.deleteOnExit(); - logger.info("creating temp file " + tmpFile); - - // copy the vcf (tribble) file to the tmp file location - copyFile(tribbleFile, tmpFile); - - // sleep again, to make sure the timestamps are different (vcf vrs updated index file) - Thread.sleep(2000); - - // create a fake index, before we copy so it's out of date - File tmpIndex = Tribble.indexFile(tmpFile); tmpIndex.deleteOnExit(); - // copy the vcf (tribble) file to the tmp file location - copyFile(Tribble.indexFile(tribbleFile), tmpIndex); + copyFile(vcfFile, tmpFile); + final Index inMemoryIndex = builder.createIndexInMemory(tmpFile, new VCFCodec()); + final LittleEndianOutputStream indexOutputStream = new LittleEndianOutputStream(new FileOutputStream(tmpIndex)); + + // If requested, modify the tribble file after the index. Otherwise, modify the index last. 
+ if ( createOutOfDateIndex ) { + inMemoryIndex.write(indexOutputStream); + indexOutputStream.close(); + Thread.sleep(2000); + copyFile(vcfFile, tmpFile); + } + else { + copyFile(vcfFile, tmpFile); + Thread.sleep(2000); + inMemoryIndex.write(indexOutputStream); + indexOutputStream.close(); + } return tmpFile; - } catch (IOException e) { Assert.fail("Unable to create temperary file"); } catch (InterruptedException e) { - Assert.fail("Somehow our thread got interupted"); + Assert.fail("Somehow our thread got interrupted"); } return null; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java index a993d1783..48e3bbd8c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestRMDTrackBuilder.java @@ -45,7 +45,8 @@ public class TestRMDTrackBuilder extends RMDTrackBuilder { private GenomeLocParser genomeLocParser; public TestRMDTrackBuilder(SAMSequenceDictionary dict, GenomeLocParser genomeLocParser) { - super(dict, genomeLocParser, null); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + super(dict, genomeLocParser, null, true); this.genomeLocParser = genomeLocParser; } diff --git a/public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java new file mode 100644 index 000000000..5c0eec252 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/file/FSLockWithSharedUnitTest.java @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to 
use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.file; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + +import java.io.File; + +public class FSLockWithSharedUnitTest extends BaseTest { + + private static final int MAX_EXPECTED_LOCK_ACQUISITION_TIME = FSLockWithShared.DEFAULT_LOCK_ACQUISITION_TIMEOUT_IN_MILLISECONDS + + FSLockWithShared.THREAD_TERMINATION_TIMEOUT_IN_MILLISECONDS; + + /** + * Test to ensure that we're never spending more than the maximum configured amount of time in lock acquisition calls. + */ + @Test( timeOut = MAX_EXPECTED_LOCK_ACQUISITION_TIME + 10 * 1000 ) + public void testLockAcquisitionTimeout() { + final File lockFile = createTempFile("FSLockWithSharedUnitTest", ".lock"); + final FSLockWithShared lock = new FSLockWithShared(lockFile); + boolean lockAcquisitionSucceeded = false; + + try { + lockAcquisitionSucceeded = lock.sharedLock(); + } + catch ( UserException e ) { + logger.info("Caught UserException from lock acquisition call: lock acquisition must have timed out. 
Message: " + e.getMessage()); + } + finally { + if ( lockAcquisitionSucceeded ) { + lock.unlock(); + } + } + } +} From d20be41fee027eb9d31d20ffa44835a2685825de Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 24 Apr 2013 17:22:07 -0400 Subject: [PATCH 212/226] Bugfix for FragmentUtils.mergeOverlappingPairedFragments -- The previous version was unclipping soft clipped bases, and these were sometimes adaptor sequences. If the two reads successfully merged, we'd lose all of the information necessary to remove the adaptor, producing a very high quality read that matched reference. Updated the code to first clip the adapter sequences from the incoming fragments -- Update MD5s --- ...lexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 4 +- .../sting/utils/fragments/FragmentUtils.java | 27 +++++---- .../fragments/FragmentUtilsUnitTest.java | 58 ++++++++++++++++++- 4 files changed, 76 insertions(+), 15 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 17f04971b..5fe4e6dfa 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd51f8c7235eb6547b678093c7a01089"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "27db36467d40c3cde201f5826e959d78"); } private void HCTestSymbolicVariants(String 
bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 2664f3ed0..fff1c0bb9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "c1530f2158cb41d50e830ca5be0f97a0"); + HCTest(NA12878_BAM, "", "18d5671d8454e8a0c05ee5f6e9fabfe3"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("eb5772b825120a0b8710e5add485d73a")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("cb190c935541ebb9f660f713a882b922")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 99f1d99c7..5d882ba8c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -31,6 +31,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import 
org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -213,24 +214,28 @@ public final class FragmentUtils { * * Assumes that firstRead starts before secondRead (according to their soft clipped starts) * - * @param firstRead the left most read - * @param firstRead the right most read + * @param unclippedFirstRead the left most read + * @param unclippedSecondRead the right most read * * @return a strandless merged read of first and second, or null if the algorithm cannot create a meaningful one */ - public static GATKSAMRecord mergeOverlappingPairedFragments(final GATKSAMRecord firstRead, final GATKSAMRecord secondRead) { - if ( firstRead == null ) throw new IllegalArgumentException("firstRead cannot be null"); - if ( secondRead == null ) throw new IllegalArgumentException("secondRead cannot be null"); - if ( ! firstRead.getReadName().equals(secondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + firstRead + " and " + secondRead); + public static GATKSAMRecord mergeOverlappingPairedFragments(final GATKSAMRecord unclippedFirstRead, final GATKSAMRecord unclippedSecondRead) { + if ( unclippedFirstRead == null ) throw new IllegalArgumentException("unclippedFirstRead cannot be null"); + if ( unclippedSecondRead == null ) throw new IllegalArgumentException("unclippedSecondRead cannot be null"); + if ( ! 
unclippedFirstRead.getReadName().equals(unclippedSecondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + unclippedFirstRead + " and " + unclippedSecondRead); + + if( unclippedFirstRead.getCigarString().contains("I") || unclippedFirstRead.getCigarString().contains("D") || unclippedSecondRead.getCigarString().contains("I") || unclippedSecondRead.getCigarString().contains("D") ) { + return null; // fragments contain indels so don't merge them + } + + final GATKSAMRecord firstRead = ReadClipper.hardClipAdaptorSequence(ReadClipper.revertSoftClippedBases(unclippedFirstRead)); + final GATKSAMRecord secondRead = ReadClipper.hardClipAdaptorSequence(ReadClipper.revertSoftClippedBases(unclippedSecondRead)); if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { return null; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A } - if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { - return null; // fragments contain indels so don't merge them - } - final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); + final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getAlignmentStart()); final int firstReadStop = ( pair.getSecond() ? 
pair.getFirst() + 1 : pair.getFirst() ); final int numBases = firstReadStop + secondRead.getReadLength(); @@ -264,7 +269,7 @@ public final class FragmentUtils { final GATKSAMRecord returnRead = new GATKSAMRecord( firstRead.getHeader() ); returnRead.setIsStrandless(true); - returnRead.setAlignmentStart( firstRead.getSoftStart() ); + returnRead.setAlignmentStart( firstRead.getAlignmentStart() ); returnRead.setReadBases( bases ); returnRead.setBaseQualities( quals ); returnRead.setReadGroup( firstRead.getReadGroup() ); diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java index 4f49eb933..e9600480a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -222,7 +222,7 @@ public class FragmentUtilsUnitTest extends BaseTest { return read; } - @Test(enabled = true, dataProvider = "MergeFragmentsTest") + @Test(enabled = !DEBUG, dataProvider = "MergeFragmentsTest") public void testMergingTwoReads(final String name, final GATKSAMRecord read1, GATKSAMRecord read2, final GATKSAMRecord expectedMerged) { final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); @@ -240,4 +240,60 @@ public class FragmentUtilsUnitTest extends BaseTest { Assert.assertEquals(actual.getBaseQualities(type), expectedMerged.getBaseQualities(type), "Failed base qualities for event type " + type); } } + + @Test(enabled = !DEBUG) + public void testHardClippingBeforeMerge() { + final String common = Utils.dupString("A", 10); + final byte[] commonQuals = Utils.dupBytes((byte)30, common.length()); + final String adapter = "NNNN"; + + final GATKSAMRecord read1 = makeOverlappingRead(adapter, 30, common, commonQuals, "", 30, 10); + final GATKSAMRecord read2 = makeOverlappingRead("", 30, common, commonQuals, adapter, 
30, 10); + final GATKSAMRecord expectedMerged = makeOverlappingRead("", 30, common, commonQuals, "", 30, 10); + read1.setCigarString("4S" + common.length() + "M"); + read1.setProperPairFlag(true); + read1.setFirstOfPairFlag(true); + read1.setReadNegativeStrandFlag(true); + read1.setMateAlignmentStart(10); + read2.setCigarString(common.length() + "M4S"); + read2.setProperPairFlag(true); + read2.setFirstOfPairFlag(false); + read2.setReadNegativeStrandFlag(false); + + final int insertSize = common.length() - 1; + read1.setInferredInsertSize(insertSize); + read2.setInferredInsertSize(-insertSize); + + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString()); + Assert.assertEquals(actual.getReadBases(), expectedMerged.getReadBases()); + Assert.assertEquals(actual.getReadGroup(), expectedMerged.getReadGroup()); + Assert.assertEquals(actual.getMappingQuality(), expectedMerged.getMappingQuality()); + for ( final EventType type : EventType.values() ) + Assert.assertEquals(actual.getBaseQualities(type), expectedMerged.getBaseQualities(type), "Failed base qualities for event type " + type); + } + + @Test(enabled = true) + public void testHardClippingBeforeMergeResultingInCompletelyContainedSecondRead() { + final String adapter = "NNNN"; + + final GATKSAMRecord read1 = makeOverlappingRead(adapter, 30, Utils.dupString("A", 10), Utils.dupBytes((byte)30, 10), "", 30, 10); + final GATKSAMRecord read2 = makeOverlappingRead("", 30, Utils.dupString("A", 7), Utils.dupBytes((byte)30, 7), adapter, 30, 10); + read1.setCigarString("4S10M"); + read1.setProperPairFlag(true); + read1.setFirstOfPairFlag(true); + read1.setReadNegativeStrandFlag(true); + read1.setMateAlignmentStart(10); + read2.setCigarString("7M4S"); + read2.setProperPairFlag(true); + read2.setFirstOfPairFlag(false); + read2.setReadNegativeStrandFlag(false); + + final int insertSize = 7 - 1; + 
read1.setInferredInsertSize(insertSize); + read2.setInferredInsertSize(-insertSize); + + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + Assert.assertNull(actual); + } } From 7cb1247164b13f8976b328901fa2b95fc3bff47a Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 25 Apr 2013 13:53:32 -0400 Subject: [PATCH 213/226] Rev picard, sam-jdk, tribble, and variant jars to version 1.90.1442 -This is mainly to get the new "0-length cigar element" check in the sam-jdk --- ...2662.jar => picard-private-parts-2872.jar} | Bin 6445 -> 6445 bytes ...2662.xml => picard-private-parts-2872.xml} | 2 +- .../repository/net.sf/picard-1.84.1337.xml | 3 --- ...ard-1.84.1337.jar => picard-1.90.1442.jar} | Bin 1525728 -> 1644252 bytes .../repository/net.sf/picard-1.90.1442.xml | 3 +++ settings/repository/net.sf/sam-1.84.1337.xml | 3 --- .../{sam-1.84.1337.jar => sam-1.90.1442.jar} | Bin 615411 -> 617595 bytes settings/repository/net.sf/sam-1.90.1442.xml | 3 +++ ...le-1.84.1337.jar => tribble-1.90.1442.jar} | Bin 259515 -> 265519 bytes ...le-1.84.1337.xml => tribble-1.90.1442.xml} | 2 +- ...nt-1.88.1401.jar => variant-1.90.1442.jar} | Bin 556173 -> 556173 bytes ...nt-1.88.1401.xml => variant-1.90.1442.xml} | 2 +- 12 files changed, 9 insertions(+), 9 deletions(-) rename settings/repository/edu.mit.broad/{picard-private-parts-2662.jar => picard-private-parts-2872.jar} (68%) rename settings/repository/edu.mit.broad/{picard-private-parts-2662.xml => picard-private-parts-2872.xml} (63%) delete mode 100644 settings/repository/net.sf/picard-1.84.1337.xml rename settings/repository/net.sf/{picard-1.84.1337.jar => picard-1.90.1442.jar} (75%) create mode 100644 settings/repository/net.sf/picard-1.90.1442.xml delete mode 100644 settings/repository/net.sf/sam-1.84.1337.xml rename settings/repository/net.sf/{sam-1.84.1337.jar => sam-1.90.1442.jar} (86%) create mode 100644 settings/repository/net.sf/sam-1.90.1442.xml rename 
settings/repository/org.broad/{tribble-1.84.1337.jar => tribble-1.90.1442.jar} (74%) rename settings/repository/org.broad/{tribble-1.84.1337.xml => tribble-1.90.1442.xml} (76%) rename settings/repository/org.broadinstitute/{variant-1.88.1401.jar => variant-1.90.1442.jar} (96%) rename settings/repository/org.broadinstitute/{variant-1.88.1401.xml => variant-1.90.1442.xml} (71%) diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2662.jar b/settings/repository/edu.mit.broad/picard-private-parts-2872.jar similarity index 68% rename from settings/repository/edu.mit.broad/picard-private-parts-2662.jar rename to settings/repository/edu.mit.broad/picard-private-parts-2872.jar index 54ef6d5e2d1ea141ee3ef9d98591402288f73e26..b6e685684ba330c92c78fb809981938025bea9db 100644 GIT binary patch delta 319 zcmZ2$wAP3>z?+$ci-CcIfx*0B=0si%4zq%pP7Au*#3tH$F#1ev^J6Z|pE>cmJec~a z45B8>Giovel}+|$Gyv0ejHb**`7<|fW0YV5GcGb$Fo9)!*etM5W6@S zK&mgW&1M9to}9J74xD^f*b3|% zDG_}z?IU6grkg|{mhKaA0GrDw>I$~UQ#1ukZvfID=ZlMZf@v*r>B)6sR$#f^VmV-1 aOFR@T*CZYR=HC^s0@G0vNnm=vL>>UL7HO>j delta 319 zcmZ2$wAP3>z?+$ci-CcIfx))fej=|1hfTA+(`=c=QWI@G7<(qR`7y7mv7dNd9!&jI z22qpc88w-K$|id=8i46KMpNe1HTIjgF-kCj85fx=n7}eVY?fd;k3)a*0x>>rh+P~E zKvOnfV4KYdHZG6D6|86*hX;tinUV7^BiI;5ZecbsLtL<#i5cj=$#aD*;0~O8SJ(=y zUrIzDO#6r!gXty_h^6~P9Khx>in@YKnd~W=0;V?rX^`{9#XP~ZmbmofIx&b@yTx+A cW@w3rg5{dTBf$K-;#FY1Q4&dDdcQ;-08)HpBLDyZ diff --git a/settings/repository/edu.mit.broad/picard-private-parts-2662.xml b/settings/repository/edu.mit.broad/picard-private-parts-2872.xml similarity index 63% rename from settings/repository/edu.mit.broad/picard-private-parts-2662.xml rename to settings/repository/edu.mit.broad/picard-private-parts-2872.xml index 119255e8d..677d27d80 100644 --- a/settings/repository/edu.mit.broad/picard-private-parts-2662.xml +++ b/settings/repository/edu.mit.broad/picard-private-parts-2872.xml @@ -1,3 +1,3 @@ - + diff --git a/settings/repository/net.sf/picard-1.84.1337.xml 
b/settings/repository/net.sf/picard-1.84.1337.xml deleted file mode 100644 index 99f746ff6..000000000 --- a/settings/repository/net.sf/picard-1.84.1337.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/picard-1.84.1337.jar b/settings/repository/net.sf/picard-1.90.1442.jar similarity index 75% rename from settings/repository/net.sf/picard-1.84.1337.jar rename to settings/repository/net.sf/picard-1.90.1442.jar index 68db4184822193ef73c7859b80e9c17982343b4c..caf2bc09d6fb177f5165882867bf8a16fe6cb5ac 100644 GIT binary patch delta 212334 zcma&P2VhiH+CTi9d+*GhncTFQOp*x+BoJD9D1n6Di}W6f5Fiu@Nk~Fdbm$<0fP-E^ zMT&?8r6xm>rU;51%UV{!wXPl4U0DSAe$Tm+kk$R~_o6xX^nT9M&sh3~UT&)uFwn)Gyyzon`4+}mk&wDkA}FN%rgrn)R# zH?)U`i2phSQeICn-`pLW6nNd^^nd1})>Nrem5UUrbWuFbW#2pp&FAj|Ct?<=Bvx)? z-|afx;iN@$rwh(q?D#pjn4KjqT1v|}*K(a!ILSjRIc62dtY+tKoz`#zvXy(-x7Ilu z?m8FUOY2>N&O*-Ar-v?Y|(u3TS%?{e)pod)aFg?O4TXlMr%WQK|OWMvc zJJ`3AoyWNHUF>_@MZ4(a=Sy`1_HZ8em zYEgMraYb@v@tnEErBjM4ldH;-=T;Uc&!16Tnp|E{hN#M!rPGtEW)vqEmCT%8I=i^E zDtUHsRmIFHl?Mk+TcxQ(XsDU;kTcQoH0?3y8G1JG{%s?Ty&TA+b_NxYncFKWrln5% z4LU^C2Gwv{ke$OEJVMnv9X03}J!eoC>Kd3ctF`+$oiOMmoieB=^)jeG-4ZDKyIjM# z41-S77=xat@dmv>lXW^{(2I0dr*j6qL@(=f-k?|LRfF14+dxrskK3TvINR$6y+Lns zAKwZTmShFimJANp>4HIT(>n&eOOpeiHqQ$LY&<8ZlpjkB3 zpbzOIgZ@e%>-3317wN1)m*~^LA>( zL~%?3Qgr0NV#6JV&@r;XM|7uo`Kp@YuO?Sol>In`6{-iwsG;!Tg_`t43@@}Dphy{G zmmb9E2PjI$IHV_H_+g5!CU=^4oIG4X3?jVM6q}}2Q`~;EqR?=-378V1PNZS14Kc7( zj9?miGoO-1(g-9OR+nfbjY7ytqiGCx*L*TeB>A}pK2=2HDU2r6D_~F|jYWJ6jYFC> zzHEGXe73Y2YOn_$s|&m)M3d?jwuM^LQ01+m!WYoChJ3fB9iqg;l$2gW4gEFLXftIU zpk$0PrBFRaN@0*x9B-L!^m?q+amNFQE9&5_jNEa}>A($vmr$Eal?Vq#HO zS`s{N{JHQK7o_xR%HUa2>&?;>tXDBbV}3f)ZCK-3ewsaYUeZL2h{tss z713mjv>&C=6l@8VrXto(YDZUBfl|e&&YGS9NK=uPm!pm3WY-I13AqJwP^7Lpka2B-T*w&EdvkBNA0% zxbv|7^Rb8vDUoiMEyr3F$%g&BJ))@ulOU)RMR>DVF=g;MsJyNRjS=R5E)T^Yrd9_j znVS=GcC8K21r^UhoI(|JkaHq+5$&S`U41uUCxfBOdp6uSjL{rc%p7-EjCpEzY*b*w zx`qd@&ReJ0x{*_K53F677tVQOd%SmbYc|o=z#9v@`O{srSfo11C7Rkhe}1KU^=C6K 
zbEp|R&Dm+ePD^&ubb;(Hk;+bnOJs^Hj?Q+89FfbuR_wIqU>o+eWhYM;?Ofy)?b*q9 zi2~8VDLRTyy6CKnE?BMxBGnxhPficZplX zfG{yo4C0#L3>HIlF_aq`=Av{loYO|Q#7Hs9B}R)e+|xoXIhLJqx)|@GHe!NH+$tt= zF8?HsFXG6_>`Y;2Dm%sOOk-y{J2SYcne5!g&Ma=ML>IFiqLk|?bBS^>hg+*)r&1RH z`~7)h`-{2k%wuQ1ODqrz*>^iTcesRqk+_q+cfl(bv$I4OOIdvz*v$EcZgGi+#KU3Y5wSH)JSxVrQ)KU2h~0*N-10Vdw(DYtix!BT+=IuQ@ayEk6O5FDZ}~Uc($jT%O{X`pC9tZx zcv=^GuzEV3#=_|08O)k4o<-MmvG?G*6_bR%PZ#?!A%S03UJ*M2->*8IvWK2AfOM=k zXe&Kx(01Bk&@OhK;<%@2J)mS(YeO6m2X%4C5Y?i_pp~>r7ePZD7Do)AM5_$i2uB>{ z;3f{5v?{Q_EWzCNYNR8652D2}@tiJ>8{&jGrHj*scwW3*5Uz)(~%sw{&sA5O0fj z%v?uQvy3O`NzV1IA>I>z!DJZXeer=VJ~YHf;;)AAh!{hBEIu*BMRCa-dbc-ep!ih$ zO&6aT;&VD|h%earQd}~`SDfo>cK*)JH|%`Nso$~lJv%>$9}V%7xNL}@+4+a~1!#*m z5V$AB`%m$bF8*bRU&R$e{99Z#=o(!&8%2xE4u<#*6DzKM4bfHz_h4aK224aFsn1ajBrGz#Nv20P(Ogdv1bu*8NEsYDq{wBimdUh7po zN=)FvwOK8^N-QqREQtaZPtBTKG}}<(lnCqwLy4!u0F_yW;!_gL%dufi9fs0CydxKXB@n$oQ61NP?7?H3mT9=A7UMXC12oZ6&Qk7<=u{sp~m)1LCm>|D1Xsrv?y@jai%OhbL6^{ha&& z!q~E*WUowBx#0&nbu(=lOAjFu$0U@6@wL10htx;#!O{71!Zt+VIE4~wM}rbOX&9=0 z3<)@Be@I5c<6P@*ObvFkoEDr!`Nd(x*MSl^v~ZtAs02`&!CtdK`Ih8CjCnaNkiHj@ zjhPk%YN`QKbO9%f1|}E_5ZeHFQ!=oorlK35Uo8UGjP}z3gq(nlhq!w4#Y!XD&lOp? 
z!9l1xgB0nDM6LVG~Au$L= z6+h=1z|KH8VT0Hy(2kLp@usMtpJEGa@Y%OeM;~ykxEhK-K)zj+b&L`Uk5L1lV~Or0 zplX~Q*cyE9WOh>E)KKFfH92{VQVY3o(;zh~up=Y7*1SOX=|`xApaMsz;+6$YpW`e= z`kd&U&*5_(rZm4Oa2%ub!UL2Mq)cQ$nJhtt1&*^K%I5%QxtzY$(D9oiJkKzIlEIsZl^bLEA{k+dm~%tMR}th zqV^fysPS3aq`{f^W^>stD~g8rG%Ms|WKcg$U4598Zj;#UK8e$voU@02^zIwoF?gWA zJsJJA=DQc>mPgRrnq%O-eRlu;_s~ew>c)hb35^Ngn`7c3?Y{92P{h6kuC(+Z^~(=S z!}Rvc504Lz4=c-$jOTf@&B~8f10|1m{r$1Xw-m(qV!WPe8sLjb_jQ2nkF%P_VguCBIA2VV z#``?7N}q=-siBdFX+nX==h15D*4p`;$n(kRjukaD2?2LeZH#jT+UF?jET5O_o5}UXsz>NHg-+3|8Y&4@>2=R` z9H-JkrfH&aen0}k=aD{}&pW)(?(@fh_(P;}n94xom5)}Lw9*JygXj(Xn?QExAUYyI zYIs3xB!W^%18tCv>!}TH^?YFNU4W?%0N!0hm9&=T(H0>6JLwKwWs7i$+=)KjMdttq z&I7r44_EPLfC1m&j{T9=kpCAzfa|nQIOtvh2qSW6qv%foF^o2eQo3I(AXD5)4~pfq zMchpfiH-EI*h-Ix6ZDukOS{C&v>Pz{3Gov>Ew0cWz}jb&MB1w~qC)^%)k+4{D6J`| z6aeGzK}VH=bW9md$CXKRLYYn{{Yp8VQf{Zyib>Bad*}t_485qlf{bs`OUj4zit;JF zs(eMSDL>Kc%D?FiRiQW4FnUXkrMJ~2dPi+W@2lzbftpJnszvlybpc&em(nG59et`E zqR-SXF;-lxVgN|EJ-DBb&{5poHMp^l!RL2kWCipb?*Ax^cL*JaFA}rx0G$B4C4#;Y zDkktW?&)9X-(az<2dR9WegdCmJsFs_FX#+X!!hmzS;B>p&0_8g#VEZox95=0g8_75 zP7HiM0ce~DHztN}a@y~ga3LF;=T-1z1hqk*Ux!bje048#X8?q{s87;c@M$zaeV8tQ zJ7c3Q>L7XtEE+rQRC_UV2D{|E+K}D{XU0MAscz=WkW*eHL)vnX`I{l!NAAdi}|9>LP zYk^Leb^7@_Mdb`b9^ru7?!&@E*7_=6GsBl2zOM`#Y9 zw*=!A0%%$@2F{fNN1=a#qovQmm;RLg@iz7iTs0-%w)@6=5&3#VzB5!b%olcqWYKZ# z3l%kd2A6DEmLJZ(fve(e43Z+_!>7kk6@ZdIK03nON=-R+Kvb)p zaB_rm_l@lt@p?7Q%ZLY1n9r!0yf%T5x#7Q#f&ET4jJ=5vl!1X32pe@0cIt{#d#iBb zDtDnDJ0?HAGXlA!9{Qu(;iNlLPJ$LI5u5S z9nKN%-8R^QI7YZ>TcUa#Be1^u$r1*5a+75VtToU%3loG-IMD(04{>}U@JdvZpAV*r zRXZRLD|$?~qmPfGxvn%{81Ome$+I|nTuC@8;k6w=_TMR7Rdo?@l`?9*H-MZPye~zh zE~0qzA{`qS9>NxhBFBFu79y4CD)vS+s#Jsmg-l z);z5vdvk~&PR!Q^Yg6At-1D~s}ytr+suCs_0akxo&5*v|v12xM@p?~6vl6Q+P z;!$HRPCBLo6j^9GPGP0E_3wPzA-ZETa1sqv!X46GB;7j`^K4!lrleE4*VZ~O&|-H? 
zgGG!lDY;5ct>Lqe%G{Tca8=5ZK(lU_;9HBinb3VN)I?y|!BV81o%N*!#UcrDd~1`=>L|Q%0Bq#;{L5uGzEs_dlxN<2 zaOr?}oG1LXWkFe>(fMd|T#ShD3!K<)v{q+vl|pO5>?827z+Q0B1_y0)(0v?&@XAY@ z>|ImQ8vbzzax5=BpmiGX>TB#Xb$ZZ=xdvlv3q8a!59{;@S10S-YVUGM7RBy*)WOm% zg|<6r2Wb0lw3D+#xV4MG((=-7okd$u(o-(bq)&6q9`-%M&a+(OUiR&C@kDd#L7fgc zs2UiMi)l*-9pUt&IvsOSD>}hVo#f1?9CX@+!}0WvfT`9N#&wWCgZ8eM%GsiuUXlM+44hZjUM|omx~eHF;|B zyqQH+Gs{W?E1wx7wgfIdvpKN;*G7SoX>Rk>xAt^%@nNN@`PxE}e=v7%Usb$eb_+8S z0-FvbtJ7$@l&^)oNpBg94hiZJ_~^E_fgcY<05bOoTtCo55ftchuu0rIoTU#7%!bl1 z0~f(a0|fdwmZ*8&rT26eY5`?3a9d9llpPS~vu!NWDmCXcR=t6wLka2y`Y6!l&|vQz zs$eUvpQYi1%2@m5XX&cT-;1!V#^0wXASIGC74xP1}kbJY*u zS}7UFF%M<{at^Fy;aDZ^S;12dms6RiGXQ@`7>*B?A(X)UAeNl?QEn6`LqIi_3K5G{ z3gxL+^?1%Q0nK2!;7&r-3b4sPbYL>)nt~K81adG}2YUwY420I;-oY(H0|2rF=r~K} zWWKT9ljx73uxc;|fF4Jrxg)D7isf(7EPr!@HUoVeV=;hcqP&Hp#{4qqx(%+Y*|AsSYFo@V{6IM$}KeZ0HtGr zGHxzVX1xWT;4NRFhjP4>&Sm6X;kH1B()ag=m0D?1Aa4093vm+=(UNPV52y;uE?b#ODJ* z;5W<3XW{IVm`~`HU|Bu^hGzK)=)JWd2Kj#mZMp)}zY@(@VoZ)S?YaSG`9HI%7_c7E*M1WMJE(9AG#(flJ{H1Cw5EbnxHLce9jrI=%F^P3GBh%kQ0+AkUJz zJY!$K&&u=T+b>$4;0K!>Qd8(^_@|#P18Tl3!rZ8YhvcT)zK9IG089U|dKm^!`AOftUDh|$txR?vd((^fHfrA#hurznD zvxuELb=LQZ=aztqB+z1ZmV{AftVkG^WjS)m2h7ImBtlL0S>{3$#B4I{F+1%+fq>9h*ZUd92%`7Q4x0f1y4t%*@Xd899 zFOYja9caP;8fYOR0or6>*FFGn894h@Zy+Is0psgJe8&WezaE=va(WhJ8-NpC82slM z5PxjeX^R2j$HR=F^fgaUQW64-Psb)3kb8_^HJ@yjBI&f1iN0+tKZK6Y4%%rJWre53 zar2L{Jn?aK$RH><8H@!&DX5^iR6J@$z)K*2%%|1{h_}54kn;UHZ8Shy9+Zklfw$IW znq98gA_F&0Hwk2a)6D!t5%bKSzEv6qPJP(o;6vXmQ2bHYS*+?3%>%%`&q0V!MJ~R4 z3R;Ka7)rOnE^Au}Y@uBx00`^q)TT0=XYhj50#5Bf9mi59L}EhVa;3EuAhj#!>xTI5 zG>cIWPG%IM4gQea10T#zD516-iuB^t-iXJ1qAUGSOaVLK55-xFgRzc*a0emFd>Mus zjX9^%NHX_nkv4PhPmy+inpRC=EcY?e4paDHim0YY3rGw`u~{+6I56V~B{65G zGdvBMvD0W!rd2YzK=Xl7?WdG#YRo=rQcbDwK*|F+neNk$P;-SKy*P=q6i&w4&LBh=t9Q)CwaY;Bp4AxfEb+ErGY)0=#V`(2HAvolVDo<+y1V;;LT+=(-UAohOhv zT>g>JXJS1h4Mnn0!FViGd#pUG6)}Jggyy#;3Qxp}3yBc&+*>h4@J&X(NK#X-Q>;!? 
zz)8P~?W0J%Pytq$fn+8K)3E(0%7MEpiZXjT<*kk@J#?!_{B%&NZwAjqiuMYtRR(WF zi)2OWRlYp_%WKVnlJH*^$i;-fkOR>{kc;i4N@no>c+p45D}3Phi!SG5KJpEz`Vu%5 z?ld>}gxeh4Qh0(H2@u~W6=Jl*v13Mn8#fX}^C)VLMbE`7*YQoR)H689U1zl+WK4I;)tjV(n570~rs4 zF%Lr;NVYi+kfv6%G>kP%{p_-8sS9|(10mH?lSy%tws0FzFZIyF>;pv(@iY6jaZTGf zxI?F%4yM0X(JlvHzPnv4P3H8c*!Q#xSj;mzJ?mh~TVcw(HQ&1jbUFw;1H$l8KIr%! zYc9NIlT(pN=9|sMe9_wM+d{N5vwnz(j}c>`#@KHfz+Dw1s%K~Yn>hulr_YHjNIn*;?m-ZqAIAkCQmJ@DoU;@np}cF z>D1&YGm0vtrmUdZvx|#LljqEZ4n)<$eCLUAyTOpYCWtz`+@|}e^#G8WH5);V1A<1=RlLMQ&7_LAsz(i@wb^Mhe#%*DhsZH z&dR(Y^;o2R%&`9xI?JTL`BSFos$Qf^k}@BdC0fRRY5?uUh(6?p)=8K@dJh+nC^<10b$|M{#iyH3;VB7_;r#svp=uclTBKiY-48@gN3sWcHY8{P`6B@D1 zZ{TiqLjMIw5{f(qwc8MBewgBKw!-ur$vCdhZ4N z+9yX~k?HN~M$irb>oz%pc9Nb`{IZ)iQX?U!&;Z7mi-hG7EwO93k2`^gqz+(87h2Se zk-!JWM)NHc!IAJvGzH!kAh@k;%{NO{183U|mdhdXYR4(9P<5w+%FftLL*1Dmf3uiU z_H(gpi&XYWACtn3l!N^v z>K4|DVRjHNtU>BEYN}IZ)^9k43K%qe9_Iq#ws4&GxCI!;3CFhsx&BivckgG%zdVht zg+KFeU+5kSIeAya1>3Y2Z)p)p!G(EtM${=>-S#zaOR-9nz%2#e#h9#Ci6;;)-Li0M@sI-r%vm>B5AA zDSN?8ITblBF!DmAd9k^O4qW;y)68!nBI<*B8-x1J47}4NYzG7{>LTJq9X#LPMR-MT z^IR9v3u#7I))dk2A}=i)F5dbb3dpcY$J=*Sm`^ zv>Kh>Gr#O1vQ^x57To^}TXK9r9}0@D2lp-L4&tsBs#i`at0+#M3hS7PnSjdR*HkDoH_5qaY->D(-nO7(!j9zSlRh*I{lSFBWy=- zAF~0;C7mQF{EVSs&sr#GFl^Ep{(*z8)7u7pOWzsvJ^dgdGSGn^>3s~_fNI8N1F~{B z|6u1A`X}?s;rz<-Jd0Ouo_Zvrk=d}9@I{ZI-vovA7(8;&h>3%HPaHhd+)!l1M7?J~ zNUj->^FsgBPG8jm?JkodA*N#C3vx~xVIkgb^&-t<9q>7naZu< z?fkdkw0A?0;W9(&zLfxQf~Px_z~G!sTzWGn9;t&7#yW&j>lGUc=!~_Iw*!_3Uxk-P zhAr60K-+?jYz)G3X~EEEBtz{cex{6F?1X6KXj=T75Q45TYLk$+V zX6AV-NY?WKyKaK)4!qI;*=&I`9(gTVdZeWK7+hPX8Dq&@zt#v(i3XUC0XzWbF7{^q z7MLCkFg?UPzSd~y2Q0F{F|(cH0MhXRRE`JeN?@qmpdM6CWT?zqj}QP@P?@`IfwhqU z%A-*2SlprGaN&#xXuDN*(*n%dp>8tI*|H5`-*i7lq@YM5;O$KGq*g#+#mquX37UYQ z;u?YBGaK{5mp=-Bpjgiy==s^sT3pBW?1vZ~{BF}GE zSv2pp@1PE-#@IsPnO?p1VQRXCQV{`hGJ@l?UC>Rmods6{++&}uHfp&Jg}pNtgy*TS zxVcjR;im$^7h^T2Q#wYLi?J~b?}!of#t24Z48;Ih3&=kk{PIdUPb?Mgg8vwjvFzMm zH)G5SuEd}@j)e;fDjgvu2P&!_glgtvV%lMAEQHU7W_o}pIS13jlh>4V;0rfE7ejV` 
zhfa%h$*u&)68}SROB9#P@_P-nTk<6mZy*f*EB$ngniaa6yITaQCDJ(pGZukw32vHGz*&lXg&Z zwNR5NpeFgOnX#D~9ixIe(P{j3r)zWnGH(Sccf;psH5@54jrz&<2n^8*%M!lIz+T>yUtphu2j_Y6{BE!E9r8 zX@VbzgirG4sB9^gI37aRW$>|xZY?TZ5jqo$8(O(m*5z7Rmupq%oHQW)cS-;|AODLc z30T!@mi${tTUS{H$j#;9E-!I}qVfi9ePnDl^?=%8C1G{8Xv9W;m|20QtNVFzYC5j0Gv;hb5P z7$L2@bQH_S2mnMRe^=#Rh{8E22?N-=G52r*+9S`Tp1MktQqNdpWv zu@d)P!%85gf*TK|GGn&^YC&oWr#7Z0V<{D<8hwXwKqyCP>#%lwb4W0IbWw#Nma;38$<-xG1Y zq_21V!?U@2%U{+k?H5bLe;v&wPg`3H2;BZyI~$M(bLWfxv63%(*E;b_nB@PJK5+G5 z8}R?yxXs0zMeoShT%I!KQ_ zkh7StA62HB-)$53s#q81GJ!gpC7kBA?IH|O&q{?7v*~ln2s3VnI15d%|0l*vw1jthdBcb!HzOt>p4F2K+l+$g$yUyImyl`c22YN zJUcIN$usPGk&Ev!Sc}K~2g`6Eg29P9#X6-`MBm{Phx8iMZNP}aiEVPm2&x~^mLnqT z<;cy+8DmEFRvMa)$HhDk?@s=wVJq?eRIpz*J`ScYGlQ&ioe8`lOJcOvHjiSWfy)38 zb;EAteHkTjK;Gt2*pa>EHje`5sjuATQQ$)LmD@ZDSYlrsf6m7YC>?k(OJ=N+-GDTN zYHJKMSdo$i1zaGu*kwJGj=1R<3&AN8syTeOc())e0}_q&C}0AOtEpK=DA-B{o7V=X z%3zBOKD`RgoJ5Jtwoizb{?9OipQ#7%jh=FhlG_12(9js+vy@*P(AHrjTfu)8ijB@F zci`YBpt>vR=g1bZ7kz2D22R?VvV0BS^n5$qhV2}18?|%6&EVurBpXSq;L?ubLOPgX9F4;5y0>Zr6F=HOgnGfA>1S+8^I^PN%?+84hE6~&)0AfRN&yNBS zV&+ph0Q*uPe=EUex*MqBPN2L`12qf+Eqoa{-T-d!KJdN207rnbR|RyV7BC(@|s^k)f2Fs9w=Q+MhOiaz6f^7I1Hc> z$X!+la-j2Xqg%z;kF9VNrX!X4$j_mh<8YmK#VMN!UmkD5YV>;?0QgYaEtyj>7~u|} zx8ndbOVQwPim>0n#x!&))2UpCpym8iqM1Jw5UQ$ataA%uQYA%I|4j}fsv9PgnKoE) z91`9Exc<|yOzm!NC<~;e$%(P{v%PNQcA&{4QNDIVGqTJZabDf69m&cF-5Wi1eO>|% zM}qyP1g7>*a+(tl#M;gExT1_MtdY}Er%pO`4!-oXSfQ9bXDe={O^(@Zw&L~4L+>5Z zW5mP}-G=q)HDY4-Zo_*GpO~9t_Ip;0Yu{9i5%bNjR6S+zB_`OjO@Y2LM0v<8PG z8-^q1U-pXkrvoFyFq?zc2-X)?&l*P^mTRjm_)}ir0Mca+rKVC782ce6LFAmRv0^CrMw-+rn<8CVehuScv87 zuf^E*GWCmOrG;JkqM9oeNfI93@kadF#h^A8+oS@AWUzI8OvJJRhtZywpsh}MI<*Ua z(o?zb|GWg4%*pqC=`;6@RD2KgRnk>pN32VbW8T_VX>IP`E1H?XzRC|NEkl@HTIib| z`cep=%97KNpIti*N|h9xTjaM5h*2v!>x|tf)TVB$$(?p4@7I;V>;05hwQyX~Nw}dAqR5OCF0VO$ zh|<+Ll@z5I$xY+AJTc%<$(qX^m*)f`&Z|X;UzZ zBAH_vJK<}>PAWT1*=fd3b9V4ffGS$*A}vg$iwvy`h6Ev>$mEh)9G4x&?*y>ZN*Aql z(bfem^}I0APPFGd5In9C1<+s-{tjGKNA`ANr!zY^PpkQvfYo>=0M7w*cZoKlhfDOt 
zQvvYx<`R9_0eP{S9}8HG#{%Ho!jAP~0H+S-AczZ<-waUs&4ATnIQu|ZtQI4~#3+%i zw-ci|YK#k-^M(AN0D9hzUl3?3#<|3JJRg7#+{(^G&NqpjA`He%AFecjWK|V2*qO=B zZS2fqr-U8o46VV70dUH=&T^gt=nTb*3Qnzbi7GMI-sR=Z{12FyD(3O%tak&ra~T|m z#{*P>*Md~Bh#g3ERRO84D!6StBf#DzTyCi@mO1$8fE8k;E>^jqBaQ`KjfXL;w*&CK zM;B{#0da1LxR-mhUKbm}s09F(>|9~z-|Spv=QlWz1AoM9>5uSSfw*pn-^DQW^9H1M z-~fEAD+=B#kdGB0uVPnpUBMdyPVqN9R=_b~?0{`8-}mNi%wfJ>Hx!B%*~2i>1U{?-p!4W5hce z;5i8;5o^2u1Ggv6P?D5}y3)u{l9d#^Re;%0nuw$3&{Lv8cO?~Tr!-ZX2?9z1Pjh~! z08x0Sz);fJmx19bnF=08$mSmAu#>B_(v{YR(ne{^uSPVqpQh&_?Xxz}rJC5J;Mig% z+wp%yNP;V7D-S9I*8uqw;8fXwxnaY>f3JjEY*d212fv&5a9#ZU2;(?_iil#pJfu63 z2FncOf7raOT&WR{m??9VF8-}ZyB+A_3J@7k$^$7=2L3KTMG%3MUBGj9;mbbAr%;-G z{MG=|>AUdm2VO3czNg^>E`Yc_@G$|i3j(TVB}a4@gr(1-2OPH#KE{4_$#I1HK`|0Z zngUTfK{fI}ZsK65iM3G62?y=Sa!2I`dI!w38X_;96$%bZ7oMP|X&@x=${5>vWre2a z7zX_uh)I5pfsy_?%(R*kxLv5}KZ1`W<3uuS>r%i8tR=T#h>DKzKf9b`A-if}51htb z71Bc1s(yH=dM6l|HoVs)-&Rne`dHgQQy3vNtJi=Nm9(^4aMCHvTKyI{(P{zvhX)X} zS1Jw_%N99LxyZ{VqQIxh2X*xVZNnn;2E_Dhxu}}h z2a57#NM^&tlb|wQfzKtjfTG}hY&yJ1=5vUJkm0fguP$a{Bk|U-i{V&4-WoQ)(j2qK zMRXj_CNk2lYzCiUACrYr+N!1-LtY;=qZJC(^GeqKj~4rKuN-W&(Mx$;coF~q2EE_}*s{i9TV18J=&k}vcLIUmM^O$e zZFCmU9mUk@kcWwU;|kfXkOEV~>%hHana0;Cj(yiDS>~_HoP@mpmU){*V5Dr?j5XeU zdI3yWA2FX_sI=>Gb0_{7nwcVOYrIQ`oVUOd(j}pthkR$BgeZy~Vze8ycB@M9iBUX8xtMJ~!tuhoD3Qd%us8DDU z4_%X|>TlHE{PGT^%>N?&47mGGagMG4;5HQJMF+s#&f+y-sIQAY;teqf&;JYsz#Stl zh;euVXtH=m6pMF72>@;x;O#ts+l7F(ivekuq5N7v%`E_$PvC8z8opkQMw5mtY5+ zzb|mEm};M9ZGO=KJRiu3vALR}j5JjjY2XHMEiob!b#pCV(GVm%mvsvVm~4D-qR|n5 z?fwV@!2c4BK%Y+hjlMz=1(yGp=xg{?6#5vV=8rLXUhyG)gE%`qE4qOhaUSRI5IXT4 zd=C0duoU_{y85MX(hu;t=b>nRX!(d^4ISYg>z=Pok!KC!ZiLy& zw@JN%eyb$T-3{Dc(KjoEHF)TstojC415yF|lgUTiiU(}#6XSWVe?R{VaLiXiTq3~|>Kh`Xj9qhj#er?I_E!dZ$xM$=hoIUz_h z24?U}8@B;)ih;W+e9$o}Da;Jg?2}Q&ovoK`q{|X9oX=A8le?5ef0<>&Ue2f0i`Qs$ zY1qyumjRE@=Qu+73eqZSs50Lb?_w+T!!&Poo&k}lfhS$?63cvkvc)Dt&Kg>PP{vuh z*T?=2GT9zN_b+K(d%Xw`uK3*OlIi;GGbL95A$Q3ej1#Yh0H@57@>NNzzstb;kb)9U* zvBjMUfcFy);y<85{x3Xw^egOZuE5g%H|i*Ur+z?y1}MT$!xV5i6^-svY_tZD=mEt+ 
zPbyA44-$qKKMZ;cDD1nyU_V!)#U#9yHAP7hvy@~}s-%cHKr|Nt&s+%%a|8Sj0Il4v zv=q-MY2pZ8wK$_>idU2@aY4xuUn#lbn$kj1l(rIc_ylNLE^83~iUX!lSDaE#) zYAh6_y2LXy7$^N>pb1C7{MEQ-aRy++24AFKY^Ir3MoiP^!j2;Z?jJ5;t6KwbX~LQo z-5LfmkzWPSpe*u~2!qciO7Rd28*tdg96Zeu4j+U=iW&lPe7`e)(CS_Ux@V;^kmGx! zm?Ey_;~@tTd5xgC5_N-*BcEy!$>Hd0&&;0vZY?jmF zkS0-f5g*mfPp5?s5!^I$@wsVdTKXwkT|;-@Y}4Z1{Z_9%MhbT!GqQ^W(IC{Gg$UFl zjB2QzM3IDi3Sh=2d=oH^SkVR+2H^BGfQMfcD`TqMZrKK~vIcX$f(Mr=-7+Mn@7KM` zLpI5yIuzp?^03wPJCNL27jtncDuNQx#g77MizbR6sQ*ID|tb= zV)>?@RW4Z!#{Tas>GIv@;DS#Tzr~2W`nhu5Vme;^yW-Jh*{I9PwZ^jI$QU(OM)inM zw_DYH6{~iYuSf^e;?;7i=p%`$(yO@EoYuky)m4in%FQ+nKKZKJO4_~!KY2^t zZ`oKpc}J413X~K7bufpBQ|#e_c&L|yzza+y@B$O5uY+|AU{UR2K^FT4vTu-szz4jAhOje~qgly7C-4Ac zsE30_@^=(F&@=E5_<*oIV`r>R<8%reuhRsmHkB7u%`i`Yr^MUmSIorw*5=}IF?a(z zO{*|_HFCs*Mf{dG+#8thj7)7g3i#0Qs-h{g1{Rf%V7-4?)2$eY^uSC$-rU|+ja6EX zGGQWB=!5QoAltN(xs`YvOKrgw*rt?}RTi6KklSm8BMZ5DY#WhbTuHEnIZKWX7XFpx6urKa2okUx&e(JM9pRgic<2i z%e=&16%{ZzfOaB}kuCbzOmL1g-$Sj?OEG|eqC^v}33s9acfJZ9!e&0)UsNlc7co}t-IbvY!ui^q5jb8m(=$Q<^Jv!z)%K}|N3GEuf? 
zrnbqv)ZgLqr(iLYumBBlPq}$OxK7+WCVcsdY67&1bt5262j0YYS#(}t`6}551tJIdGKA(MIOAKCh$nzpN8BDbJe=ukoCHb z=(2h20?2OKhAR6C-RH_GS$W&(2Kp2RsN}@34LqtH^5Vrk>2*VCyf!)N968+jZqktz zs^PU+@m8Dk!ZvR+rLZl)F^U0l=XJ-zqz?4~k%J;uJgn-ZCY34|ODZ+`pJF_~Yj5jXL`L*E%|)IY__% zQxX*XDqH*3GIxu3rf*rS2bk8wIQKq6QJq=S511(>P}n}0oi zI>5nN=9&C61*>U~gPw6nD=!D_bFh|q3>|RLK?faTv=J3`xXE>6PMaX-w4$J`wboSV zV8I>3D9z*(ka+DY&~zK$nl0rmY_1yM@rY4oZX0ctc+R|BsriBz+GsgK-Z_C~r6Ehwm~dM@Xe>osP2x$Vr1$C=BQoz*HD&v^QD1&=~8(KOWGBMum)>=0mfc^+H}W zMP_)aS<+66kAW>XXF1FFug|rcBTh}z#rNG_RK3hfeH zuI8`pv|H3ysKo5mUK`T{yfCimbwCU2UalxEntJn6Zr-%AIoQ)4*l6Kth|tdv^kEAs zR$?n!yoGQRMA{RXqyHbBvAzTV}vZr+uz^=|MIT@d7|i%iC8z|0!p&y6up ze-e`#{54+_!W=PKZEU_EA{v{&PY#PWXLZnCGsB*BInAe^w{kQ1PSim;Q|3Yl-NY^bWRy)65NplLRiV)ZOsuc>_a8R`RHrxDDU z?jxoD{g9#d7WG@#Xh?^SJ0qF2IOKM0LvGO0A<*lZ(dEea{=kla%e9rFD-1XirJ9~r zO-dRVvnoRjTN>i*)udb31GiwQm+@~8WH5!*8RPAO9(M(R=vFsIR94qyVrXO>$J5G~ zy!CqoXdD!zj?RM4z-0z{!Wh6RFk&B|V)$l?K1LCRc#+6daV6>282vrW*!3^{bJ!K=V5ek8%9DmI5u!r@0%V zAn5EyJ`E1ud6Qij=uQrG?HGQ^Ae}X@@d!ubj7$wRvCpUBtz0~|3u0T*Ct+rNwtdhv zjwp1eg89)Ds=atV7s|WM0SsH1mpW^)a!ZW@OeU<50~{|Xp(4~DFmwQBob~;OVowZ% zy39zRRimJ@G8)ij4DPS-5M4|Fw`&q;#UgO@r-ETR4J7Y$Fu7)ey>**hp1zoH2l!~L zaKrXho1oM(7E3Z2!j17X){m{Q1G)BHO1!WaKvqcpfTaC>6w!io#~7v)$#$l7q-@nrzUz*evQVTV14P^mvFa#Xq>w@9*#>safn z0_al}!gGPE&690dha@@FMl24|RE*g=i_l<=hSyT+=eGDc1Ma-*7-bf^vx1a;5+QzX zptj=+>vi0PO8q=n0?RN1*|-Wu*|-WN%~JfiS#)u|qP0IaGLv&+@r2YwvId+Ts)3na z3eQ~q_LtVH#D?~vh{5e!FA?=KH}+7hU1m-5>ObVRGUEb95$^1AYy+NKCi-zeb9Pe` zxr1v_qgbtAF!W8pogTpLY@$<}Wp^zUd2pzc%$w*Xj}rSTMnWOujU$38sz!*%iqK`vO3VjI5O>+Jw{y}8FecR^`w3akYFxa;ur z^dGy|qFyb$>v$h2xOdoH2;nJz@2Eu@LHsip#N~;QjY7^PfM3^X4yb3 zM-tofZ~XIvWhk=DW)a(6MuJ59a0rO)t}n*ggC7sn)>)ML?BUuk7Ncb9Xzh&UOCP7* zugYptL@(p}aH8+S&C7p{jxqh25%IyRw`$kJWy4oXpp-6s#(ZswRd~bgT01N5{Bo_k z)xpS>+U-`S*RIi$EOoCv8?=AQ5`pdEjm#mlZBFymO`64Id2q?zc3-CweVuB(;3HEX z-=)O{->e~$Pg+gGgi$2`JG&WAF-NQf$WnPn%y>QaY}Lv^R5TAj?lY4`k;1K z-253cr_4y6KeMESE$#X38?L&{JpHaME(Zqt{idOfnU%?9Ad0!=4L$lv29TB?x(eA-PRg6NSolJ2eihDKcD!=w&H;S@dIr+Ku=l$ 
z@#q57;|?IlkK(wV0ycL6@bv@Sa98o$Cdo8F02LE$z|NgYL&R)6LOK_gix1IAyy`Vt z;0YeFiwdC~Gfo_Z)yCJjtL9@zm!s7S5auq1eG8%Y$0^F_+mIk$TB4r(F zz89G129({1viAYz2LP!}z@!HORJNFjN3=t7gAW)JH|h#Rnpfh?sORlT{vP1XV4iwn zvy3PkUNpOI=CY=gVAaq6e)I$e(MrVgTaGJrrnnS-#Hjnr$YlrG4D7H2E9ZA07^Xty%44eLL6(B1}GR>186!%S0GnxMT-PS6&$VLWSaKN1d539=EiJFTAb6Z^TNp8>?r=2wrY=p1(FJpCs(p_qatg?( zs$wjm)ih86bLd1zs+qIYZZjL-3U=qjD{7dzt*L#S*{qp;bl0~r}t;uPz+g!(h0 zY;e6H2>ifUw}GMI^e($OoH-_bnD6TRa z&9$}4{I8kth({$Rt#s<(X+w%;R!BSLgKVcf^h4n>%cJ!~^MIy!lavKH=u^G9tJ|`I zwwafeumrQw_$b}H)XA2RRF{_eUkaE-jhzXy+wMR2$lUO@)7zw8sXrW#+luBDu_^Pr z=EQp<=6hXG6bb-*epCGyej)Te zn?EvWzTDD&r@3p8y^-0~Z-?Dmn*A{i%;|o?R_XRWimATji054qYXf_{MZe(OO#56B zrdL=e8pp+a>x?bYEI$|%Yxud23+{}!T@z@RW%MYw@J}kWaVpGU3Td^?G$QIp$a0>%_689kt(wH6uP8fQ!6x3NY zX-uBmm^`N3ie=&v!scGVcGNwlzPgBFngD)=%}Q2{?C#t(^2$BOY>+eDuoyo4m5J*jr3V0+F+@%b-N;#nXT;L+}fW0h~E6IEXjK6OE zwxZBX%nrxhhU$3zwxW`ra{0DGl+zQ7#c{KdYC}ghu}}y5ffsNBId0&9LGJ|%2>Y5z ztbZ~|_cIL$5}w!ak7-E4Dpm+A^#4p(;-kaSFI+^Swfhona^Jr{MlIx9v}^*73AACs z*gPj4e;N48#6#m*g9P|NFs0v2eG!#?j=CY_IdPnF3T$eg-OG#Xf#G-}G)L@AGkd&t zyz2@oIez$ZJC4;QpAA-6QCU%gVC0RuIFDQ9ENmzaqO>hu;looh1vZ#?wNrhz;X?kt z61GLNG{4NP)z7V3x!FRj9k#`AR2O+-t9+pcMciz1%29`@{b*><_?TmWK>lcH#iq)` zI}FDa&yfOcUWV>YbnFt<_yRkbsSA4qJj;W0#Qu&;JU$;-b2YJcQ_3}&Fcytu| zgw-Q?$EfJYS-Cqjv^o(xrp%9xvKbe|qPmTe$tN8G54lnJq0-v(?#I@9`W*leuhCMw69?uV{9H@uLU(-UO5OF1-7El-Jd3E095e< z1mdunbXt2)9G9h{>z6tKD(9q`@w`^#eGv+_XQ7I5j^f2jlq_DxFEE@pcRa6c@VyNn z@eUsTf0vqz_wd^mf1z&Teb7Z8n76#3b(J(m(}#K{Y{dnuV=bM86K5|p3+@aoCxRHEusbK4d@$wb!d;MH?lw}htugnq^s__={EDMkE? 
znn8)^|6}b-0IMpl{%7uc_uaSLBrnU$9(W01$x0IT1VRFYMYfQzg*_wzA|Zh!h^Y8L zakmzvGAi!7RVtzqD2kx1c2NtpUt6sUSnZ;eRr8VGk509 z%$ak3=bSu%h{ggk-vq?`Rd9;2jejSWy=T2s)4163S8Ke8`>VAGC?C4;h`PU82WXn5 z3l~Spuw(BoLHHlEGF`9f_ntJkKmmD)6RgEpPrs?A_` zYctscS~Yt_t3e2*TK1GS8xfG^AVktUL`JG(Uua9%_u6u9*BW??)`;3Q@d9lXAF4I; zGOdMIXlwXXt(DKz+V}!(9ba0awet(L4gjb+0iL>uU!iS)ZSTc=yS95*#X_xbT z+7^C9yMq5xyOJN(uIYNiSW|&{?U<0(Dup!zlY`V6SHQ{PU#=B;yRr@4A4Gj@iKA&CCX<1>D8An#Nio&%GiajnGgrLWP^4ce%&2&` z+%SYB2VFVEdKpjY8>r>DKqUij-}5(Hn)peHK2p5?ABSWZV}m(AInSLU##SdJi-PI| zr||yCW7dH*jf5c~q^r8BO%vqSZY9KC{1I+PU<^`{k1>U6)v&9Gq;773~ z;_0FKGkKEU?qGo$-?IN3Zy1gkBix$n0QcvP(FN zr@h`_xVUN=Ji8f=CTG`otPXJdNx_r`ghEUo-`3dFz}Y+EXo@~Y9L%=4k^z<}TQFr> zC=>EKT{dNbD1HKeA+{x~NV%ydzn>d_e8YTo^D+iA! zaZK3)@#)coq^{>j>LuJ_Ae#K(dVR2{ywj76y^J0!PL9$u;T5dnWvgebMOsCxmFg}? zm;y|&6O3Fst_anpR3~(&53Ue~6lBLCif8P{Vos?(!>2M)viW9JvtW|iw{;(yBY>(4<%(C_LKL%kkGokPg+_`scS zxnsoX;d*N537oJT8>8n$7WVyhdS9na$z!1uf=_Tv4%kCR)T)n7n^6hXSPyx8!fz0j zWA!WKxON=s|49nC;S^1+h?G-pkkWdRWwb{ObvF=R1{Th<3wOd~E6eYJ71-K-2`)Z5 zB-qK!nLTsP?CLqQ;Zt_}%$f;hfYyxec1Il98HZ{0q?y^T*Qv6py^x6CU@@WH`jE4K zO^*^D!F~34@u_CS_s}smE-qT_E=%tY!W_3@*X>h}g3Y798WCHj=Y`7&{9@w9Fd>6j zHW?1D;tUpLFt=!XG$BcRI?3%4$1k_G$trSMBk52ORE2BY49JU7D+bMrt*pZXIhm=Al92Z2i;=Ly(-(QW9 zSZx)R)p}uAe%1=zTYx1+XXy?cGMwOk`_Nfmps@^=Ww3sT7r(Z>xeZm@D21Tsw{Y-E zJxlEQJ#eNZ$V&`-ZF_dLKd+kgt*s*ZxbB4pfQDp_hbC;GtB3Nrl$8R@rxlN zBg0sg-kqu>R}(&5oLc4XKa}EH5)q|Ag`tQ?XI>U()dDI!wSTF6={Ki#g52nDv>bvL z1RFbhHXTf)xCx>FQP)Q5&<`Rv)S=`=hA z=kEL{3ZdX46Puw>sOWhG-9L>wB-zAHc{sc`m;nBbG8DQLoJ$`B{U(3U;1g})-68R@ zhC;}mp~G4_N$;w+piuy5!e1PfM_q&SD0CLAy>T@W9YIv&l#;+S;7Ev&)Do-uA8{PH z2^xq(cayqZrbcc=2zXDNnBnT{^CJalVOD+ZmZ=q?km*rX#53s@T}o8Y_j+g}b(>M3&H6(8wNH}UqAY5%}bpg*-Dg%PFXT;5r! 
z$B2p#G+ne^psxeJe|Y!Wh&83zIHxzfg}&JIlxro2TnZJ=7Vyh3Wbn|0EdPDz*JvD-(Cj z(3kTO;`9u?2tP$L^>X~Q&D3}3BOwqPv137eW=d-JS><%cg)ghp_8^h>t#e6D_c)@HqhZ>2MXFqXYfmr%Pw_6exzdB0E9{x zTMTxET>?;@*wtU{1WiI0n#-=Ghu1kVBH^jbM-gl;BVDMEu&6$ElM85;ZO(F>Zl~*; zDIox9rt6m8tgW}Q3(bsv))B{K}C0`m_17$%^St`S^AX3o)o6c-X%!JL~4| zjaqc{dk@;B?k}7I(fWyETt3XT>SY=Ig0o`pYqQw?yq*vP4$}@x5?C2@HUUv0PR_F_ z^-fsd%4%LSTaOl>e-j@Yv2Pdh=^L}L8Tvwp((|y}nkcezxu70Ed@0d17Pj;kqn(|#ZR^_`nkIq^z~-|z zS&M%IK~@=9H~sb*_lPIEq7pJqs3Tx5GMEncQuL%DbQr-#8tgt3Th#|byJK+l51QnC z!6e5EB6gwvzy&`wVaE43<-FVPW;zfLDHIHe-0XFchdmg#0znKv2!22{Py+tg5D+hj zd+*e<#im92!pLDole$iL>Xizr}0 z3^E{CMp+Ih+1k;333`gOrRY5W3nh5r&w7 z=8hw+2{EqLrVUeCJ38wTM2xeYVoQoSPrSMyx^ju!sM}UIZLDqT#2lR~SpZ^yKyIM^ z0=d)|;u@3}yryVypg;KxBq+UxkWmJNpm-8c2E(VW*i)~MfN-T8kEC#gSAsJe2OISX z2zXotN%ur(Hv$ZPKC6JZg+Y=PerE!%G9~JqG|b&ucE=vvP}wVXXj)iyx<5N#C_lQT z@*_`*D!*=CKKwp7sJ;>v$ZG+?o-H#C=s#2)iEpGPz^sR~if^<9b0=N4bc@IlCEJ8F z2qvS1e0Dm;bIbMlK5~96opoDf)5rh`d;zRz7ef=V1Q9ZpB3}10s2u9qM7|tGu?>jV z-2}bt3R#t0m81Xr$Q+fF=9sQ5HT+e?uHu*cXKVWvRugE_1E?}<8Le)nsGn5nPh6d+r>WnJ z{ee)Mbw{jK1H6vcM># zMpx~|WjJYE93l-Lwic13Mt2lxEDq*c>Ccrkh0e(Qe4+f%Fp=4XC`|=w?MAue$lR}h zCQ4JJd-Zwcr#^leK3;*yDc>FVP7S|9uwUrAm)Rd8*tfSXE)z~VpGQ$f-T@X(8gjCB zrwie0()CsS_UvWZG=LNls`0rEM7SN}^A>2L0QTj#hIP&m|5s9&8AWiR&Y@@2d_+e8 z{3fKLh7HhEVibsXjD*Ch(1+gn13t77*(ks{edw*C-wJ(zuYX+}#dbwl+B_Hx#C5TO zesA7XhBW8nXafE)#?hm&#(zwfr^Y}@@DoacV}|?{uS7*dA493aJ|;Z@wICI~@^o4^ z5u`@M=0tlBI)JF)@N(0g)}9LcM5n*VQ1)~uM93Fn$5;m4E(lqC%%hTgp_U@3Ci9P8 znq0==ipZrEb1hZC2Ho%4ZZ!`~K^hEOG4tf(Rt?7xoovSwcw4ZqD;Kj`u&*l@+gkKB z`dA1M%Ejsvv1e@T_SLNTI0HJZ3|o&IyOWoJQ)qmj#}hQ~SEl+*Q!iOIu+R*3*86 zDy3x?E0tDn33%bd-?r*Jwf9yR#qYaQT`(~;vAY~XBBe8>hza7Oi*z^GNywDMiGnx} zP9cryk&TpX_0d{rwr;L9l!`K(PEcuw;u{0ntgxq$fQ|@hlHxEF)NK5z6!xigoeUdE z@CZDyf2F58;a=iNZn8``+x4_s_{|As^0mxF%{*>@CgkV=5+FH#-4Ft$V+@8erc}rn zf*aB*Av6v0S}ghuib)0+70OuEPGuYw%s9%g*CIA76Vpa`#*~U`Y&}DYGeF)Y7i&@Y zBfk$W;KfY5O-La6XTfz|YNWjLy?cM_+l(Zejou0PliwxUrv$*&Waba8D;)|+yaf5C 
zi?pv%Qo>0=0-%6}Nwe|+Yp_w^Y@uq(CcDWd0A(RR>CpS^=zFo63NnK%yOgfh$W&mO zgChY79*hKZZiHMZI0E>!5xlaHkAB+OO@R7Fr#`vY(O}OmL6O+~iPk?m`u3X4Qdm+Z z+ScoH3NXJ6$9J9Ievk%AdTN#lGJu(=$q+(Ef^Iz&V+fcE;VyAz^rdjR;|+q`R@&qie?_BQt7pdce7gM^-%S!}B{&XiZsl(lgKCXk?5C2XJtRS<)M z8IlSD?qp!oK^ro}APVfv zBLhSa48(Sb!$yk)!33a+!saR(BQXxYFong)QU6@HE{zaoqjtE)`{q;PHcNF za>8EYVmOOsmpFk`y9}XbNSy&=c7!XX!)7{cp~DsQ>Xj~l!>*EnO4I7uHFgHSwrM0! zg!>^n!{JgADk3m8C5KQE5lbmF0mO-eNt{^9b{J&ZUP`9z@Kk9?F(TqeLytp~1W*f2 z3cU}kzgYGPmcSb>g>dYiUs%sr0;SmQ#HD(X{Z4H4Ni`#i-_rBLggSIDd@8x?oKwkP zdZ5;Dku%DKn7JYYl_HvP#GOWNYOn~sTdt< zH#sPKU1WZFrJlzB9693LTlK+V3l2tHP1xnGt$K!d`+#5GAJoefZpE@q0zETWI0U-F zJ5SFQzSlIj*ndmH;7F#wM=9fnsMCn9n5*?GdGz@R?u6%B{c1!c%F?aPtt;9>7>tWEtGeBph%0jMK%zcmqAkG^g)J9%iMkv0mA(#g za!^VR3KC(@jGtuqCxAaU;NADbN&zv74zN`60Wg>n&fCvSNF5NE_aIAyiFJA${3P^| zP7v_Bk9py{F%vFI^U0nh3jx&wIrZaEwq_$%{EfJ!;m@*HhG!)y5QoEqhgk;sdcn~j|78Ew zK4jl%9|3CnFFCpP<0;6omxt zo4|P#;o)%njsUM!ieC!glaJ^0&xPxmfUg(ecbAnMX|H9kS41dmlMb&qz6KOde;1q~ zF(DRt*^l)YA2h2A*^3SwV0XEY>@F9Q-6h;F8|+tvE_#{n{Mw)xtC+DJ{uvt zDL*jS-|Xz~q@(=@T(`0h>B&cM-OAoF*vEA7Z;A%{iNQW~qOni9U}^U`d2;*0L1uP# z_7z=xZLn_~=;Ckb;=gn_MOFXK&b}uO@3a#_GmhI_b8-YefRHJj>rPh0ZB8 z9h_(n4(Y%`ZX%GV6Xf?Y^5M~_hxE)c6V z+jc!OY$2g#=DZcX;`Da?!_cxL)y#aFoJWI8kau`Yir9SAZWTqT?m~@sN^Uf)#G^a( zyk54k#K+7Z}0R;}-BY}?Qp##lfL%h{I5+GF1{ zCO=3;Qoa+RW)E^EXXiLPV)QQEVe6fJ5jXA9V@LHat5ld?6Bb?9n9u-RXcEvx`dlQR zU)P8~?9!`Z2bp{@FEV(s$x9HxiRFru-@__r>>b#J_Qdhhwk|Qdm~uFb1IIpNEgvSG zK$1%^arh3sG!^Ro7Pbb%(S+C4kqFge@=*+-JxaNVc|p&QWR`;l!eJ8!hKC1X?$+`# z1}``HSh$qrm3$nj@RuZvH#ovISMiA^Jf2TB2*9BvK4Fv=8l@1_b40RUi^;2}lIV-l zv$1Jf%cmQBhRJ90Y7-vRYYjfzWQ1ro;GrMDuZ!mcyZ!-7_x`6%kD&A!BW`3c;SDUMInMml9zYzG83VC$_&2VgyZ%NCcl_(H2EcbnaM9@Pm7Q5 zh7SHQh9ETsCclhtB5`x?{U&_(=r-%+U?_$6UVXH)e@CIoeQ*TdBVry3jx8u6<6iv= z`xR8_EBV%5g=dx?dt+VV`Pe9zSaKgAopbNdb3&rDZg63O{DuuN z!Yu@W4k7C3in$-V0P9t~KwQESUipCDhx1dBgOq?Up;RJb`$a=I>?zIE5_NN4GFQvP z`epDV(HkntipivB(1`01J*F3J)Z$0<+;eSu#0Pl4cevHaNA)Nm_OQphQ0#dW3;REw z^(02F4H9m0a0z@+eEd_$LW5RQVFMOZ#91slu6e8%2-G0qdK^pd4-++o^?`M_-WIA) 
z*n@j_>kIiP@!f8HcNQ!M9sIA*0DlDd!D%=z*V#PH#u~I3_%3&2ClM!J`I$a!s)9oZ zNB}RF%&`C)?*p^E8CS{VweLb^H6)zSkZ?jWLCpgTPu%xaVC%1jWi5w(lJOKV?g{=rzuO2snEDKaP6Np8s*Dn__qfwm%DF4+jJeNrC^Sv9qlRCR0h zkX0{7$-lg-pqaL zQNSfA;8KX>H-R)Rhmq@M)~s#8X7>uVMY|S;t=F+1Yd5fGVX67Nb{l&^yPLhD-NQc9 z?&TKkK5o+<;HLI4AE-UThiQ-TO6_r3(G}S4tw*m>ccDo@q10V=toV1rkoz9&$7YDuc|0Wdl+MC3crqdL(0B}(r%-PYd`A^=OR$Is zQG`nWP%wRpB5C$SEr8?O!+l8u1n3bk5M*EDbwN1=pOXY zT1ZEz56Cm;PXTNE32K*xG}M&{aRNaoJ%uoWeaJTYg2RNqR%HUe(f(I&3BLLuUX?G% zPuq-hFL`{7k=;=(djX@CF8|Axe=TG6poskeDu%>F!A1z9gzy{-un0VfOf02xFiSj? z1u%LG0&w8A8nUi1vI*Tm)4J^>3KqM%^!@|imM;YGX$R%!P&YnHa}N?e2fw&zak2Di zo+!TS(w!;l4V9q|zXg~lknVYQSa|pA**QHt#mnJ-W~Qn zdqna>%c1;U9xICCT*{!io6_XOT1WCI1i4P5So2YAGM!DK!v%Dh>LjTYe7%yZ`I&^3 zsD=Q5)i_BUg=gZj1A3n!)7VlcTgK`s@p320gy^iv1>#vrR=}&AAev@_T?j2vCw8U= zz`!Rb=*c1LoAii13bdK4;Qy_K)YUC#(;4lW64mGY_pEZ)zTxNCDXvkft)6s6|I59w z`0(et87ZOozw>$`{aJmYUrG?yy_IO&FHX*|_VJ4gii3~OQ(W-hNKW*;H@j%a(dV_# zB`JZ06ajZWt3M%ZZPD&|qz#1wmh;vr1$K^*cauU1&i2#E3OkdG=LV8n*mdNMze7A( zq9CCbREky!ZP+a)=}k@MkAb->u-)rf!B0kBVH_)M1bf}L z^tdCWy{DLdrJgG8NrBAic#2^cjk&f$QGFGhQC*&83AhSK33fl4){g*?`*j4+=x)kZ ze?%W9Hh-cUT~{3jvP^N}nef-4zZ67uj||`JgzIz`h_4E41wqZsD3UvzhiYX&P|#3B zZ}j4j4SUA!RXy^ARpBH02EQ&l%=$KXvno1obC0EjZz&hPU@BRgvf;zBhnNh z<*nkgU+VpVVj^j~!kRvhod$et@gs02^#V)bFOoh8SrN;T3T}QFe(8*grKOiJrP)}H z>5PiBN(j0_K~ee?TWsLrT32=xK82lJ5!E}aUxSzPjx-q#mY?!-ka zB@Y&9vXR@*9Ke{yK)2(D8D8vdtO7rA2{h3RG!G%|iM7gNq$s0?x!K=reZf??g@SED@V;<7)X1IgIOW2+N_hSj2TYfLnmPnGSY3IOyP{gNqI(9ir$E zO@|mdxakn-ZLIPOUqRL-OwgIy{wLOQUB> zH?WBJA;o~w4mf!x&!R&=1G)j+?vJmh*x8FHcJ?AZ$jNj3fwL)6c7eg+OeB}$PUlG> zputOkaN}`y4m?{SA4-Q|cJ3QaCnEr{;Uk@V6ryM2Q0C;LDQkNK@9U-&%G(Mef&9Eb2Hm`>3HeO@rwRC^Bi$rfO zK983}Y&PEo*Bx~RUts48>2wjy4G6W@+VzA7?>-lmgZ{Usel}&V5VepldU=@OT zI$(luAsrAPdkF=|UV;GGyp>~7L?DYLe62yju~$$>w3Azjgkfiap%{?L^*osp*W`&e zAh2W3koXa@JK}tjHGYv4ncF37$KV^t$S+Cb7YlMtA(k&TGsV4c>nlY4B)vquu?B7$ z^B_iwNfJlj(Gyb>!@ZV{hhN_M_WI5?fMNzyqQsJ3jNC|gow&Tdy#ZcN6v#-qCxFWn z5&MRg0G}2xNX!(+UiE{GLbU$BLcLIV32W*vCQhkiN^4F18aP;`#=r&wVp>;Y)0*`y 
zoz3tnd@=4|BVlx|Sv#S*9c}8C4p_ubztp0{Q$_B6qV+9(veVHJ^b{Oq9mB-DJ*`Y4 zv)zql8(N|fp+%vmguOCtqW?SkY?l+>jc2cFr!Ns3-T??|-P?LmMl>~LLv?#wXIm$H zHye<~2Cb$F#kxaLg%&d0#7RM6uLYo$V!L%H`Hm6Kza5=8B3JpL3AvOxTPC*A&8^Ly z<^O-(*|6K^)VX*ED0)#MiG~+p@8$eoaRQ>5tclD!H#tw^pzhQv7wM)z#pih#^C?|1RvZ^O5A}V@|+p$$FqD<@BL8@ zEz%aO#!#1GnZ^I&nt$k7F#m=R>OvrpE5C%Brah0?-NN}#Xc9ht)^mo51@r^S;zNrb zC5CME`w|Onfu!|mU>9^YgVW$fMjD)+!_f5qr!A={uZXoPg=+5`G*PbWF4~`csOMO^ zZOK9sld>EvT!BwL`*tj}b&;E}|HE!;@F{o7Knbg}k8VKZCOPeo9^IoOosiF`73bCu z@1*^;VAn}ms0o21E)lNFglG*rd8spoR)V(en@Lr9w;xoBuwMQx`~z^WzNIUwAnO5s z1=d2zU~oT}Kjf1Q2vmNhIQAbu;RHFXcGA5*z_+UHJ->=yElwZSN2i;J@BDXq0#)g? z{7QpgXY%X$4JN;l|40;{GLi=In>fS-4{#{8Z>I{v4}`(DoBU?J1GB!#f5Lxc@>}>$ zG3(d%M2o;{eQV-jzdYHV=W(hv2s6zX6XCCS(cum;;#0lf;5+$UlGThaFoAl z@M9(di@$Cn9{H;wItUYu7Wa5}NJ!3bj&EhFyVLvu{mwSj9dRp%Dsr?xE}A-K8^Vw6R4 zpC1Qd2{{^uW{)Tmzp0Qn(r*gQ1CSnU!MVn+fQ$zdSMYi(t|4~}USET2%w$L#N0XlX zRXSMp{!O?~%2mp1JM-Nfd~pXJV5*caU5Fp0`*$SLJDW~E^IT}Y3~GkARuu1V__ zsObGrX>tPTqo9YNT6q?FhIoipo`<)n=v+>r+zv8dOx<2Y0&>U2cCk3}A%Bns?!?PG z98Th&K}ar-PR`{$Ht9Crpva*qCV7=liFyXMzbSAa@AAO0bP>gWvLZHSsT+2JBVv(X zavcgxd4{E?xFPLH#cli#hvyF?4-U`qsYsJvmzQ2p-~rS)ql@)HU^%2TJP6I%*JHn( zxpVh3F95~QFzsR1$78<(-saUEkA0cE>B#liae|z($W^zokM-+f+0RJd=l%Dx0YDa~ zIA3DdScbbjREz!x;N2K)HHL7<{XK3NE^F*CgAH?DiW7%>;Bs9(2_`#pHp~T5^0foj z8$W^E>lRS=PUzbNbUx$@e;4Te4oIr*M9RCEZv(Qq8op7kV~@hj=y9~kZYW2efM#+J zbY)LM#kCjyyPsx%hS2UqsM)@N8|!b_v)sv^+NP<&5oc$ax$3QQr@Y!_dXV509ie3e6gR07nXgyUd zx&991c26+3`;psocxeoCo~AE0*dYT==QG%0MBpSh_Ak)kMfv|P4E9Td;%+i)VS;`N z{uKYqD)n#K$U*)|=w*y-#NSjS8@O-f%?uvPF{+6;ERr~+8WpjFboK|?4*{*HzoF4! 
z+e<(zTz*{Wq5L>{2z7N}GWiY80KLZXA?d=vN5m(U;|K;weH_rAs=TSSLU$-bPM~di z9hJKfok1PbMBe9V$ab+&1oQ`$mm<;yWW?Cb-G+dP2iRy%!*T{fB9Ezq;Q^HM3A7)uIFrO7bY+Y4niJJ&f^(5S%EQ9h*p}~h@5N+Xt zc@MLMX-JvpPdN=KNstW>9y2&-VSSvnORl0bRWt>T2ozpw?Da*K zPEW&%Pci)Eccob25JHDwx*Gc43DB}#f<^xY>kn{7*J@h18p|R> zj7Tf)nut1;10;Mo`xlTwA0wwvShWumK`jh1X2Y(b5gDvPteJL%mbrxe7ukPmrzlXj z4+R^Rf>8NbT{86TH(6=g?5gQkVWEc!bJ1ojI6V}HvAo`x}r zMNSd8@g50hFpgeis$(T)~v0e0dapca}aVxBb;2Zkn!;%*bghcw$G@kbylN}RbnCNXSQ z={YeX9r-^ZJk+I&>a58rPfbZZzx)a4Rkc9)%g(Tuv+$fj4HZ721?qVcv`Y2#F4xcf zQJ;gVDDl_~o5y#~iotHAhl&|t6+274zQRaSU#0vgpc^p3(7Tv^aJ0GA%V_FvFes^3 z$411T`xE|)<(U$noZ&im;_oobDKP~RZ9G?xpH@ANx`p?HraW+-<7M=Zs@B*E3Fjae z#IxXmB?t2omTCN}iD!%Uv&ByV5%wHgQU~j^rL}cvuw}J%n3wBo>uh`3^4h1^0*nUy zf5^IQKWjM58lPedF*fl3A#3Aa)>M0tt)OtBE1zNmXuVZUEj76?=yG759R#&uDJF?Y zFa)!xRhw&=&_Jl%3Pz2gN->x=7s)|TgWuKA__g474Sp-|OYxW6@Jmi@*5S7uza9AP z1b%1?e%G^EPJo&gu?-ce0B|j08}UnSE$H70d9<@T`9^jZGOXrv*k5c8D75Mkg7etQ_EE1IZN&T=v2ri^1{? z>5yf|yp@M6ZMrN#0Z{q>k_?b2|t?ke{yJ={b=A-a0?~ zY$>GhP-`y^nv%z7oi#b8#P$+#qSBKi#?FiJio4qMXz1>qth8r{PnNmkcaDoUNTQpb z+Iu#lVmS0)8DiFOJzkf|#JbT@DQSg%U(ti52V)azYjX6W+_?mV2=QnA8WS9}$1fkr z^=F;=4*ZnEk#Lge*sbTO)`XC(vk4J4;Ym|9tXT49LZ;aEp~Vm{zLS6mX8Bf|$gHvs z6u+MpT_A2vGsh$Y0zk2!p+*7G5QUO25(xUm$!igKn(z*4R^3PpG1eQJBXtmO9Dbo< zf&LtcaXuR_)9ZbJv34L+WHdv#Q}8%Iz|ln^(Qz&yALoM1Yv7ghbGC64NNu zY&;_u)uu=t6w~L7`luYDf;mKC-<~FOhyqw54THui-3x1A7D$h;F|TB9l=GKZ+@Yuh z27U>Mr$e`&z#;&6bJsr9hG~?hQg9|@$5`M~RNz-&%x;sHXV7v`h8Ho4_B(*xSU>}4 zp%VjlF;vEW_#Hi1PL~aI)@ZOMLglR>>&BH(LK6%RhBvTTY$o3n&8&qEYn&7!%Yig& zop@^tSPOH%?)BvmMzt(+{*w81VI~yd5mlY4068`QOqhsv=tgOev&T@`m~1@fik}EhqX3pklt|C_?ErSJ?le(5P}gd<1QxvRNe#QTy{REIjU*Y()c;NJe*%QvRR$x^Ee4DYx`*r( zonvjW$-N-N9{Asd?2JFpskclvjf_8GVmFXy0)us_n>n;d`H_S9>~<3Zl`;5!Qe@mlrg1pjMTfh|@y0!V)yDl4 zct!@Ld4SFyWDlYFr00x)y9{)!^nw95Jj!R0$lc|LkHvNxy)f9sR$-3`am-$E(| z)>IQR5y0M`HX-SFj6M=BGKR4c1m%OXqTD=L^#FeE>{FCKwkGQeet&G30J{e@tvJ$z z*dcq9jJ{%7yu;Z&w5DCINFG^!NUbH2cFW}29+(cEux5&TblojFOYMlv`Ia?C+?s2{ z`u~iHV 
zpeh~WyDhd-@p_&uZb#9MED>`BeCcg^$C|QZsc?#&g>bCyw!r%5=vB79fpGbP6vKyn-osi~Hmhb=E)a{Cy>uMqFZD@ZqSWhcH<0}D+H z7|EbYfgmjNu_Qj4_2*+G&QPt@1F)lkQwbCT>hKr!cdY5)_*m;f<{*hF0DP((XYc~W zIJ=lj!ut@}5KzA;{6*t027m7TEOrLg_qapW2mBF5hg);=eYk@hYz289NP**Xks<-t z_JTYMF8#p;sm`f#3k<(qd_0g{Rctz+$QJWStc6dORVRBURL@^^Izu1@RGy%BrP33@ z7(5#dPS^rVhfGK(#-bG}fE|R$P5X{vuUrX8Uj=P-DcY)RVdQNGcf(kVjLDdffQ*eP z)_fU?(Nk)q;FA;K%vD8UOfI#1bV#8NvJ?oYQXzdxi%ZABW3w(pfKK?iD0MuHFD-TC zd29z+-?|h>irvR46YqYO34fz1vg0l#kfV>zLh+1Ghg;DZVCQDSJAm zlBY;WKRa`qlJAs);*mEpmE3CLZ;Hi1Yc2{Jq?8DIf0{q9Hx z{{5n_X6?o1h5lxR^Y1;=g$(ExK^)thm}HhXko!_PETcm`9hM_HutC-`l2^B}6$V>r zuvG?YCch+L)m_A?I~hQt$uMDOtrU!63ApuSawwNhEW3>XENUZ`-AR7sz_2HiQ@M2F z+QGLQY$ILBJK*2b*`;(KH*?_H&x241$0SZzt%*B_T6NKyYmKM42;@gntozW?SNPsU z{G44i31bS=&e9$zXi94<#kL2qkod#tk~Kg3Hhud`?uSx)91Vo4l3v{N`}azM0AESa zmjzDkLi_l$ae=%oIvxSgTurTnsjO^ouiq%@6Fr&2eK6J`_CTMeTkG4~v&GUk6TE6$ zJmOmg|XI?iWPzsl3kC2w_r)ezM3QE4;WW!AEqz~~lzO0TZq6UlccftmqqB8n0Nv31Q zAclpkMO2`mIBIV>VWAu3FM0yMF8mQg$gu=7C`bbq#x!k0QICMLG;t!plQlOTYIrqu z79gqOqLyRoTng@Q8SFS+1S*IauC$e_o&!p_5fJW$IF@D8mcnPftIouXjLRzqdH1G{%y8`z(;D%HIHzRv0 zpaSfsH4l8K`_SFYbr7D>fny-GpANB9e7AeJ;qJo6(gXU1v}zyoD0VH>x*b@MIsq40 z5BE$LA>#4|#3H^ptXdVpYE?*Hin9u7U;)SP-L*!wfMu7MTZ%=X!a`^` z?*m@ZN47zBh`%PIlZnsE4%RXo?BYaJ3AH3v`Fr9BF5vuY5EF(R*lX3Em*O>Q&q6c{ z%y^P+hk|U6L~|&!9UY*KY1DLR!c>0~rsfgnX91Of4_9_gS71rM5_Th7SvJ24fgG+5 zYdJEABC<@!0GSTfR@KSEI5}L#G~lmEJ8~p8BE>Q$Jr681>>fD4z~h2AMr0$gd%z{6ZoXHkT80 zR4gJH__B{Ijmk6#mJI>kO0{3P7F7sXd3^%`$j%^GqVUuoFpS;Kor;3-YA}YS4G~5_n!3?vKl$_ ztKwp5wF^l145}NoP(Q^ExvH$u;Gpz|mbMO5nZgr_!T+=jPLO;PjK3j@$;PI-Ih$go z2Dp@D_~)38A;Q@uJ(P%cd?;4&ElH}vfk07k)~{V$f+a}ul0yRzp^B(&UfEjTxtv8X-&U7nHdNk*a6BChD^kX83$1imG${Y?O>X4S|k{jftDwk>fCf44KoI z*Q5ys8)dqfC7CU3mP71M=GnT7-pL?uu?0$UrW8UUijy)r-C(tTB<{fT_|o=(&M4R4 zv|&TTpYxd_GH`uzAlSsGKab6kJdRzy9sua(1WN{gVPce$Qy1C{qCqexQ20pt1ur2k zPjw{uAjTs8B_I5YnsYpOs@63FtP7&2atBx!r`*gbjwJ?Lx>TfM$dhuV>6c=bx={|N zhQcuc_vXlX9rHP%=PX!_FfZHa+=_Dp=O~ao@lc5>BxW=gq!_FVZdsa|?NWlJQKSL# zGD9|Y&?DLw7f%O 
zP^a|!mps}LGM)+^0}(m#o5?fSFy05yqYT*KdSy#3 zLX{GbjyP5;>X434qH5c*mgeE3$RT?YmP`7)VZn3*kj|$P<^&Vw1rt(yE7j5T&B@IA zJ#!+U2XP~b-;>iHb&7>74XdPTf^G{Lu$8V5P04i(Xd`Oyz&e)1k`y7)l2!yCAVhjY zC@&G~i%#|tWrV+{Os!y9b+Aq30u&yV z4IYDVxbUouJ8n9QwUdQyJc!ONZGGup5~WXujjzGe9IydKtmYV=K?lTYj)8eE_wr1{ z9pPD&u%DA>^Zs-;z`=Z*_&_Hg#B=C67t-sQo9QeM-XNWjhQZ=^li$)eQ(X+EniSFP zVkbhzlo)&nUBKFRARk6&!=2=}kIqI>(oz>M<5=G}ajfqHdAXA;ed(+cmcFw5@sxG~ zRjP{4CQ@}K(HWeZY~pZkvWZ_{z~C1oI?W)P-ywVk-JEIgYC8uoVj#?&d5!eBM;6g@ zD8*cZ&vUZLJj=!B^Ex^#F!(|{jDOi0zSvIozwo&S5AP>@m;oDL%(o2&Z?v=9 zoVJ@-b{#dtMRvZy&JjMq!Z*UAkgDPvMh`Bv^ULfU>e)3Mp#*T3+HfP_NjHUmNHvxwjsvhb+77Gz9@hA?&_MViBslt z_Qvd0P2RQbP0)yXSG2Wzm5Hl&1%XpK3ccQ{i#wZIJDS^CJG}KB-o~bmhW6&=O^x3A z&+tD@hm<~hXh2paR(erXu1BF)#=|hG4KM!WPP2$&o(7j5`g&JSf)==Mx=0q`6 zenVT!`Zd(HXvl6olkbIVACdk?y=4MEv6C8L;EaLPEbBW|%{taLH8ii-h;;r=TN>7l zt!=?iTT|cJuu9zWC;eE4Nku}&y}St>O<%0q5qaao#btliPxDK~WpC(3x-3LI{)Rre zk6m?UeG9Fs9Afd=Q>Ildoi=6q6u@D}3GGe2&?l2qk2W?fU%zrCOha?f{Bf1l)zj*h zR#i@~tz9~C$}~ieN1wnD?^gZ-w*&k+wKS>F@xq4sb`Ziy>O1K8L3k_tw1L(`t@e6n8ujJ4B0LC!*ih3lMJ_ zeg^uS7qIHxcl4>^&Wd;$?I?YX{CZT(*4Q?$t$nrJra(X9Bpb$NC~GNRX?Mo}F_M(v zIlw=oA&h!AXa#2;5^J}2R$UCb?QCisuPC3h%pH{n(rkzbN=D45PUw|8vj3hUo?a83 zZ8Kmmh(G||P1bC2*L!;QNbIY!*0!x(PxveK$sH5h+ty4EI4G!wi-k3$%mK6lcJ9)L z!nFG3O*_0-d&Rwf)jfva=2-MyuNU^9oALLbNdJ#ESwz3D-<6H|9;=kWe{Aq=SZlCS z;nSC|Z*FN+X*rucNEDxN45Yvg@rz~4MeZy}&JIS7W+<9UP&YNrs)+`hF?9*23V5VO zacw7=(ueIPsI<^V+Y_Icy!$m;s{ zU56MVd;A7|)@B{%>6zR4%_iRgi%d3cQWMk?XmPbF$cc7PM~*1?mp&ZokyW+98JHH0 zz+*Y3HGnG6*K#&&s>jS80ME%$D(AS+v%p+Tl$1}40gdY52r7h2_vV1U&c8l+SNZC( z3N?k+sFvo{P2R%MYK|)}lw%_wt4b5**T7wxL@Dd=i4g0SUlpiohQeA9!M42?R411Z#DQGCYMICzu_`7%&f;{1N`B2~!hT{eDf}$dv)E$xh&1zMDT`D1+MH!=RQw5!n6? 
z4NQdX5>cdJMEog+R8J$?G=GMpf{#G_N#y@W5AiiFMV-f!@n?O3(joxP zi|iMocpUtu)l^kZ@Yc?*nKOR&oSG`{lv=OB51Ra0Snv8FK_gZ9=gZYm0f$WfJbytP z``YxF{3esX$bSLzUG{siWeLx6mP@WhGA-*q(}ySWmtZT+w0xBOvdMqNf2}Ndy}$4b z)hV|}MSUSpsNk=d{8bq9GP@tWvy{g2?=XvzVXw66MJccIKbWw~__N9XhyM}No5}x# zmgRruZ-`~l`p9^bzX_)jj67_=MK8V0kBPxw!ly&AxM_tmZE~<~ORFnuu*l1y@1>I7 z!8|5WR?TgL;THtXwBP0Xz{B?ZhPeGp9jY&WoS!iHdvy3KsD{7KZzctq5|x0RaKZa^ z|3)>M=vzN9`QP~8F${1&Fy;c!rI@D&SfQG;>Fs}rb-VQf#XOt*pWJHl5BWy{agL!n zvUta_9~FoT$r<*H#|@WwwB7KEcgQc$rf>C+#Ewb&2yuKuj7t=JWOs;dUs+P3yF1Ac zTXs7##O`NY#iC*@TuWa6fIf>LSP#eB!M*yeZ;jd0tCvogQX_u%Qe0Ak0$&;YWALv2 zZNobu2PnzlM#>gUQ9ebass6O)j?S{){g=4J+{y4sv2AOVO&m@!4KvJD%mT5cBQ`@E zf86R7wPh~pJtAER38gr+n=2B7{~LSy9-wMiv{bn8+~BwkfZ_=K7`!R71ORdkb`BzT zu_e<=k#JQBTwWaq5}a z3TTec6x+H&LR6=}_fg{=pMIDkb~tz`EX6-j8qa{T^;`(PSdW=Y=Fgj8{|J>IBqC%^ z$=Qv#B7w+{h@U3Nr2ifXLAF8IvAqM>n;ji6F69KEJo72AlpQcNB@@2DyUFrh3%z?g zJ>A9bSjcz>^WBLDcLj?j*!ed6AsN{JwJ>z4ZQlxQ{8^|1=)I{+iwI&-gan z3YtEn(vQe&!t#F<34Y3af$E1nc??f5d*c{U@^1FC1u$A7uf07yeKPPAiqAC);2tN2rZz&!b*cF*Yh}Mw5$d1zW`Yx)&Sg%qlFK@ zskG(!-wcucBtT5Sx*45P1YFf#!G-6i|(9B>?TK zVF}X)f5jaz!{5Mu1dHR>kW#@MZ-Rl~QCwMJt@nHXw~|hgzBLNpnhl9)|9xzL<#F85 zV2%ojuusZjx8j|vp@hFCEVn3sZW{3sDmQ3-<7N8KQ0sxD2prVKatNvUG|PPnR`B@Q z&+_n-{}R6SY3Ai5iO%n0Fa|BOuz8lzFdj!u23sjYJeY+dsNss$Da<8`myJQjLnz~+ z_#3v54X0=L^HOdZ=`A>pz+bX@lD3bH#MLPLmF{I_2ia(Pm9p)YYZn_6&&u&6rHhS) z0sCH7VSSEOQs^#=kGei?9Qv2ugaNJUX&A6hz>WR~VMUAjPJ|V$5*AaV4Tr!V)B|z+ zk0E5;22uPDNUeVYQRppD{|V@oZ-wgib|@ZqfqL(Q`QigG5`9RvEeTggp$-2a#|T98 zpZS}490rxTIUle755|KAqfC;RS~B{ag6bL9U(K{mM#?K@>g0c>Ny141O{7`%f$a@CJHEw3XepU@Codqx7u#1J7+A1kQDyuQL9jHev z_{~&QupHx($V*9t$NFoEQejH^rX;Je2q^U?>f}J9yoEChT%a@P4M-G0son{uBFj*f z>K!~)U&c={U%A2F^^fo4NCLi7(VF%KMC@d;-gc1HQn#WTW=})E&4F3-TztU1=`<9( z*!&q*0uoU7JSy$9Grw2wp=yq=F{T7;`yAJCeM(SS?v zyHqI0$+jRUNWK5IaD#%qxs zxqkp@w8X8$g+EiGbVU?Ppafu;aVel=3OpfXVxAp>nRW(#XJd|Cgju!;GwT-g+E$R! 
zk3cRvK_*XQ9(oDn{5}ZzGxkR)RQ|#X*;{-xdz+7C@9?QGg>FS(zeeOupI{X`K25h*V4=dgq(0GK9Br{t(3O@+P4F3YTtd>Q7 zKqCXV88sQ2ya6WLWHg%sZ1E?OS~Gz@{uKFvT2a0vS5?GG6rqw4xvFF*{p31>xlW^= z&_8}=u+KrY2Kxdt>$hz1DY9Gs@_ULF{~z4MISiQHmWc-Y>J%G+BfV!0e-EG%qEhg7*sjsmFj*--1L%P7d_TW9VT5x8178doy6xl&Xr*NjB=*M7%e*-G! zg@MZV(2Y@`1}Fk>rn5g%{|{i+(I!8QCZ`**ZL#Egb!(ZUInLK86I zz44TSCTzkabUDV5FSLFuzZ2hJMg3NOj#{5hu5%k$IXlWAYeULMjGK@U@1s>kso(~O zrPrYpruE#;5%SKTz6j;gKOn0%?Pr^xVigD37F6Jhr&t-4PhKYc05x_I=Fg2l^ANaB z1ElCG{MMjhM}yE9+AqP0kOh~vosQr`0% zLKAN;(Gzxjd?Ze|A47`!UymCk1}}5LRBIfZwpO?jN~KEhjs?dbdJYTib5WxBevkAT z?1ga&AgbN86r72 z4dMz2vL3|OWDlA$O)M*S_Yq_SD4NQnl7tzh1iD^GoAF_I-#Q3}J>rw{s2Spem)td; zgxbpKXDF*g@ke@zC%eZ-_pXo15mj^a0tXN(kP8!X7_!uoxHD4=fQ3C#FqaDAekTTT zze!`yOfQ9`QgVAq6MVIZ>^~VokoS-UO->Gi3wL-yyA_JcOlHfNH z4>8zgwF#b8%=?J?A@KvrtJz#}VomfwyB|H(A+}G_M~RJB>+?l* z2Yj53U!Y}*qb1Q^VfmGc$eg%ybzGt(Y7SKPot|Wg=^ezppnsaoQZWt1QPX0ch@*DK zFKt=SvUr*)u1&#Bw}nO5Ru z%G80y(So(sQ1f>vzrSGSeM>yKJ1%Kt9&jDBkpjK+^0wa^E%f$rynRBx?US$zyodhr zzx_5=H7o_J4j>)JAyfd7iE;=R9Ou%X11#7=xAsFtQ%5i!$00i1FJZ#Z({<1ASqs5u z-Qn;WHuhv?gL3i$Bm2`v7DSfTHD0!K-V`!m-$QH^7YaIzkdd_cV zN1u;~_IDHum_9w8Y{sJDHvc=80KfTs?6+t|csN8BU?GuU41CSw22vhm z9{I)UyAnad#4;%omdigvtkK|5>_WW1;7KygWOpUTc zFdq%E3-CwPn2LPLk&o(|@O+Zbl@GGP%f%8PEk6vE(U7epH}5cuLr8;2BzOx;J%1P2|U&W+?O5GNfVecJMOiz{*lbZ;D@Ro{}}Ku z`kO>{OvUzD>zmp)!mL|7F*ZK69Eclu6@Z~2O$mVW08`@t`3<7W9JuEoKLYtcRT-=h zxF)RE${UCB`(6k@6i*eWuaAirnX!&Z&iFv)kcZBVNfQfJ=|v-h_d+h?v2cgndi1zt za~Oz2;aAk-LvrR(l-|=Sp`3Vh6w4(%wZ{|?40N2~x1;q;za718W5!ks^*P){7m%77^a+gyF8hG6N?B;@B_1@NJuq7>!c| z|CL>aK}pPJUUUN)tGVzOM}>k2XhA?laL0|aauIv0E5jFyQ#HH)0&B=GkvK3s+!%QT zbA!D&_NMBk8jv_;LzWCQaq5;5%pEvlbNe{%({{kmY4cynywo}bsG`h^q`(2^83|a0 z(qOUyl1~CG_2_srPvgH3-;FnuW+{jySr2%zN|CV&jRzXYG<-FTIR)?}kw-4_w1Wh5 z0f&oltFbcSgS&vMVvTlDS*;JPMKhB!M(IH2wG(isJ%ZVtOav^69I!ITQ;4`nxG+zT zqC=@cpm7J;<3p}878bx_%jJ53_}%1ahsb!_lPZdGq6|TJ?s%fah|-jSW&qLc9eQR8 z$pQw@+&$CsdGMwq0u*v~&NL$&g)NG2h)>zkXF?hfaZd#zZv!c0+LwDO_Y7=mqh 
z%`jlK^vN-}uo$EBywTKf_oTT}Mc+kk>Fg%;2cfBxdSghl4cX3epBAeET(gZvXiw^kyG*T!av|1OD+5vNO{GegWD9Ih9W z<=LQa#{8iXE2jw%Ix%IT5libGEeB3W>J;Evtp_yZ6!Q2n4B^MY2JV(aPvKLY!J$Xn zI6_i~4QL-Er^;j~2cy{<&i14Rt%kt%9AGeemW|L^;*@A}Z3opa92oT<;uJaa$ysBW&2$h*M@um?u=yD{5_X+GZ4d?x{t zhdR0g|7D`1r{FUtIx}@_8{Va;j%_FHKu&D}bI{8hBYD)vmEDlXfK3mOy2Lw%I>ePE z7a4q^fhu)4=uw>FR`F4jxjjmAtMJAY3ntF8I zS+5jbwTW~od@_N2qtu+$FMoQ|pP=Mu{a{+|IctMC5sV6-BBx0%$uu%w7Z~r!LP=oi zN-~Y0y)MjsVj-0c{6eNdvUMP>28ykO6*e5P^NYH-njwy-0-h#G)kV|o+ zT5!Z=JRQoE0DE);vo=;EG-`&Gb(dHk`OP139hE3+5N88X*5pE8J%70_*F5DYu)%6f z4q6qL>U5G$2;1A}c+uxYP|rNPcGVO@vmt0Wi6=iw(oil^rO*x(Es$seN1cTBV`?qJ zb^9PSjqWFh&`%CyzJ3Hfq6Ww9MfFg-pr#j%ny38$kQw6|i!qZ1Nhl9eQ6EtaN10BC zDrdjald)HN$~D5P6e($;b4y$RAiGDO?*mN{6d>FA6f{*n)#T8A2s-3vJD*|lne;o$ z21wdl53TlsOrge3%PD3*KKsUojk)heLG*{A)b12-D&W3 z^zJS?UrCbdZWmktFoTyKU>jEmkM5)A8!2s@$hDc$wuM|<>2g2l>D?e9on-Lsbo9~= z=<6vl575g80hBTM!xZ~;H<;lcHTYxn{9kn0Mc#J10hrkf@H>B;Zow#D<%WUK)gB(^ z`#n4q<_8S^gvk%O+4cNMlRxDKS^Ob48^aIdY)TF$@jLht^8B>PpE3EflHYqw{*uXGHu)N{J$pu#?9yPZ%zK4$-g)G4^%uq8vG|L$!_DPO#ZVQztau= zi<{lSC0%}{%Wo9t?*{+F!zBNcE`QPE(+2t_!)kdT>sFe+5mf*;BK~sTJ9Dh z+-$S38NyB(ptdX=Zs8OLJ}yjZ0biRU(?vz|7Rpe#$#no1;W6kGtVe`p2(KZcWYBRkI~5Ok5f2HxD#F%)>Z3G$n_ z92-6L;L72f-gbP*|4(4YmvfCQjsTGh-*>c%z;v;r@&620Oen#Iu;Q%$C0z0E?>oNZ z4e-JV@u=1Fs;|TUCRTXSMXU7~vN3v2t&RJHGVGr9+?)AMvpxI`Pu5LOs~e(Ah&sHLal4q~eW{4}ap=UJtY@d>m)j)_`^w zugzSMEQ6mp(w&Cz6Od578T*-|DFy&}!)K1}^+CC<;x1T7wSe4rqH!*Th=$03j50(3 z{V@E~=MEd^H_8)n-i)<}hcuGux7Zyvf^FBFjF~_K^!Y^(v%1Q0lZmr!=xl|>!l5Wu zrkwVr<7Ef`g|kP`jgk(Z%9TSOi73@CG+93MmE(CnSB~2imn1je<4CsCUJLMMVj_RDEMEa9KVT+=q$PzqQnwi1#)hP?w05wF4dq-frR%KrIxsi z`xIzXPW{=@IbHMtRjER1ioTZUC;D4rfEZ|rL1M5ahKN%6^v{kFD88ZMa`X#JT!Czh zVPd#CgCkt@i^Jee4UubzGE0mQBQ4%jz?j#F;5jWZN{mK7u*8*Oj2!c;qqT$D)>wJR zuZ|80ablb$##5Rm;A=dN$6Gv|!$e!5Iz_p_4hm46Vv-nRiOGlnsLsiDF-5c^C}u_@ zF;(CgYBBI~T)Ous7h1}H*vIn7b5@K@?*NvY0be*gZYl~G2Z$a+<*9g2HVR?}YF^Bw z%PcXN&bDb}%@)HgF^4X5g*-Ce%0Qp3u*7_El_eI4g_c+(u2xk@0_?m#Dvyk_nz_Vc 
zfkSN!ac!NF2Sv9;gbcCN63YZAQ;xAi^3C5J+oQyCah)Zu7dKd9g}6}xIJ^AmXaUgb zk@bP5^7gyk(XzDA>yfv=ZaTuoX~#HzAs}b;xj;_fz$L$)=g2u{aW3$iFI!#ZYp>b7 za!wEjTCdM_-W#|Gp3|eY+zhem|L%!@7lCm`UF`fe`OjfT7uhY*(Pv$ebFw@+(XuzG z?TCiB>HitD^JU?5$r*oJv2H*?E5}Wp1z@Mlm>X3n2aL3(%8x&WrX~G8(Mt|Ya%9QR z-Y`<-v#YE^IdGntE?su#`^hFugyt=R!3YFq!y|0MC+*IaoL^9PV<+8>JW|1X$lXro z2)W5{0@W#Q{y#%?ZU=~oXfBBT{vfVT;*M6p#M90dj!-`%WaVKjqmEPn;POlb&@Eh{ z2+w=_JiR4UA;d4P6d%#-9>z);VPB%RI3Qvq)UhmdtO9bGlYEN!Bt_?8{K4mO{6JJk zUet@?b&BGRNEDG6AlxDziIF4nt+&V{82d*64+aatS}*C!i=7<-s`7pE^MUrG6BUUO z16d@-dO827CNc<-^WUDVd5b~s{I{Rhyv5KGc^j%1_2-lj5c87&h`c6!Nu8R(C0DuAq0>Bz=YPp->E851s%dGl=S!b!3Z0Pt6mlSh({Q~ zA}{JC_Eb$?FtnZj_LrKs7^%;X^mjbcT=Y+p-l*Q^{=yp!>E{PIgN3|ij@MbJ#xjgd z`0$U0djwF=xgGisg&m<-HF|^-SM@Py7|8cxrA;`<1dt#$Yzf`64`64J*njqcrVg0Z_8}zML;{-GE zu6MTJKS^F|^Q zG*i-%@erL$h0Or2z)u>Tnz>(tVD(-0e%1tE$vmol(iB!T6A$QUqGq|+WX=h*+yg8R zz}1~NkSWaaQ3wUJgV!7o)_trHuENLZRHIfM@umT8HKg9B5il`1sRp>3pnZ~URuN^!soA^YOOtPjzbX5a{zzEo{CRgfBAGAagc9wabx>JgFgAISZ=^lDiVY#) z>QDezFGo|oBFu&zqR^nu>Q9JG96LkdgZ5pB*RC>1bPni)-$3p5Ewp9dL)ZHQ^uE7> zc;`24IsUG4(MBgEk_#k>Cvbx4lPd@pZQk?&{ahpDwl}nKNpRVDGTze`vGz|a9NIr= zkhT95?LUM7DF{F$9cYthup`4$BmEhBMm(Ko=rZA*kPs@9^{UMsz{Wg|00Nx147oz) zRH?K&$OZuss#^ijtkepqWZ(*nq{~ns1~1obf(j__KnN#xv;M%Xjwaw|2^?3F)KwtY zDmK=!jQ|IAagIaSAds?-z<9b$sEcZ6^;6VzbjfN0S4cNVsTfG8WXPsI0L)E7znhI3 zzMFd^g&T6<42BlGF$M)YU}wE}CMQZRWV5!!*Yj*u;vQ&$noxv^8Q>ae`fY9aCJ+i}WVqfP|Yz*+j zlc+&Wh71f$L5rFSB(WELa2kOv(V+peZDrrg2Id6-vz@t~`3eFv4zO9Mjh$>(USM|3 z!`U?tQ89G|2~@4B93b2027u*s09iscQiD>Cd_bd=b$wRA7on6Z;Z?_}_A(Soc|JZ% zl(JW$vn&AqGiV2Z6ZebsDM?hhyb(r9IsG4<#gpp{L61*-;; zL9R%Pa0LzEm0|91Z6rrPD;tDXHUdiyz(fM=N28UO1`V__1u;&e@XI1#<+|K2M^*^; z>j-2m5QxbQQDeq0dL#bkbfR8dS6D75pR&*{klwjzRju1jT29Z?nCh-606SJ%wd z@lR-R76ex&PXJh?5xWFv$Zk9d$73bqWUCZ_M$!;J0pwimw#cvE(SH)F5x9QHcT z3$Y(~K0C#m<8Y@IK#msj4BnEr=56^PUIdzrcKk+gT-?q(@yB>){xsI?U-It!SKfpF zfu*F4_YzHbZ`GH!Lh-)}+FfS+05p4v$m# zXaVL&1~``QM=}lq0&_c?$Xfsf>*8NRPhZGeV&C8g?&Pg_8;I+7d726gVFX2k9-pDi 
zK$45l6ElS!i6*(22mNb%#B4)eN2)%o-lScm;O^Ud=lbyqb4Di^Jr27xg@+njsMe zFTv^Lk*G-O(uNi(MQ|mPf>L~*1n!2&oCf`0&zvxbQqx_jTQ2NRsO9$0JIYccKEQ#k zUxSEzm2Now^8+`b!%>sV58R9vSJVFT1GnJL(y!pU^(f0Y$ZjK9bZeO1UTn_~tR{)O zre=!a#RKdMs+j7=vQ{L;K#y(W0GlR&u=nJ{a5!5TI?xDUmPWEUumQT3k7hUXD!s|~escLVQ<+QaG$O7&i74T3dcm7pGn#X&s@3}2|Ih%Z#5 zmcTw{I}64+D(}vJGmz0};oBRpp>T>6@EkOz5aHQ3pq#P}vO6f#y~AuR2`$V;?kqUU zTtNp)a~;WoyXe3<5(w&uHzl6h(s6D|AV_UuaW}_|JQ+q6kdW)uKt?V+#gJCTHtei^ z{vf*tLi}EeWDmQq3i6MPFsj(5gKRS-^cK9_8g$wrJMYKbZd+Xmt640cgT6Hv{b(LG zj4OcKYO5V3Xh#1G_U1o zC)cTr>Djpt{dyFF+4*QRD>XYubQ*-Y0(JS|QRX4jt_NXG6&XDHHkcgK{8Nv04{c!4 zxXFH=x$^=KSFuNi;jZ`dEC(TK`<)Gg#-la83-2SHZy)Q83VTd(Z|0E;19%nt*I~A+ zuKsinxgCOTHDu#j)ZaQ3BrF?SbBXg5?*_#0K9C-5WV85YHlJ@{3;6viuX?7jB2r-q zn81Mj2)U&>9+iWBG7e8@66axd-x<~tWE1%&dgFwesfNM?;5~E4uo^E zsVJ!&7!Uvgq_hyLd?KtM84&dr?5nn`ZSzj}#r8fn(YIq^V*)q19ri9J+l=1>kk2@U ziG5oVgq=h=C;X_y=2%~#?>DdlQHXWoC1NG3a=JuM)>`z!^E2!oQCSl7oqeSzPZTRoRe$klkJ=i_2^JXXIn$L{^;(k(ym>7oC(|{K>yRz5=Vz$>+4)0QZmsX zvans2&EhfGrE*ZAVX$ihcQMbC^9DGl@)q*L0nUy#5QRx!Z+n)4@^8vsT!bAqcEBx< z&v0hRUFA?Q`QM5+W&d4HSNP_E&I6{al*M4-kquMjer3*1jsa+G1Lf^y&QRP?H2=%7 z$vqquQii@Z0vN)PGINA;y1aL?(~=uTI7uaubPU5%8Q%lTP zdFTpfS$N|A$PjAVCPKrviq zp}lsblgV2*TCWd**__qnp6cGxdPzg1Z2z(8Jz0+_f zSUT=A>fJX+L28a>{XQG-HUD2T<)2Q%)Q5*qA4Mi{C8pCRbaehGeye5fYkYwL;jTA<*}VBj2k;3?%UV z`g;U=qBEjxbsyCVi*^_)+N1eiVyX`Vek4;?1cITOulQ@f;)g$dD1|;A#epxPlq27x zgFYx#E`q>g@#`Q7sB@=j5zTdOGGa3t z##X9gQ%ikhU!#DiAiAUINThjyU87>smC_T>=!g!)FLz&yGaqZuzQ8Jw%3F^s+Mt<5 z0`*29I=MsF-!qX}SrIfXuEWILh`CfhTs*Wcv#n>TiZPnbEQuxXR{{R^GfOR%zK!rM z(^u6JHN<8>gw6t3W;UA3JT$2aG^$GEz5?lAsFFlYAQ_06KF9|x=!+t`G~uf+_?SVF z#B!)pAXfqCw$L+>#6j>GWTF~Dfw~(EI#Ppdj)tO4PD4PczQcHc%2{|3FO9-SZJh#M zDW^DRW=7E(>?q5D)g4AW-&ok+sRa{UU_VmNWYek6cB%lEs4j9G->36eUq_DKgp_;> z-RB*o=UsGz_mJlIk)w~$aQ=gu{us6VCT1p|p;$jh9=<@Tzk<~K8bi*1aZt*)D1+~y zrTZS72S31G^GB@hz5>_4?@-kJ0rR`l(3by=Nz3UFl;HoUVj}y@6qM>P6q6JBqIRml zXtt6Kkz9sE4aE~GL<3`5AmZwWLLPxzygAW)n%uOQbzxUe?`-P35iC05TP zFv=o(0~6}`Mq}W#tI}@CtZIDMX|S>QUsg_YHZ>rdsM*M^)0~Y%{vu~`oMUX9a~pH+ 
zWT^#-agOrV{!2GAFCC~JU2LZ#%N^h;eNiikG_*|_*<&7H7YVTpoS=_o^Jpx# z0ysx12E&@gO0fQ;7EI^e1|g{?sJu8K2g>1s6bYeEOoWsEbyu0D{?Q!`UyG6MD$_)k zS)vL|=L&WMXDLsUF<_^{_8=B^13&;m5J&n$eQbW9VeibT*Qr`Nbi?)cfV7wz361$W z{==3BDGy@K3oh?V3nYZ5kz`OGo&LZ3(a1MZS2NhmNZ?kg8v)Mmc#6fALD?Lql+E$b zu_QpDyjNowxe~}EvWjV8W{U^|9N>rK3N{BFi;Rji?h{UwS|aP}w9c8&grA$?yuz;L zEU9yyKY5iaI^?`BTDf+wc5ZMhSO0sQPsJ)%)GlWi9b(|)&OJJW<7sDKOHCSX8vd_Y zH%2pdW3(JJoyW?;kTY8Pk2_<+@>QqQv9>(voMltK6Xt!q764v=c;%X(%`7?gq%$h~ z@n_BgJ$dqd>y+B{;Lpx>`lFxy?u>C!LgZi7&ezGY?^WY3{q7})FIkwQ~ABrQ+Ps;_> zkC!e{ZfxoM$rVkPfWc$jJeI?k!Q!im)5A1;w!H{b3WSVMPCI|X5il@>g&E)Cy zn?aYxgpbuwA_hF_2lJ*dH(UFCl=_EwW$F>!OYhduebQcp`1tm!Z`x@U)>}e12od$ov;tzr_4QA!{ zS^Q!Ch+NiAbd;C(G%m?_ls~5E(*Ov`PbU-}&_9nVDmA``Lhq%SbB-+8X?LaI_Hm2d z&$bcB=p#BW5hpRe#okstYVx0+#&l2E;`{jl3iyPq>}8ZXH(LB4e-faXm}GkOlm!}+ zHHPLYI}G43e_EZfD1Yu{w6;CN_sC|K8eIYpfw~MtKp=cO#12~+f?tq}E;ZK0J;$H4 z2wIA=CC;)(EdC;YiNGNVcK$M4AXF-aO)Wu_u!r}OqArtn^)}|GKg3^Cgk=VQ9kV6H zD5f~Cu=%g(%jCGfjaVCMMy(W@;|OtZnX!*8{x+4_JK#4XHZ#RzMjU1z82lp(qVP~t!;X7R6imE6+X**u51!M?G;4E7zB&-WHsuzs@m zDGpoyU+D3#2LH|CzwHC%$SGxS&RRpP^a{Tr#X(zJhk6!>;b(cC^Qkf5OQrFW7#la6Lw2D*e8~7 zBHQ2;*kK8iZe4V7(=!jbEOL425=9puJ@b<*np^>L#gHo&%o(hMZ26-zSFSEJeBo_< zjZ?13YSE?6IkkaxR?gk$To{fXW(*Xe#mdB%>>bcXFq|gRf)1naop1}#bp)uz!yTY@ zu71(z)h`-$t-p=%i|&EvvD89qpKnDtv|Z&Nr+U^tC+h?>HJTlH*fMMZKa5w{%N3-| z!pO z3;1CmilBi&3>-L}Fct^cgjqZ>qa>8&C4=EfZDdn$`~@R4O!_WqKTFozC@Hk_2-cOl zcvBHwAnhPa2fa#|W$b5-@gNd6eLI@34zo;BHU_frW>^p>AnFul;})MvIshdU6gweU zM#kBmVomq6X5?`X%dz9G%ocRw1vc<;M)}u92a-)HO#; z$aM?+1h5(0Amc5|3c{?_KGvF+q9Hqb6h9*v8Otre3~e-)BnvU0UW6x$0b#iYldvV2 zi-j=zz7FQDE1+pw1wM$I&{}SW2I@Af1a8NU@M`RxLhZoT^2Y2=>?f>)rt)q+4oQ0q zG1KfAm5L<<%{9qItqS8p2u^BPn2f@;8>k#CJmI<*iv>GA^8u(j?!z?tIb4Y#%n2o! 
zUM}i)8}V*Go`6=r8Y#q@>wv+YKspW@1o${Zqz+F(=Vf5gz)7&_b_S~z4JHT?Ax`Tm z)<(~PsVwwzd9zBlUYDAYbaSjho<>p$bWk_{MkO>1{9wJnj!0nLp`O8apG9s|YsdUr z9=*vK2+?PisukhLKNXo?sXJ?m49TJnMyWoE&3GN2_&#Vg;mqY#tZl~8+bfMO%lZxX}fT}sw7!cRLEWt4ypI^;%`QS!gB z*8f&yS^KBKfwCM$2hYNA>nVfsn-TaX5bv3bq&DIoj3UgZSckw-fEdV4LC*X3_;bFE$Pcigf;xasK}i99>~h)+ z4=Sg6SxVlnplV`iAt)pxp&XPRu4cCYm2o#c8lf4%Nn} zdFqdC#K&}W)W&oq-_%C375zlV@h#$@*0z|rtC3>Z=%KxRXRz<7r(x?IjgQ!!3hDTi z$eU2zD}()DK<&kOHGQPk$9WzE=YO#%Y^*guL!Wr#EFG>zLH}Y&wB4rr@=vHI%xaNo z8ZthN^ySO;u;ICpHd5A)_Xo`uDC*FxAa}Pynm*5R^Ps{!z(%N6RK)<{83mw8y=VQE zUh`PJhM~`>Nx~{T8QWYb*rH5Dd}$EADG(KzwS7GRsrC!RJ4q%|0)B#DsJ<_YR1S_5 zLkrr09QaW2RIys9(Lku*{f2j1fZ6_G0Yz%|BX2>p7D53W)$l0pL36>L4qvv1jmA{u z%H|K@JGdW47yOalvkm?(!{17LA3~Z&YsN7Tsr_qv5O^F2cL3tXVqbV%n2qnhlM}EG zPbQG)$H-j{aytd|s=b|bkWHpwYFr2s#I7~|M3_zKzmxq8CqhrHA38+)&g4zShTzjE zIKfk}MNH4bY&wGOhZ8|()Onq#WCi)C2|2SaBq0kCauz}^(IID3CW?)NY)%pdgd1h zTn$HLK3cDlOhX@Goc5zzV(+>Yh+f(Pn^hEoUZ5Q`5bfD&-hpl79ocT)i9OFdgJ`)c zkbT|Qx4;Q-fKTlFQXUQENfNg6Q+a=$#|Q9YJ`gaKA^bAzPLG17WDE{`3GtDt#Sv56 zBpw8n-D$MAZ5V(4hRXrTvIVt$8hhE%5K$pQxf(nVs^V(MC=*D5wh$6K)IhMTX5i(^ zRC`40fs2M<@Pq)GLab}Ljbg^6Lqf*U5JJ9o*0xqx$GA?HToO#pIK$EmNI=y?Aq`B9 z(-6r37hM$n1VRS?zKUUs_p@vE5(-X@K6}}4^x8e_TE|nYgp`*{l=70eGDBKUtJh2a z=hw?>UN1iirE0hr2@Ia2tNP83XBwNd+V#Z+#!{P7yY^jd+^k*iFELhWt>~l` zM!r_cuG?UwHB%{Rx4$s{U8J7LWOJ{jT>Wy)YvYycSQpc#)v=SG z?t0}gi?LyrR^BSDY1H4QC0}gmiwb9TH_LS3r31{SI;js0H3z6|@bH<-&FAb&qq<}2 z;_a>m- zg|&Vyx%umOr+m2x>3ro2Gpx&E#n0wOow}w_Z|KzZak)n8@_9JQHHoWIdGhI~B@dy~ zJmi$8K8#kHagYjRV>^A$H#M=2@Vg$?xiS-6&2UI{@SV> zUNRJ%Dg7qmJdOz!msd`mk=62C<961Zw)N9-a2F6WER(nevM|#zSktvz{_G{wfAIw5 zkC-#c%|VBebzWpx2U)OUfoi7`XRFMf1>CC~R1HdW+Xoyw1ZQ3Oa_o;r+}e4g;_l)O z7au73Syu=3Os=`f%#+=Q`M|r<*k$=hKcpieW8t;(NTDs$UH44lRK^UMe%1xb0mLYs zDJbowfBJk@Y`s*;BgyWR_+}9TJ4iPRvp0;=`B*K;6GNR%TVaF%i=ne0~<9*Uw-tQt08bnOk}(h1RV z(Xp$P5Sq=zLqyUpdL!HkjIv4Um#9N%O-jTeQE(BH6WYsF?J_u92R{t1jb*-GR9g6Qm zzBJn3oEOS1!@;@|gW9K7N)Qfebta~Pp!KuqHb)I!tD;(?oz>`rOw7XlxT6*nOYIiz 
zHyW)^$8=8nwF58LFsj0OQQ_Fbnz&qOy=2^A#}+oV-z53TP+v?DHEwFMdQL=~;@PUl z8CdowHf*v<{rVg^f-ZbSF1r!1Dw5)%hE1048?@RSYPD%Dd?TGM8FXn(mrNyvp$t+y z1(>9OF4bO%2(W8{W5z0`RxZNBUYIV{kZ{(cQ#G2EmiFqk7VtOuOAkkj3(Gf3jht&U zW8|WLT=CPQQH?d$#5M7Q9TQ*h)xGIHd;Tm~F1m-zo1Bg#Mh;|(qBC$+ za|P*^1sP-!gOP>qD7nqZh-nat#ugdf1@uxRTpJc0kOi7B#*P{w!Wh!X6PXwNB5L)E z*zrrapgP{ArKnC4If+W44qGijI&@oZHcp$(O(2~*%>gy_St7o{0Aqo{)<8r(2+<6N zlpKOqF%+Mvfkh?a8L5k@aZpldE|{niL#;SbC5L2rB4rnm5md5~Y9+vrYaPn z3pd!ewXucdT4XS6)f$8$yhd@t5<}s9x`9$+DK^#FeA*yc?tup9dJjmNSJ34~gRS(S zQw@{R~l3#4u7UB#iw%TNa*BI;$@(uEvJJ~un5EOS&oM2SB2Ai2| zy}>rPk-K~7axY!(14$y==t15$fh~n?G1*o(=?6@<-DEoqwo{(!;b=r-udheX{zFQK zmW?UxF{b-~9&*buM=;OVz3=5ia9UvGVNZ~&m-ff~fPrI%T;2^Q<@D@6bim-UG5x#t zlY_FN8YLyFGYR|k96GqP`;aj>NRf^k>`Cu?(y&F`7jwA7UsA7RX#~V*RWJ_(9~0-* zdqG{24wdVMBL*jon=u1>#5kA@XV%T0w@9Ww?8)@T*3dKzoHl_vgT|-uXpx%Z0%h|=i*@m_+*%fi}pr@lnz@vFKqsUG1ME8 zSdnO36vXzDtL^T%OhXT;I07$r#_S2|q{*SP>%IlXyGprhmXXG%$lXP@?ArNdkl#Xw z8z#zhr#DK@De^n+YUhi&YZNzXy(GwVS6mLb+tx__nZKDV+~sx4pXQlvx%y7bZs)9Y z`D&+z369Pv+neQ@7vq{LHje+rxnu&LyWG3h)pi8N5q}2LMTqJS$ zRZMO6YiHno_qYiYCRS8rVJ?bcH*%m}r;EsNpqDHxg=t=N&Vg9ah=a7Cm_1@)>e;?- zP78X>V)m%P9c_ipy0>ZWjNnP zZ?wZM{YiP;ZVS?JfgyCR1U^Ft3(tka5wnvM!ZBQUy)N!!cbYBPVCguUl`pel^tPE8Vt|_oW?`N^c-Ofp0}_i^WwQPxPjHKcxkg4 zY2^*GNcTAXi%zb4g)Ya~s}$2~7R=@k8|;L|UT1Hhmt^WoNOzk7sTJs0VOCweQ8a;4FmN~sJ{313{j*%Dh zY>tkmcGpaBFZRVozZ=_~9-xP@_e2(wQT_8_FbfKt1I&+ICv}i~H0^voPkU_v+U$(c zNP$>#$B|a}Ad5#x02?iBq0|Tgf$iPHf(KY4?FISi2HUB)*}z&HWXWZL6sZ4FNx7J$ z)PF&!|HyVT4Q!@mAs~XGw@b(7|Dm9vqG?Rq19AIV=3$lvvq?Nebmu&DV_Pe`jBGcX z(4{F|ng!jsz|!&n%T>FCZDBgC0Md5o{z>siE7fdFxSC*1P{7i$+Um|)gfK5G#9XWh zTTAWOQq4@Iip^)5uTDpvH8z%- zb#EjJnPsVge5i=g24IQRWIv_Dv{FbZidYE*YM;#;79N&a&B{$3XJPl97FOA0gxI3v(vFLP@BF(pXv zFVos21`E`9%pC!Pho+eFTA@fnTcKLA(S};0-u3dqL>bUO6LU@jWrj%-=B0z7VcvtO z>E3fzm;%t)!V#{nO5s;I!u6+Cj?SLoD)cHv@)+BEzO)=Bi&R?gk-%8{D~!Q!at0f#TVY_aQEt^e%3(iN+9tz*`? 
zxYjsM*zOvtpFOe1wMqLMb;z|<>o)mO*J7=s4881Hsa@_9uGQLg-`lP;+TXn&x!hX4 z{opCrG_4SiKjZpR`>W>eZ63;`+~IZStK7cjb-$)n#ykD)yL5n(B=?6}Z=ROse!Y01?8X;#P<0Z+^Dtsc)hos8^&=N;{; zZsd7UyIxN7tZSqy_?|)2UC*FNKEugy3*hI|@!s(5?L8msug@*^^wy>HWJgbAccDv( z=cGdg?ou|hdLtytM!VhvnDbA3wEV7@Cq{1mC*FD2^}-BS^zv-hdiMA}p2u`yJwMQ6 znyT5Th8npvl3PCVB+0*rdvtcqGS3%LD!V^T_LS?kGHjve35^Sny~&fQ>$`ZPXQi%~ z=XQ9eq^J;u&wBpSVe{YcJgUF{;X9ribXLmW_vGtpUi*<}zAkXjSDu&EcjbGhJO#=K zZ`k*pWjN|*&vqM5j=sF9FHOGjnBA?=uD^SRs$#ey;q_J!HV`;uYZrUh z63Z@s>TH>E)6Y0!XJ>PNRLJIY!_@H>gWqcM+b{|f@YhL)XRmR?v=E42Cn0{Fe4RlA zs&0-or;|XxP6GWp`Fa~OwxpA516}R`q8BIT5S^-%Z=}m6H{Z;+&}A##-cOfpblDF0 zBMj4%h)Fexm{gsLN!1`G)pUM~$seKLN8PZ9|CdQTs_EcS#o(yYw}-s%rOV?6-{*!U zxk^`+i~YjG^mso#gPEa|KS8d89y<7!Ui_Obhv;&cE}GAnC!Zq8Ff1b%H|`Vnn;>cgT3zr0X+lXESJtCASl1!0|S{5lD zk%;dj_Eb}(p}s}BDKbm}^>%NOX^1RSWFv2)30<1f!)9(`J)}z>JrHxy(6-92*PPC`nOFSX|q%goc=wc6w4ivs4T8ijIx1Grc>C1i zPF!w^E6DpWx(qi(8HE@@xf+S4Ax62yXmKT4DVYBT1$k`vmeZaM0v$iKt$(--o%L9J zoZQ{UR~$Zi7D6B?aq29r_R1#?!4hs(`M7zDy3Uw7c@`}XD&&?V|KM=DYEKIhTw{pw zhL~W8azjis#5hAtLZ9cn1F>kv$vKrge%&0ajV#%L&8Ll@I-K&dB7aMHa*-c4v_<}F z!%qrp2X^CezR#M$R!n%O-HPF@UNFRDLrgKmRO$?38W;x+F~blu(XFw8gOh-h2$rr;-Y#_t*pTD;ia>nnW^j$zR795$9b_VH?42@thFa*_mb;e z*5~|nxqD_zEBOePVJRsvGAO-RY2olOtJrCQ1%oBXXq@#d59dd@T-*|~=`{5><>oH_ ze6SX5?A2@l-|3<;)%;Q>|Oc8x1J&&1(hE{tOEg}|DG;CP-ehcW&u9*si6oP`88-fmbd|5P{fV9XJc3K6`v6!R*F>?7z)Ak zc(b_061R%m2nB20G?uUk=U=2jGgbf%DBRfrHFj^Zv4HXMblqZr4bEfv*re zq*E<%hggU9WQn^tI3lIE+Y;-=8sc6`w!{W{c8|E)5cgW*KCw|&M_COdW zo8m1?yp8ha%c*d#pdxw467P!lEb+ehz!D!)HGG6(75|~j$6~7?PFmsoWoA%r%#QYjEWVh7Aap(lr|4H=qb0r;|Fy(7{02*Wi_Rs!8ELay)a_Xery-DbfK69^mRnXux01idSYg@yzPKE@iCAL23J5ZX_KTKTK0KkbooO)!d8B>`Nlf>U8|oLLL;XTf;8!T% zExjkGC;dV$sORX{>YfmQ^c)gG{X&f4dtZl?r~l#w#E^Oc;h=sYAJi{I1bI~cPQf1p zi*iE*s$WP!^$S6W-vCJybq|@S?g3bH@Qq z5D(_G1ZYU&b767OC^w&Y4-1I*@E}VfLkeuBxyVEVnw6A&EETTQ=jkBbG-#*Np#jPW zz%(Qiu=Onb1hR3#%0A4R9?Ffw7n&(jfSlYo=RR0^9AtS2fln0VhJw~nW(KXD40JWk z_c1!A7a(LP~$V0tXd z4SM&ncC=)-s}i~k%W(#z8x 
z(S&xOz3c>J`9ak7Ls-i_3=8TvT-1U|Wt6SjFlrPoYQ1O+a8Y|kv~W=iHkFH-EZPH}QKN+_0xoK{q!A2d zs}>6#0UlD@^m0;*Dd2~Sgw7V$0Zw6_?8Ccq=BP&2qYMthsc8K_K$C_tt~Ge9`aj0t zaj>vuY_h=;PuVTej`WRW8id^$o96_;? zfyJ}+J`i8~fq0%m9Zi)F2up!i0Bkf(f5}{pBJ`s%zIsp}&E5z{vG)ZAqB9IlLaYF; zE>*0gigmSh2DbxxM>q7T?qyJj^+3{M!wMV=1C?I;6esAV#f})(`yeYNh%9>MWn@7B z1A>5WKkHXakbaE6>&1I;4B*h5_9n$Fc?ED?anB8+J~enfdlf!H5id~L)_Q@tnesA( zco>FMv7t(DTi4M+u$W0v;8iA_xt$sDSJUI^-LMU;aed)1yk3}yV4zTgLk;|uT_8I6 z8?k{HmgWM&y%G8QKYdr}(0H>zHG<3lj{2^bF1%QTkTjsQ06!YR>|0>O*-=w* zz^z}6{AvS z^A5(}4={Xu488p)7)U>Z5c#+M)JJ@`@qpgH@wsq%SY^A)!HiUPz zUBi3WuH%>5ZsWbNMk}@LcwgH`yq_)fIqYK2@B#LCKFFTI2irUFVfG$; zxEBOeg_|8KgP$>$iw}ZzHDHJc{;`$lW)VAoB@}IKY|H;W4J87m)#zD z76u)nes-8G=b3oxe4Ri(w@jlZ@u$_SwWDd_|mhCM}Z)h0w*bc#{B9G_OT*LlqWRSG%PxAua z9HVfw{VCpplTBxaJ@hDVi4WUYmi-~#3NB32kZ;gqU@$MYPgAU$P83}W-VRAHP=Gj941mVGZr0j2Sltxc7#F@GZxIZuvLXUP&H!#duC^vt$@0*1oN-6Y^lM! z;{PU1PO~Q9+k`Wtgr7x0@{F?-hF+c}0#0&RsNpl^3JVvhd@-LO4DJDM5uY?Ax|D~Ax02>pS0P83WvY`y<2Eoi8R%>P zUK77FYkN87tMJ1opb3?;7(Pi&h{zZe;c6#Du@SmddPZ$Rs7lh{HhKpvsG2Oe{=67jr9rjlU#R>kgC*Y#ucNb|h~#%j9bS!oLG(<+a#8Sf`RoeHmY@O=>4tQ0b(0f*qBIxRvQO2`H-YPD()8 zRrWx1rc3Y+$QUkoZ}SyDe~T*~Au*6NQ<-6X)OdkKE+p zCcoHTPzC-|j9~X~Wdq^9pWLDQ%ZrGHRlV3+PGn>ou^7gaZS)5oE&JI0#A>-C&bg5# z!iW`twy8^;v-VwGnP@7`&FCOo!KbwiB6U0HAs+xs%!8~ie+Y8-5ztRR%I1Qa?<&5B zg>L0fse+=t{k|aT90sulZaP?p4Tp=6^OM1}L#5LOd^#iG!U_;Pw>9hh+xSR4qq@Hh zrfD^GpTnG4*zbVoft2b>u78Mry(<}gCNY~5C0%h(c9BGJT2sD*B28omN^|D{_JB6P zRx_TCFu+z5o(B~P_v7rLD)#Vt)lICR9k&PABLw=0&Qo`Ec@#67oz+LExlaIdDg9zT z^e_6|TE%wJZ)i&u+f9yfRcsG_P@6GyyP=BhrQgS^*gn)EmMk8+t*T<-no8Zz_Scnr zH(F-#7eFNT5}<4^qpX3j;ID&J_f4>dzQtznw=r#e2h>FGfx`9!jE*0H`}aRAw1Iz) zbMwByNqJw_HdqWA{7OC+B~3d+G@4P{+)-QRJG2Z@&&Ji3c>*)eknHV#1{?Hz{AuQe z`7{>_oKxsP(CP+ZAJ8YTag1-l#v@h!`4HG^EhjRIXC?v`kgF@$y*%m+Ukd( zmc>IDnLFd;g7%g-v;pC^#2InU{p@K(44=(l$(p6r%KcF(#%9SEO zcNza4lL#1T^PgB#ehSU@XB64*m_Yo2wdkMh2L3nZBxl)H!Py=m*dcIeQwJui$)*rj z9iMxDv1kKSw{pF)A5(V54ms$O#gs(za4 zHMfIHYC2p-?bxHfz-KUrk}ayPx*2@tS@cpK^4t70?sve@ZKqv^h`?+S84AciHgy-0 
zR!{AQ$pYm=%Y8>A$#y=6&%Hq4_6-BO&IugSo!L$s5|Pjgkwp@XbQA|P6kb~81dfIo zmRC^HwZ~IZ9z3sp_p=x3I)VbY3kwHA;+Rz7WAR|oNfptoi3q4~YP+7MwPoCtSF$#r z;;WG%18_9bQb&^UA`(H_Y?o)NVlQ?|-p^iI&m8tg_OO@b0=LLKr1X>I@1{XnvYXf9Hm0zu@5J-2Pr>YR2s!duu(wU;fs_7~pS+ z1U^UM71i+j`Iul9a3c%f*EL#S<3);ROk(&7oe~-9XvM3V!h{FeD7@g8Z@loAqZ@v? z9lqRN#c=+INwZBKUU`08CArYs#Wjqq10w5qk;q6bPL|TEbmd61la*ffGR45vyia8Y z5Ilh;Ayv-JSb5UXBo3(B^}ioQbM{k75M%&(>{t9g2(dJVxx`xRx8K>|oaXSVPS(2` zk5k^mYAzI(iI?D=Tf^8PrDCE4XnXd3>wmvurzU-k~S$~5%FH`98;&=8cW z=byzUEzZVX3!D}ZC=Tbuiprj|%H{C|R-BympD07V)`&MXn{_+y`P`1)WZq7WGZJFX z540@b$dUUVi1W$WM#2RmE%C$_s}xN8v~9T;)_^%guu=`$fM{ zeuXb22VNC541hu$4~QD=D5G(>+W7&@EPZtSC2Dj1{P*(MFyEvL6j^e_7zqnt3n;wS z3Lb8-A`1}A_WB%an2{Om64~-@Z`}FuMv5YRU~JLF3SMNjl!K00jb!E3-b6brrb^|Y zA-Hzf$eKaW+t#xv4EOkP*dDPz5^;)0@XC%G`+k^BkbtpaH53uZ*xWF%x#7n) zMgSWeK{f_6zj4Z%lU7GWLF|Vxrhm;KL&&z1T#euo@QtzE!EBIfu)d3BiPG8Dk$*{9 zz2Ib6r-OSDF;wda?uAWJGQ)_24Tf|)!Tdnmt;&-zywP|aT#F4yZEy#IXTURAi=rQs zU8njbLp<6xql4)Spi?62q5?vHo@+-sH-6_Zz(^SVb&yAfTNkKe*=WAy3xOyyiFG#^ zU@U_5bYa};<%ay|ZLm_40m9;BeO$QjOK$*S0q-&00>YBW29Zb5WnRjLcnG;m7jR_` zWy9z)++YB(#Ig|-@kn|y%3!12D9bAiHpU><-f;#fzvS+@&bUl!EcK64_Vw#Ja1cz3 z`VH(lxGSuN2p~f6lJi@tUv=8LupQ*)`BwSbGm|gU2i-DvzBdlLv019irJf5H$pN;6 zPsJZvtbNMVN_nafhCi~@16?u@DBFbDJwsT?<9Ix;ep0Q8)t%!3`p z?3ptF;U`mpf%B%$1PDjjC{2{hCit+o3F`{E|B|%y`JYQkXU^MAcW$ zoFiN2zyP5~K9qF@a?M0vfqcU20c|RDeV(Rr?4808fbVzpScw;q0rKPmZ${JvOB)53 zTR<^Sk{JuF9wpgqvW0y;oYoTWS3>*h@uyPrC5SHow z%S^*=0|-ejofb%v_e}RSTf3|v)@RI`xDdq;RXQF2St>UjbGPCb6U{A^XhsQ=)i=6*t zbsKy#wWN0_hBjwe09!NWA@HEP_AvK;ECjIytX;YEFpPx`J;E=|QllUfeIWnYod*rO zOPMoZ3&>AC^v(V1UCpcVPPWb|$zW6`4|aZlMcWu{w^V;mGY!qntQI-|bw^`(Ndk*m zHr6WzjLbaRv0@0gZmbKWYzctxUD+JmEynLMfcil?rq4>74gm{;ZY6e?M{nSI;11yHDB#8I1 zlf82aUcrK6D@)8v0c}g|tK`jk8>#wJOls{=?#I*GQk7F}3|L><@`#lQp$@PFP`2QX z%rnSI#bL()!BEKrG3X0{+z%pd0QS!Z!Qf;tEFXrz>&$@kMku58|@fnTSIEs2a zEHF$~PQ#draH2BGm~5~qG!Pu974)6Ykrp^p7Pftprp{ASP#S%-%=f~26lh6IOZ79e za>~>SI0wmNS6QjD)dH)%{oHY(ffPz!=b+o{}-%)`MFUewn3_?yDY#~W#$nvY{X0ZkHifi!&3)G$n8DdK<^vUHG 
z5FGGxJ-b2fZsmwKSJ2&!Y^6rK{j$(%XsX!M)N>BL^E}Et>B2J)d21|p5{{k^tWpu&d?KhG6Ze|QC-Dc zdC-&M@CpcIcmMo7%(H{J0q*f^XVJK`wyQg92m6G~&nN^9Wf03!)m{4QE#w}3figU#|wy~%<7ttX?rU4Fz zC!4WjZzn}p>;V=B4AJ#Y$oe(pAs*UI`(_p&v_l{!kVtJ*#ezl7ovgijjknvd<4t61 z$~-Qf$ZZoFc!i=SsVo52-IPNriK$v_QprxDSv3TGG$dFYM0yhBSq5YvolbHYgjc^1 zq*ob+kMS_$ngfYi0VUTpkO((Hvfc_=dl#mp_d(unfj!v+khf35|I-kaC!tk_yx?)r zb|zt}Pt|reX6m$yQUH-^pn=ng56vMTD-fT4b}OFfXLq4G3-LDOLd5eh&NYX``UpW< zk&6?Li*g~ACy@3wXdC^o!`l{b9r&m-@)j*2(n`psrF;@c*I^yerZNC9>I9cf32SOq zI<_u|twfcJZ&>tMoF~A#;(xE=-Rs7A8ttOR(}lL){$w%OLrcj$gN+_i`v{a2Nq#!_ zftoIdSS@A6CBbuqcKDNPtp_wU)h(;7pmsg5)hctVwel+eOa6^m5N)(6dmv>n(tL;I zls`RYIl`?TwRUOqlrMH!^R&g!h##y`+9Kz`kJj&6Rj}**ls^|hhqk~+hY@I1!R9}# zr0`EaTWfvF4rYb$excc%W;OQ8pn7I@^ycfR+I8}-^r!^gIsLDkdoXI=lZf|JvL8uv7WzwPMD9XJCCaBcP3#l%Mykgh2S+0p6>076*;@Hjh@8J?=k; zeYg{SmHtX)#=YKnIrCE-X#S&-;FPKJEm!DvH)^QVpkgYvWYaRO`^+Wm~Ai`91J!c1A$uO$WJT1qie@EdFOoZY`~0Zl_XnV<$cf% zzcURs%V4v^UtZ-+;!&8qD}Er@wqwvxR`*xrr_} zQAc1}g%^ZpU*jG6|Izjy;87Lb|M;1?ySbaq&D~_v zNdjrKkU#*XgqF}jf;3S&0RjX9fh2(NstYPARuE;RNQc;^g@A})1AFhi_x`G{B9h?3)OU2&| zaZqn+h7h_NxC_s-OoCSegdf5gAo2$=1=?dyq>S5wU z2nYC0F-39MoNQ9T-;^#y@*kAQlMFZHxNX$rD|X3#5Cie8xgEFX6xr+&H= z^*K6U`Ui6a))m`~Lrj6)Gf&omr%%rQfk?MAKZf2Mnde+(Ty}fpEzUR}*5>ZpPJq2T z(Nkm2ZN5D+wf4ddk=|Z<_Kbfwa*Z=NfBPWvh4b~dW3hI;bI0t%M7xP|pX_A1-7ZyM z%*%>8KBNR2j1u+Jd(la2`r7{5slDuDhPXqmOo>WW1Nzv#!`GwzHb8`Nr@FO|Jy6fL z#EkbYD*|CHYNd8R7L|yI>4E3HFfaLyhrVSCX$-&0z*OJx;|HLmGUUlO?DJCu1EfYk z3i&HPegmGUmtjOr!UZKConY_3*mcs&S0cipaaxCXEG5Dg070<7}+J(G*F*aS=`UB^f0WDJC03IZrg@@&HmY%_GviB12n2fqm2FUL2XUfcZr)NITb6 zwDPh-2)o*NMGj1$;L+BE0Tle=Zc<-u0)@Rhdc=8L!1GaOwu1u8s@e_;mnYvNa4pFt zx_h*0gclf|xRP`p{mPHuyg=tJVDCU7`}F4W;dWBK=<9_FM1L=o8o+&z5QErbut!|r z5g=tQQRERrJ>o*HKj>McfJi5jEuch-ksdLMYXwpkDMoWD7xUv1j{r4`6k|PNoJWl3 z5EDFNqDM^fh{+yN45bc`u!Y>}3)$+4OH7LpC1N^126JfWzs^*JgK+uf^}W_iwae>U zrt$~ayG5yUcd5ld`)yaVStcx^Fm&(lTAvn|h{|v-g@%W+iYc;G(;~Zv?|bA04?#BH zb0|@(l6^U9O_ALnhtit_6uLhtvY$OKEp0?e@wCb`2w!HGOiNo-S~WARYGz5A!{|dI 
zlU7`RFDh+%S@DcCh%S})PCLc7VD8kJ#dBwrOlxb=Qd*{l?+|f3Ibi-_F74D*vmJnW zOaXrS5F1;WZShf;zCv>baEK{j`2r_MtAp2LJZ(4P(gPpnt1#Tn|B1+bJc0 zznL_>^inRnxB*8GGv6Ij|rGa>tpVhh)Lt7_LPO1GuO1_uwZO_hyeyBa>i&NOaA zQ5KD`n>JY?c35JksJ6si`bH?5Q$g$A2mc30*qse%eyguX*cF{Niu*0x4uz=dQ-V2= z2OX0tm6q7U=56*`VjmmJsjQa-D6@2AnMdBG0Y-Qb@~ zfQA1S>p>&bsI}CNf5&iWXm{(-sX9KcOyHH{;OCNy5dj7ZSUVw@=mC89MRK*_sQ^NH&$M}y(L82be58E{}V2cpy^ zIIOyYCWE`?`!`cD>!t%sY6g-l#kPDFjEl@h;^jzw4z9J##qM{pNW^9BmeB9%2Q$?d z(G_AEw3RAoDOBi|iEE$(yMnG3cR(Md8rG)orNH%|i#I`G=w|UP47&VAw@DY>Apr-6 zqy25NFKw44v_qEBU2+-jMc)A3rd9NS+)NM3`)HTkN4w=CP-rC$t~#F zLy#Ci=@@c=evo8x+;Gtj;slQ2L3d1oZJTSrXHLUjriK*0Rp6WqN?Z`lqgM=UR}La_ z07K#-LlO+}3$%sRhFi#M9EFY-7u7UAM~AZ=9}g2%=>6fs7Iq`xN*88XheAy}Ad<4>okxC#3!0i6r2@EqbpwIx`~=izdy@j|E?&`t_gd+&u2LDqyr3Kc{+sGqY@6FyxtV7?L^&w1p z*YSm&*FK1Tf(D+XG?U&t$r=IgLl}c+$G@ompA;$6?Y|jbk&E*eCeB|}{uhn>6AkLp z*VXi5hPBtRoSq4-nuQ?4i`uZ>tFtPYE>=ZzXw0QGwAeXz2`!c)68b_>da-m4jx%vn zfV+-RX&ya7w;WDS>iFaubwJsn;Yc5u^sz~w)FEK~SF}O!E4s?X928=2Mu0x(c3#O@ z(|0irP?kuHao(#o6e9*|uCa}ZQNPIZQTOKY&$Z1u@x=tPhVY^7?f{!Y#T1{W)&{z6~rqJGV~oZwmvY}K9rZpU_V zO+k{_gw5CHGt^xodO!c|l{b8XfS-_-GO^g}3>h=4sM!9~#oVl(l9v9zgyC9GCBMdS z?g*ceY)DC^-GqY5bjZ2V!!%&cIZ7~mj4wt#)GNl9*O%2(po{{Xh;T@tbG>-9f=IOZ0>cj zFT>RpU8lAfjGTk>jMG~{pM2P}3pkfLzaZSqC_QEZe_F52z6qt;z z$nurpE8?2AieoEDeFSy*UoDFQkIeXA?a$gOWxn3ZyW5kXa>x1#)SF{{%~j?+E2jNv z-AQR^G!~nJ)5Q3dU|=Kh#yTV+-A7f>(ia=~8-zyn{H<5msZY_3&fw#71NC6i(+Um1zu8*LhSgxgA^u=Qk5-5bT#h;uVORz%lUB)O!CsT zUJQuq_^~1aBHBGzh{Ulf<)MvxC7sr#(lPd;7u>dMP%k9tT7AC~N;kj(0Z}IBwfAmplH+!8S zs(zm2dqEsgM<@G+xD%nr(qAo|=!;ji8?E#xAKo3zp9`n6(u?vI`%=U$s%5cnY}0=@ zX$>nYnO-%pREb%hM2}ffI%6ikMc?F#$&fQ~>~K`jKOV*oQCt3$>HjcnQ{v1Zge>T3 zWl70g=QLN%?;e)|kzYuSN&P(y1obpJ%%{gG#y=nRri6Ry&+k?L65nO24=%%@NkV#tA@81P1UC}e1n<{prY@>!-Bl3PR{gY1zzK%$J2x$9?*1l74R|M zn--u8-e$<2=fG12t^0n!p+96s>?mXFjI*Hk{fQ&a0weW>1@;Gp`ij10{2x5NDv`+3Y(Eb|G*W6)8;-{bC_d|>Ee`BPE*7T7}fK^=ym7#FCjdA^QV9q*dwYaQS=FF*@$n=n!Z ziUiFrCs#NQP_)CVG*obGqYB1{S_o4!SivbsO(M0_3u>Ib}ZvfWyORgX{I~Nn% 
z2}gIY7^~D2^x-s}y2H1m22)2`&^VZc^e@rTgb&Wi*oZq_(l?l)#W|n}p;)FB$((L= z#HXcT-DWjP+6pF({4L0cE<#cQ`JXFE_dg^#2T9I7N0N ztmUgWv19z&q_Va(>hz$~G7Hc`3&Bn<(s?;NVtS}$(t|BygM*ufdPvLxxw zEw|Fmr*-=tHd5Uh0#_ikP(#ayJ`ct8?}`hwBQQGnre2`UR&DLtoecR>jsaGyE57nw zMR>H;N!NR%AsQRTc7=l4IuFPHYJpapWONTT`efb0Lv{0FH+76~vGN$TfEE>(5LdB~ zPg`8zJ?))Sp>)nD7!DG(w$%VA2?pa=-9Hnm1gnJd@xA!1;`v#1~pZHow z$Av*piD-I8q=K=^#0?$p-w_aU^v48+h97!$0))Tia2XiD+k;c)IS|Bo7$64H91IN( zhRS{f1qqG=krf;V4^v-LPYy+##8u?K0X;+j$FEF`{*$~=X80SIs__+@?h^dN z#|V5x)?h2}WIFpieqH$OYp^~r$comX&25vasp2t84Z1)dVRame$tk#Gyod631zbI} zy6}t6hxWVnPzR=6k^?11`R$9lS7f2{Q>lKiX@VX0qGn9%$SC2_KaVMmDQh_x+| zHs`_^Ros*p$;M4NFw~S?&S=V3DRL8~^D23WlE6|l@#E4e6CPYsbtsYVNUGDw`%-wt zYuvPeX_NFd+V&f??YB5k_zorh0o3DbET=y=YTFA#ZF^CuZ7&M7?MMu8rZ5d{%YOAu zYm-YiEkvH&xWTp!U=%NzAbj=~^#OlVK_fm65TGy~XcSIJ;5QdL8SsjO^ zb@>ozoH)U z8`#ADRt>MT-$?oalG&drQT{?1^52vtPtX|PQB0G6tAZ+fj816kC(rh(-BM*IyMPaR z_)@z%{WXAqypBk3;Ib%`jO1H@!F`*CLBc*szDslE`)c`Oy9%LCEVf5zy|TBL-~G|I z=xhG#rnUs^%-W?(?0i>bUyQ_lV4(U#ay>vDzrrpP1?s$jJznppk1N38Od!gipz>a0 zs|sJ(ZnZKdAtvz5sfQjxyL2#K@`@PgGZ|4gTKdBwDUUHF^j6#w`Yh)ISy#qOD6P0< zqDLlqBve-164t(?O!3H6{)Nhll+8?;=H+X!{F2U(41Q#yryhq6wv<`?Xdc$)LQPA!@}ioIf7jyP2lX}_ac*Fb_dIgIkQVR^cZ&W zJta9VLXHPEFnlLO$cb{2DJOdaumoO=(>XbXA5-};jUOeP*mP6Q@WNm-YC2ZVPI#^<%V8mxn`*k8XUh2o-LYn=M^<>LR914~t0LqA zZl(n?fOcBwrFn7@m+(?{E#_!T*!3@dT*j}L^8>fJ=1Fwebh(rt%lP%maCw!y+Kb0> zeuWrjp1d}K;aNGe>$#*ic;t=zquaQ> zQH9&Rv{J6&j1*Vq4liI?*KySK{MZlyI99afbh(i|Ht}OKkAU?&`nH&ItC#MS+xP*w z%-wQ_DR+8gHT10HT^@P2m$t}a4j#D2l=rgZK2t(EbEkX&vK#rJDIuV_PwqA)#4``b zz5KGzEAwQHSMHa!9(jO&4|?Q7{CmhFALie~VFed1)%EbmM?4y7)s&BV=@Fh+k9p~+ zGkPEQ$|vNL+?~f@rC!S8rhLjv$K)1`neCBJ^EkPOTk;t%Kvn(P@&5w8Z@h(5!;|Jit=gAK^wGa9ABYp%v=J!uL z@>7nw-XlNb-_Jer3y=KLBfs*M)PRdGcF+e8-RPIr$;v#|M;CrtSl_rXb1{>>xilt8&@HT+{{RBpnVW1p_i$A+PMi?R(COP(xLABWFjnul-fn4miV828G_v1t_!p|9*V{?zo$Kvv z<+}j^I^1Z-h(FcRFQfBS`3-iAdg06H3F_k;?BTWjZnWR`w21=QHH~NtMT|mF5YqtU z^L*0)1zrfkg86J3v6wNK9koTb+hc_p2l9qNUi-irJ4|@vf$6LNMX5)WeN>dy2JW!O 
z2=Ptr+iPtXZqcg`*4x(`Q^hn@e5sv@21&4tL<8`&O++$lcqPXgO^jsINU@Amqp4*y zGt$6psEr%!OAHu_QNM4nM`ao5Mur6#;8aN5i=gnSIBx4-bG_SwIHDW2m^$5*LdWYIkj9tk;57?=cqeRK}~LciQR31Tj&) z{D&1=YKh6B*fKJWEYoOi87+*KmPixnrV(d}JB(JQ5oH;zjciN&XtX&~8_vkFj9de% zY0!(bjCSnG71=N=hbxp7RcYnZ)25YQTwVcvx<|!hs(7P4+>>Xtx5P2D&O;mR4C4nl zKHO+$+8(YYl-<&}K{}{*o9rgI(-78Q+n}dWHfxE|>?DQsk#Xd{i40Si7lhN5U zx>({VBi}N*8Qs;-uef4eMi1mR|6O2IFWq3gV`~=HZqx9O)Y)3^iR2ps6Ut8?XohrHYt2oOAVj8B0Wh^unS;nOTaTXg(OyggcahY+s zWn5tdEMqATgQ-T#dL2pA##>vQ;T5!7vj^Lo~8YYQqk@_qg3+nq}S)}y{qk( z=}&P9my45@_zMr|=Ach=t)DTTwT$PC=hgg0wg%EquRU6A_Y&&UzhDYIda%(-Rh)!? z_^LhVm`?KXyZQB67; zSy_8at(_)et(S0k>SrU~M#oI}E?g@GF?^krWwlOQ3eKA2cvU7T%gy3IGUJ zidJ|BvBUEW-!|-0BQK#Ov%JT{%d8eGse=!63R0;C?u#?i{8EJg{tps)~COsA{9~6mY1*`jmPDvCpY5 zu((iU!M=U*g9Rt((WsgFv+n?WXPCu?W(ovbaS(eCh6mQDM&3c+A?#a(?^wKozK!x6 zh7w~M!o%sJk$A_9JnbC>S$7{6KK%N5)zkhExJoe=e^>#v2Nu9k#7PJ^SwILIOn3^8 z!iB&Zj%Nu%|AnnA%OqCm!+{uW<=HtKaAQv&hV8t0fS`97?)UN|0+0PD&$zv(021gL^NWsBFDnd7{p=OL_o`&~y zyl2!vD0zso;A~z)E%cd$n2xgbN10idz=gYOHa6gS5WjcAzP~$E3 zprjJwg2fOxEd@__C8R`GL5Ok<)Q@k%HsE711Iaidm{nq?D8*a2m?dThn}SVLb5r=7 zbCH&)d#eM~x}hB41>wC4Ww^CbQoevF6La8E4?%@y^+0iqH4a?!Af(AY2wXCKB$2~M z60wj{*2`DxaOvJzcW>UD>CW z^0WQj8o~q+=n(1d@^?r7ARWkGdiZ-feWhbVX5^$+9}xJ}9553w*0%u1+TXh&aIklV zAfrA-Is2&ZP)IEMv2DRU)PFw>K(rWLCC9K}4(?w9b&T(u>uL>zjX@U!)DEhy`xIeu z548Fa4IaZ-g%{+9;meTY(AYxj7S+%YybcY%URXmzgL&sObXSXmGz_K*p*IZ)E$1et#a6#CTK5yhI+UREZ) z1O>&g9#dVn371eszWWf3#poNC~n?DvfI-K%lYbgx(dJzMoDlI8;4_yvX{uStd$9VM~C*glIN-W(t2hHNV{mXgd$~H@i3iip(`TivTG2&yZ z%q0JQy5%UX%#TTmIfgn$C&lciRkzg8tx32Xy&8i8KetuaEkem|hpysgz*BQ6*KjEn zKXRS?flRue)+X@??Ev6GXmT?_nvSL4(coSW-K-7J%eoUX>`f3=ZpQJ;77)&@km_%P zD))A10_*^x+=&B&yKqEsHx3Ez!I{8)^bC#_Ucs@#-?Ue_Xdg}rYeYQl$HIC*G^c|i zH-OtQo#_G3yKJ>mEL?6hJ^r4(YAITZ? 
zv7ASr$jj+d8Mu-@lQ+=k@>cpn-bvrc9rUf-OW(`K=?D2T{Vd<3-ym@P9fHOxxK=a)8gzAgFkpW&e(|AT?8E*iP+4w}XG=37Tj8meG%N9AV7?BHP=(esFqMfU~=-}!kI=Xs@ z^IQd@lWVZ(>?#rYuDM`XupWuY&Tx9)9lp|?roXw- zx&@*N-@?(ptu@@A7p)$+0QZ$rp`%?0$V`rGqL0>yOA#mx9aABefKhOxd2<2GmkJ59 z4+&l-E{BZwFWj}e0s{z4I2Gx*5op%qI-D&pr-yVnpPVF`i=|>2MY_^)!g8g!nqrJM zfN;7T=|!RbG)@PMcG=VpocXonH=dzNTe8xVyQ6~-cMAy9Fy8Do^AD6zszd*@; zH(&We-Jifd0b5@%B+nQ-#Y(tb;$HcrQ+)A=+#^@MdxDHWgaqtfMV<{!Oh-agVDvPA`DaCbs`YgW&f!bWO#*rnv7EjWosm zrg*><561amEB&e`?U=wW1gyhx4$OP7xxz-}fcqX7(&4zK;Bcnc?L;>LWsBoJo2>s8 z3#fSDFd?3Bm?iNu&QnaZ!!L&@nwP2oxSaI@Ho`Xy34RHE;1gMg2O0`2TD_9bhC>t7 zw)w6@bf@MGHZnW0pEhY00Z`T*n1|R5mSYFCJVaY?%H*&(TfwQYI_ft3`nNl5ODzR< z9BIP@eJ3;(a+nukM|It;pp_^V=w{|2svg6rJ9qIie0NU0>(2c!_fkvu?m9&GjX`d0 zj9R)Me8>Z1a&u5{C=fizWvm{P3+USx$f2rdBb zBLJ%yIDQX@{0_s9M{*l-odQ&#ctro(ehNBd;)p4ZVhq=ThsFJ6j3@RaXtqigcLn+7 zTQLCm>}=ox+I=8Ed$^xZ=Q43wAl$zf4;-uSV=lACaRUyx&|mAg55SEb1#_tloVHd% z63`+5b8d<<8US=xZYoDrV_=YoAZuK8#9j~u2^5Ma#FMC|WD%TGLiZAQpo8S=2WZcJ zI&eJ-m^?d{4wm2C{j9S30p}Z@%zX31a0E+~9Yco?(Iep0jxd66Ej`NG2#+10$7|?` z(>w!b_&(XdJs+ZDJVRhZ=6Jpl_bK~923O}FEjPR4p>b}5! 
z3)YIZ{JU_SXc3%Moc4?R>81Vjvi}v-pS@po+5xPsi(fedW^G@?*(W&B<6vMy>08ie zNV4|P>)^G(nZDtoY6SM~qc=N6^3PkH>^Rsl!}~T$2)Z^?VVuq1-r7RoXXC&Hzm6Yp zr*9`&@$MRWCke(;;QuaWuj%63CR~j7YU%wNdbgTJz*GyTsG}tXI3XRCKYpNNbCPd5 zNxtPI`2i=%iJYFUlVp+wruIYl*V0FmSJ%L~kDg!#{)Koq_)r*V;iy-xrJnFOeW3e4 z4sfGE)9VvPm+9gEH2C_N^ZL2dlsw3*PoPpzlC2e#Ld@I$VYJh|MBVYyqpX6^QsdAd=WACW&fM zA@0GY@O!}%Jb+NU;1k#_?!#r~z4-qXP^kCB0Z8=@il0D^{sTgE3YJRU;;>A|>fJ#+ zCcBBJWTALkULc;4*NErj3h{zGB3_io#49io_^NzOyrw5SOl_emkBQ@uc|?k(;546t z%NE^Hyr(hsBS3t=;~UK2VeqNDcn%=qUexP(&98*>xJ-@- zvK%WUpvMJY=(dzT#)LZc?yrFrYu5mzTmbm(@>E_0W9#_^a3yH zm)@6yYc9idw8x4ae~OlyYB%5|HFhtDz5)+}SKy z2z-PI%VDu;P2}H&>wp-gvCvajd35@DO&4`m;^DLKZymE?*OA&B3(g=@{JQGqnuRugtIfrZ=emBsWOOFD|Ac zzf{+~kmS~?MZ6uO!O*ZJ@fLXax1sU`wIcB@kk{YCQuaQL6d%!4@iEN9YzDTngxR~BSz5&_6uKd8tnP=j1N1vAAaEe2 z(=00DLGT~mC>y$GG#)y(L?$k7#sjE48S^s-t3X!-LnK|`?Zzz-i<5)xS53U0&esC$L z?ehPr^}HGiDlMq}*kofj7l%tsW17flN|SM+lE#5HzKJD(`NPamnKMIW&eUDaJI*BW z5r&DO3+iFmpEMK{rNl7}mr}j?vfVtuLszRm%M_?Dr6TX9WXW{s9s@81ehz0A8_FnF ze99tp_{56O;Bt6PKofffH2C<;C)fhavzZ8%XY|0VJ25o(kxsC1;EsdQs;!ig8|UuG zz;fzeuh@x=3W2?{Yy%tKIn)Zju%fLC!HD(ASSP;3+zErbC2Wj;10FR3E$}T!Kbz6y zy>tuR=6RvQ=IOQoodtjnj--NxO(KIwVVEAU_hW1az6WikB#mGuaK7u*z!;Cba64JN z7vwS_=?0Ve*MDa+|87{=A@e>pM)Q>zosa4L%3*4I5N58hHp4uksG$j>2)nAlm}q(m zJ1idRLH;8~NMm$qxwXQbdk{vtC=?)oG^*=1YcD59GaH3_qH%E;RiZO&BBBfN`AIHR z81Kt8(Ok{0V4+MX7U`0(GUq7*0`jZCFMaYTSrm<4)Kwtlg|x zuX!y9$y$RLb%l;lK6RGeXq@ap6J$@CDGO<_>v3x+r&R+A#e^G2$GEF@PUcucwqnjl!L#`)C$fw;e2y5RDQ7&fb*@* z+18(6<=mHBfqjq)lF@OXJ)khzXS@M*IyXmQ%;!HJ&Bt z++j!?olHw0>KqEm2(vT|B%@Vhl9A-bUMOW7nff{~zx06R&7swGubmYS%v5MrgPY6U zfulc;{z0v(R^` zpul=3Ce=o&l3Ovc0^4*=Sq(h{22w~5XbTcY9=t9Z107iBTOXJ#aK53gp%z&p1;#E@ z`Ael_Dvvk|v9>>pky_8@AUs0-ir(OU${_O;WTSf6Vr+KHfD4x3w4lz!aLt8FWFVlF zF8tx>KiJR!F5@MEhG)YZmx$UgqE*4$cKbkPNC0&h!xkJ#>b3iYzoAVWlPvdOb-fR( z>-`vW4}ict2xZ>gp{h5>p5sdlB^GUDiLWK>ZBZ;B*?56-Lk}PYRd1oYd3dPm!=0)# z(8T~~M~0#97WI=J1Z-S)4pv`x$fOyW0oB~Wb5_Fi4gRqH#bNzRX;f<{LIi7lH-;B4 z}lO(Kk{)@S)nL%VCFaQP6%A@FqM!_-M0 
zr7rSOjKjw;4j-o(@(G$FpTszPnr@ZPz*g_GK$&?__sKl47tOE~@X&L?-AWHVBQzYt za1X#4j_v?%&_wBgVRRWN0*_HQ4rAu%@o3=2-7GyGnP=M>>b#x7am(MB<~kxh=Ny~; z>U%7gZ&AQ**!C<8OOVutft#tvZTJajN}S{#-QN6<(7?42mM;QG39u6U4#v(T7UP3; z+Xo|50tWI?{3hbJ8ox=0u0FYOyhg2pZ?1bXd{gX*hu+PxN3@L6g`y88%a3 z*MXf=JXnN?ApmKbDS<87_I>+?CJ{lSD-ODLL{WKVRo~Les^Ym-s>!(6?Anjtx0g#X zQWgIY7NeGYWG}XXYhhBZg$q%*zcN#akL|*^4s5cZGvinT5CDj%j36kMQ?iXT-`z5wPJ(n{y*TuO_pw6-9X5l~R)Iw0ERoY?#Zd}}Yi1BoIXHX?vh z+TX(%23h}TwE*5N*t@4Hh8~)yzDjCUWJ0FfqL!S>7Jmn|@|4G?K8#6-Q-6G7pBR*e z@eo#!#JLdR7g^|Eq@{lm4p6vVv#g*4DP)=obfWrniU^rum!g*5~&VKW7P+Ne; ztglc%>W@&82H;GQuLFUV$H_u+MnU3f*sZ+>*R0z9d~@8MYOV?|iqkVOD=^TDu*Xe; z7P+M3xI{ruv+EfTJ?o+8JoLPWUhvS1{OzT1_`b}KSNQQNKVIVpKz!bS>O39impAzV zJ@Qn79(gK3k9-qBk35y2M?Qq!XV(Y(fF5}&eH1VWTI8ut&YyT0<%hq2ZqgTCH2s(S z@)f(jW*4-`GZ?@%m7qYLLEp0r=&jHkZ$m$O=@pdQ|ypNq(|7u zP53+_$|It^!Y^VtPAtG$!C}L2g4*?woml(Qm-dG)U?A`M&Q4dad}p^7bJWT2?1G5C zCG!A706pkwZEFX8Zx0f#4@b=`QR1uU7&ZP!sE!}~!Ol_N|6mu@cKgvjUy4FC|7R$j zZ~58I6Qk8D?3TaSokXHWDOHuf*z2Tgu=4%e{w=^Hjp3RG#HG*HU5(6B^8_quJi-GfwiGe zOCpG0T2@>!0~x0kb%mvz9MCr%qj*YX$=sI-vqoNBgB8&s86SvqH4+2S-<4w%hV4U{z0QD<0EjBHBK93i~c znE<8G_0`qomYP#M8+laCgd%c7?8^FD=K%F!2Fwzal~h)8b1a%!f-+R-2B=z8-Z;jb z;=pVMgq@3os>)~RaBUkcO}a(OD$8}riczb%x!f~Ob)AGtDuTTitWcv8IlUOFM}5Cx zK`Mc+du8d8lD27qLEOn@jSID?d_fs{41;B6>70^^(y8cUH1pZ%SN=nTu_>W6Hlx%w zEv-L5Ae{7Plmjbw=A6o$w5n3H9{LOIR2f}fwh-f^xME6aRYh?{Y1v|R>#w#Noib=> zpCK3Z?T2dgBJY-+TUl>}g`C zTK6B!(&PWJUrk9xIAA0U3&M_HOUoH%91q}~YN(Hnw=jwZpiM2hogW+E39XtfKyAeX zdb>LS2g8qQj&(OJ7nG@9j!DS$TOtkm+XAZonIem=APrOP|FoO8zF#zFTS&t!X!OFh zfL$%wbt$`Au?wi^rf99UtccEQX5m6Z8&l-))dm0vU&SZC?f4c3?lE)_Z8Vy&n%peP zKiR^q%uyCDcH>dT513OjMMq1Vhp=q!XSqLb)sQ88C)3Lb3p8(=li633N}?w05Q zQy!ux9kfI*6dF5Ud@JNfZ?x?E3?ojJoUku5`p`l3*$Ml*h`xO7qrb)Igc_dKa}v7K z15~p(S59r!NxQWxHMOz?&^1%b7ECK~&}5+5U06@bZeQnHWu{_D1Mr0!cz$H(z&Na2 zr&WOdw&3vsHu}V1BbdDt| z(FnmPsQWpir06&^IL^#7_VF)d#_NI6KHf4zj2|2nI?+kU)J-i#g1a`Y`DGV3;WUYJ7xB+#*Ja zuE0A6SkSk=r4V-+`(7EW^;Pu=!J11jDCFZv2$zK_p{*+jbA>%1Zvz_^^0^L9Od$3- 
z-Q#+E!lVj%G%D0hobAo{j+qwfla*&iUxfgeg=a)Z%)n|c5DQT-KhO4F!@dq2aYO2B zb?Wt1Ma@OWF}oezwZG&AH>_c7+K7CFNXGl$W{w0VLNe`%Qrf}dfJdcF>}f1 z5L)jAfCDbDr5!N9I%5QN#|SCJIOq>TG8|nv8htVjEjJm`#~C1QbCF{ua=H`@E2reJ z_=}-$xEF8Xv=3)jPAM8GVrxC|Y@Dt&1IOEtKQp0x`QOH=cSBk}q+E;dF0jW3owT6k z1{D^3`$4d(ar!aVCCXrRjPzQ{9Ky!Zv-VSS{%x_JTJkTWG~g%u7?sk`M!zG+*{bDy^9SF4j+MOFpfS1ZT|$@p0Cid zU&Hqs`iQ>8+jkWBj=t9o?Hu#`5bQJwuZIA&DiEnPJq+Ae3AhKIw62r9CqB}|ABEmH zN{JHP_G|(5C^8<98&fJq|Y#nDE#Q{aZOaT;xPTBAe9Jx#A zvo+mXj*+xKu|qrs-ADskkuI8!BddiXjFt*7jz=PBhlpy_7}deXD8dr(n4WCYF%Qhg z;Y4ew?jB$2v?oF{ECVgU{{b2_^#BdpOM`j*3Y|B$l5p=vv?uW^vH2W|Uj`c(f*va> zNW@0wN4f(W@l7IuHlGPLaM#;C8{%=^!VVJkA@NPU-3CIOv(xhV?xV%W#QDR3$SrfKbP4t zMg6=OYi(HU19;I0Hh7!MG9Ued>;3I0FM=tQ7ovvq*aei)&U{8o9%elq2@l)RwE0!t znHJS-&2KJC)lBr|sV_co_f1Oq$NUQ)jf)c#)XODj);s4P%gi`>-Eh$q=G!V^kFx@4 zEAe+^mdlt5JDP=KaiMk7C|6X1IkUKu^MxWutJY&-0T%@=oi8+(Jccjcvd z>c}mT@!_X+O=b$b1$jr7FQ}MW(!Z2VexA{JHeY+Qpqd)X5Tdr`OI!FTsvh_h_GTU*v2eSUE%eUc=8ILIy=tav z6KQGxV5`?XwXCEVsD$SlUja(PV8H1D<`C{cVwK?bL#^ll>rlf3~Z=0O-J_>`d+Jm!B9AUCAdmNzf9w{3@1RI6YAZDip_k(cpB2u3K zmyMMT2$Hr4Ja2#;>PA%bCYsa~Tu(McwQX@@sS_BSUVv^Y0yi@pYw~!k$y|6Q=h;}n zo1$vGqD}}_gI&$wDUAQWqsX5tyAB36VoYs)!1WNUq+*06)gaAchJ zTLmDPXzdni%D-c?#$KaCI_FjHP=;( zk6ndc2>bf5t1r7if%?;c2&|xkU^dc=rZ3``p(YJ8X*ec3Qx->lJQdTAiH-imC1Uy$ zi(AVaBvd-~&=z+=iM%kYhCQsi-`uK-#fUg)E@UEo%8Dx?>JT(ZeSD)Cua+M5xYgX0 zSid?sI?7fB4_RrdZ5LmdCSXldnUI{b`b>>fuWVn!=_IA?KR4Jp=vS3mcCD*h#Ar~| zAkw9pe~Bx7czDQ2C5p`kD(mYTd~IlvGi?%|N&H5M|KEsTj6?h|&@E7bcnjk)UQ@t= zL`uQ|1CAHyVv{bhaF#RH0)vE?@fM~AUMAAT7IuuF&Xal1i{B|0O@-A`D)0PQxB9d&E?d13J^eZ)Kptm5pj7X_<{At1)EL$~3q)(6 zF%a^W^>Lf(xSKWfV<=oOm_x3)Rt~t{POMUhicMM}-Gf!=UT}K%Vbga%qyP_KoBbeY z_-%Y zHB(1cwf&VC^>Qz_ zW@6o^k*d~?2}D5Iv}41703edIh>x5uHEA)E?91^CG?V{CNXE00}K=BfB>7V>2+JP1*{5 zW0SUgZ&{G53L~eEV3r>Y-wJdEQp9wAC>q{Z@)~{azSLGQEz6R ztuuB~KbA$O#MaYtkh1U)fbsnDsMP32WIcQyR;qq#*#z05i(bD315y_!o zSoKPsl`nE;X;mczy@Rx1nTH{x;-;O3dCLrPeV$x1LFZz!`}{8^yTMAblZZc*CXC(B zU%)32YHyy=J>0|9;4uH6*22i*IlytL8dW~BtaO_C`L380m9sR)QV(|m1j@W6QI^_V 
zXh(=5B|eRcSN8A1Y*?jNAHCWI%$i5@07jDB-mp|!SY)`GH(54S_up< z#`ra<8Y2~BP*5yO*hk=ap?pr+;{FxobB5q}ntKt7<%w#r%dp+9GJH~E2ZX9uukj~( z!$;ycrMPMV?8X%-f0Nb@+QovZ(z13Ox%;S5-FkLwjVE>sbIKQ%j4B^kQjCskt2WPd zHCHdqup&gRT2;&MupS7}f`|o@V+LlGjvrW1WLTY*d?Q4PD?y^8{ z_h{n|#$MXR=A|=|ES61KEL^f!vSdN#f%ANryHPK>jD!UCK5RkofyhVLN1H)Cs_|xK zLG-c5f?NQP$Jy@*+J#NX$eH@+gb$x);vlO8CQQ4~lPt-H0aO^av0!HAX$x0xpS9>Y zdY&_T!2;m+3t+q(#ht<@BRGaCL06q?Bc!chtMoBvavi+uN}p%NS&e6&+FgOG_N@l9 z+3|DjHQkO0ja3G=saH(&C2FQ&_0Vx`)k}BEEH(0ZE8aS5Ypbrf6`bgpN>>XtXDkeQ zFiR0>IaDNw{a#oT-CkXFt*dJqGH&4h&gP0M)$qSyIQD2^TuhSQus2XVL@cfBH)mee z;??P|G*R7(eW}qdKDW|aXz<(y5a-p>S^i{|^{>ciwQZnCY!g-i44s7~;245h3RtuJ zJSMSPx!`8hZ79g$t&iIHFRN*QjTJjvz}5(mgJo+Iq@MN5W;AaT@Fs$40*L4;#$t?Z znfQqr8(bX>gBq{nzzsta`%4esEevNbZ@wEUi4g~(wnL7}Y$VGT^(}QENJshhhu=L z#U+O%lm|B)fODV=l+4axN?u&B@N(oP)pZ|N*WJz+Dj;3Ufyx#{kuauxl$ttsCYtFQ zpk7EY&2gL6)EP`z`dUczpnaL4gZuKsG!jD?6yUW_Zi7z%F#ytFI?a>`8@XY}gA%|^ z4EG#8#-9b~kRwk0+21^;KVR#B=@2w_Qdljn)tSN9_#_jrZ2G*VlSM173LWL7*NS!N zw~=digWt4H9z)wJ_iP2K2U^{Q`^1v;$#{la9Ht0OC zKb^2~>`bFE%*JD&&BjQ&5hG{=#?V%bpxwlZ9fgqB_6FL1KOnje3P9RA7&mGz5Mj_w z@Iy2ihbtZ_xH+1Gon|}i6g$xf>>GySCKBVfdr)W}n7rQTLO*6j5r-43o#T9CC0Xa& zn~1^S^4QShs*5wH*flHw(FGV1@g=y(k>H#BDEBD14BXUs9Zd7ZXzu1v!hegit#O*SquB%2r@9^v1By z)BmPoqsL(2P-Jdy()2$mmal_TiSw#0*UZ4{ACwMdygasqj!?m$;m{KWMhv!&XC*it zp`c_ZtaF5chK!SID3EP81uTP4Kr)kn2En?6Cljo(sG7uYEFV0uN+pIeu(6@>>zv=P zs$o6rzfgC%!^Fqdw>Nfmv-C+MHCr(umB+dMi#?QtQ9RvJ>}eE^cGsJ_4r8?yC*VmpAnU&gBV=bexIfz%c&{1-J}@ z0_xY9;GWyyh_NG><@4|{A7@QGVEK}?4T{X;hGlMjL-ptztOtMN;VgC6J2hgz`KSE%1K*3H8D-gI6UW_-5tCiG08(!_PSn!sG_ZjD59=P#)K7N}*!ZsLN8xu$4 zqtD#RydC{e&M<69`R{x!q*9wV`jTt!yU>^K5(CwjBYi`~1*+31-vX!*+&9Wsti||& zi9O^A+!j0GSF2=XqB{@E_h9;VT>T1$b{&FSTa=*%C(>U?YXKh%rrf}NKHgYzzjhrOf#*gjnu_HMGS1_tgwvLm}W$dBZ zL6hzUH|GF!=3VW(PI;I6wndmwsTo>wDHvpDw{eCAm*=-x&D6?Go`l*{%YB~;bu>LZ zL2W-7>DSg`+BIg7s=yr^6(vZ?ES|Qo7&{0Y9Lz=1Ma7G!lxXfl3-R3^HfEyIFNjHT z@rFm8yv~Tcf_q3jA z7Eby0bEcZ%g|(RnnCCspCThkx=!Z>i@^$e)#!4WM(-Ul}2CifDgetqq*SS3!!J^}i 
zshVfZ5O@0mrwLOvY@7xbYF;Jyy{665u<@E~H^B;I zeenfE9WGvf***+gSjN5|zaFeYk^4zs{*Wy50m003XCU?GcMoOhPXStxW_$rVsv@e8C)N;EgMt=~Oyq zEhK@+D3-elnSw{(Soh^_zBuP@=oZJa%q+CaY!ssm60^BruvqOU*vFc|=#*4^*Y{%KMp^sf zEAD4Bj?;7_wLKJ|c#JQ{7IQk=8g{{Ah(F7*BaEntc!eSb80aIp$JuTLxab4aYG^fO z=R%=Ezi(pLv+PhZ5XffmpN^JuL59)jV5^8vonNx#CYGdsE*qgMJ?`v*b+ zx^RGMt>ck3CU|yC%K8YFqq|Wg(=8$s$T4jIHJDGUMV|oRzb*i((^#N>O~-w`dC-2~ z!NGFjP9Uk!Si=ZF{|84&8o+3#VGKh{Q4Gd7z5t^oTMQ(&BxC4-n@*AbQjYDi}eU!dxbJI|*K`5Ybqz9eV@*?{tB&xc|Z) z-{R}Hq}p$bi)Fi{N|WFp&4e&5D>lPTX5>Ap5m8ou-08bT*@IjWZG;P}1-Bw{z+6I( zVW;QNV*aQ7Zv*|j&C~`EbPo4#_Sv=5Hu<)jTA3oE+IPfpJ^z5uWotg})EDtrWrHMS z3-!}3XjCm75*bk|5Bu_kj?(K9-@94bmHMe~a+G#0{kLz2ptr5 zy0&**ix1b-qMWtrr}v_hw2G!mN{WhBRlV$t+963%6++b&*fT_b_44G1cr`OQ%CAb- z$K|SfGNTez?ytCg^I4(YS;wp$njDoRl;{If6|bq4DN(6vKp(qz_?u{nx4;*_t#0jO z57g=!mtT_5QK6I2MjGhjJEj&ogO0&WveZ1e8(1p&C@ht_2a8 z42$@|0(z4<(+K8FBj_^rhKTcuaL`$Bp|Ff((v{#rHRl;i*Ls-qjA5SB&m#F;x&aI) z|AOK4(ar3-5Im=!c}_nq;s-?XU^~-;CK~z6rCtPEV+IBCEtu_0XSOq)*7F05XD;1o z(ngaunY6ifaMP$83@_e-B6+pBTX>v`>J;Tu*LR9aQ<)i2gT$?BPDa$j;?~v6#wDp9 zQ(ZH-g`0Tv-JGsqiH7$#6PnW2pdxQq7COSHBLw_G%X<@H4oPH1wU6vdcUuq?Ox60X zj0K$?0oDq3yZJiHo)GtEpg4TtCr&qfMApa!)2EkSItq66)T-t%PbjGA8K2Z|Uxr1i z^47i-DCc$sh9Ajx(An+EELT^SXu=FHf9qg@mcZixmjTyqG#AO7-)t=nje?#R%tl(! 
z@p{B!%Ho(69Mn82sYzG1NeCq`mUH6qI6q(!hcWC$N-Q3mh-x8rt zwTOBYQ}Id6321JrWq(E`sE>I3{uben(%iUeeo(}$$un$qw2RMVPhU{}xGc_8Op^s4+xI5^{+Y#@m(rNGdK2cN@1TV1O0x~7g2Kkui*bd8C z7>T%^CSp`t>!=q3;G|;50P_d;()i<(@%D4R%BkXNcJvGxTEgjY#n1nnd0~1 zOP0PqS^5^SsIm0X=2?7iLJ-X|&*OXb4N$D>U|Vf+biy;tyUyE%!zbu2pTK!tXB`;| ztxo08_s!ooHeF+#vH+nHyTVOJAvlM{n25b~R}uz1>znljJQ9Rbf@;fRY`R7_-gKx> zvO|I0qRV_Izd;lj*CHsv78XHK%4mloqFWNjXLB@;FNcCbI1Jq$F0y5rE}YCj6p$<& zsRWRwWqO!$lZ-VECn8qE`t%ovk;GPv3A-43Cmu+kI@rsP>dnIjIW&{dfPyxMDNv+3 z<4(?Mn0(?LnQLRpkf6-GJFvykcof0!ukf3IwCcX$xsW;Oe=QM|6YiS!yL~{8_&2`k zfNpV4d>+frQ;V;S_?Ijp=V!PdmolD@BbXY=%cvl#8_mlYUdCFq%k_)#h&n_aCkX3# zfo(1}9cGxyf}rm-!`S9RjDoPPRcm3Q^Lc(o)>Rua)QoxpWsm^+IgdJqGdcB4K*DOEqb`GC^M#>{={mfDptlKWU%y6 zzw48ptr8D8mfA2ht&Mu2S4LdbF3#-KPBUw!l&=Car==gGpKz2Trp28 zA0?HJw8jybI51DKMQ>&=qM~d0anJzO#31Q8ER?VA-;|lJ&c4j*xOv!z{>?X^&`sT2 zl-60zycK61?Ryd;jsbMg!3dUt!{#APH69nv4~H!U(=!ALxD}pRcnU2?T-)M~9AKi% z(ZZsKpgW1svSdhj3ZX6UHS}#eyfvit0Zf>;z^Hv2^}K^R1^&3~5>cEmosd8=x^kQ# z_+_(xh40AcHVx~7CK{C=He&$pFhp8y+_MWp2C(+od{%Cl4ez7!f9guj&PJS% zbxP4S0U8aM6K?vnEXI;h$4M2VmoN0-<9Kpp5g3wW37c`iu}^)lDWf4i3CzepiONS> zcOUeB5FW~ky1A)h;110z@Zv5D58X6LqHT{zxFTk83iRFX$uI>$R?9$YN<^_c zGbAtXG5J2cLyLa~#0TO-OMK*G6#KY%!J^^wr&N#q%tzbjPU2s@{F|3AeBw(s8Bw!l zYa3`5^Q|Smqt4X#Y{EfH9P;CU`yqf?av1X+VfrNS(q0l?+uVp8Pb27M10?+Sz!8=e z5Q8PW82c%H_zd1Yw#5T7AcH|*urKoR5^vK@2r`+0CZr6aebV9k_beG61r76JwtgJask384X&=Kt{)0?uAyJg&{eL0 z)<&wToUyK^;pyI?nNoI8ZTp6n7d(kE03op{1=PB}m>ibG$if+TmcOI-xXlIaFe%-zp6$ zmWRS@*#q{IU|eF$t`bX8r#7Zk?e7oft2u*0l_PuEvbXGm1|VFM_`SB6gt}An&@+}q z%%_1cnj{Z1$-ZC0RE4P2hLow;L%uf4APbj6XihntmXsrGIa0!nUXiw&)UlrO#>#Ov zq`^1Za)O*_%Sm#wC8yYMu(wyS}t!?!6BhGt#lNi>2fAR0Zqnc3bRm#fP#+0 zjaqo%MaV5A5t=KY_Uw9z{o|#!DWozFxH#)ugl~zo8C|? z`3qaF08^uOSg3snK{qT3X+Mvmt-5hosE2Z&nr_at78n6(W0IeRZcDDA@oTS! 
zqTwM7?AnGoV?*Oy>EvgpVT-`n38A(DDH%!5>LNTQZ`T2Tt>rz2L%+i~Jb{OVqpXRcb_s;h-Ye3#Yn!hS1f}M_G@|Lq2*}LRXl@jK zAfSVh8`V73Tz!;zZiEq7>PhN5ScJ9WGlpDk^Mt5iAw?_$p#)Oj!eIf6+y!5N&?QOq zU=sQzdqrMpEH_3%3E$!eB#w@l9iQ*CMz+rFbn)n*yn`@C@2TNSzH<0PJ0m`juu1Fh$uwhJMDmo2gUhB89^a zZ+{J@b6BU&2iw^J+u87v`EDUp9g>xu1*H^IMLEVGRUGL*W;_?_$svpi-QmX$<3x^F zn(qeIoBn^Z#6L@1!9R<#U;K}R`JbFPgrWBz6zP_@`a1+|JxC@1QwI^F*QAC`LP3XU zkbx4ne9M5o?U2CBp3tiRVY2l*;XjNC^RsLASmGM+NAN$iF>;RhJ^_~dPc_}!RDcCu zzS%0S$Eah$I>W`+?yWdSYjhUg}fXhuu^cMm?;!5M;i0HZa`3$6hpoV!|hM$@(t zro%p14yS{O6GjCw6in)n(kLIEZi89{3rI~n4g*kgz~tt1lnP5>_2;Rf&h3h6iHbla zK#3bf3G~idK2TZHLY))fV?h@xR%&Tou$*0@CuDilD>JF>ho9Oldci$@@9y9*^kJZ! zzVqo_7aWLwlc@I9f3jZt1Ll(%L7UcagBUoe2yV0<7NztI@7*fAMfXRfzlm~=*n-Aj zhlzpeAM1=mN^diw2oZFS&N;p@=Y*!5sA-}UkZw(yPdU|O=$}lvAAmG_l9A@ziPe1< zkg2;xKhWNDp_sn}yHE|7v!{T_*Z}k8^I^Jt7210(xTiNjLViEEq&p!J-$RBdq~EWI zr(k6Jw0KuM0|U-y!AyG&Q5XIsyNdmCns`yp3Bfj(V@rnK5{&3=>W|YyZ4%qDT-~U3 zDMGSPAL)FwU4-nR2u-?+J=sESwvc8(^x`Bs* za%0uw`5}>GiH9xG;5#JpEX0iY9-Kt8SQxy(m=ANYyoxRm+nvu+b<=eVFj^kQp%c}tS*L%PkZD&yC80@yth zarjMw1JcQuUQ@()oDBpziHzK<)1z$x?rs845tU&iV}P=uMW5xF>i1q!C=HYKX{wCo$|1AgITs=G&D&XzKCHQ><=g-jLxpE2c>53!0t=+kB{Ek)t zC;z(RMI;yU!|$(38*R!|2??*CGUo^8a{n9)iM3-!$9~8x0W&u3;FN(19mq4l^Q0NF7AcQaCq|{Bbg2T zGQ*FXru+~Je>6{j{&979Y`RYMKRKM^HTqMTtM0lJ21s}28M)h=CuRg2HvTgFNS>CW zxnG+-y9+46F6!{>>5!tSaBdm0D7%w^HSxL z=di?IvAyT=@@wXSz_37`Z%K#?!}3C(gnrr?@?v%e#ybTP(!#LZQz(b#Fu9&hT8>`Sv4hEHY=7ubNdmro66xO%G#V0qLCfQWU&T-fznt zh{z!3ss*2?_f?PG74G1c;u`hlUEynxI9+YHJA674!__x;hs!!6a09f-ekIPefxc|D z;cWz$yTzq8&Np0kLFYiUR%iYe^lr^PVSi`DDMr8=Z3ba8B#kBE+2;|tn@*0<_D5~` zTlrXuShf6ekK7C4q1?xq%~|Tzd%|5qzmvbW#QBrdSwu=O!U!2Ou+pDtj4<^1WcMR2|J2|MvAFQ38NvPkQ(lml!71-#%0 zW&r{EFOceR+O$Ld+m>JOqc3rRdgQq_0t9@mcJ%UQ&AUUKi(DYtzmebCP!@iQv1-xN z7#-8eDRB4BULUmKGWZ7z@oRy#O55dfxhb6<)@9qSbXSJ$%9PKjZacy!r-cZRc52vN;dbheJHoFiIBr_6HoX|0tq$Lw zFAxF)o|BC;mf6ZvBuR-RUGnWt9dVlZFOHaPp^hA8^WX9i7=2OFrjgx zVvgeaYuu4lcpiS~7l=-P_>4I&#Ho2v9gN^FsRM-~aa5zPOB8E*G1%_ogfG 
z9HW3M%b@$73xNwD*Hz4=#MOxfz)YL+Vm>AF#+&k9!@SputCLw>bz)N;N|hYBWNz}+ zF9cqKlCNj(8%%D=HFtEDZOn3$&H@%Gmt-*VV}cw5AGD8Nit}5VD2egZ94Gy-{Z~? zTz0I5y(H18Pst&$;vYM*EnFOL9G++iWy{57j%8pPxdpP?{;@T|^r&U;YDMzK1WNrh7O6Z2-#>?_1)ZmiWM;Xav`fEC}jw*5`w}AO3;< zYd(c{SlsmX+hzDUh_FcF6Q;g_lW&Cd02v4B^mBZ6NVL_tKQ8%J6La0s03av0V!LtC zu>g864kQ=llmsSMpRMA8-C{Z>%nYqc3ol@^@EAX{!9<~7F2<2C(;YW+jIXOXCKrBE zx#l$H>Ml9&dSXpsDROFQf9QH@wV_*cJ@e+4rghIa@~x<~3i52=u}2!j$#9?uHMtS55nUc&?shj3n_Zumit|bGW9j`N&Soyg*>fS#5b5IZapbz z(PPSeo8`g*914CL1)ER5#i!{Ouh`I6ccVal0 z5~Q-f#~(02e9g1teH0D)SS-bgpFgGhlfs9@f+^j1iW83@6#lGL*Rhr)?+3W4>6M!c z8a!LMv>RNjp+m*k_mZvV9UEQ_UOaoW1U=Hxt}0+w$w<}I)USF%JVLv%s)xnWy$Dea zI? zl%1mXsB*T80iH*F^XV&T`A)I=vROOD8t7fG?H5SZ5Q8I+qrPMFZHm{?E(Wc(c8WS| z+VKBTU+M!VQN(gBatqg$&cIWX^5uG;5LgRnHBW%`H{qnR-1`1uaWcPdy76`2ElvTe zG}m%Y&Gj{i({_msF0E<6as9^=;bnSU84}<{(9~Wi{2}530*qQfd%(VzjbkmBLJ9C0 zjx|z#yyX}!h2Zrs7;C12K%=l^kfItWydy!*myVf`2EuwE6a~-4DEEdm5w0t+60ic~ zIuQ!aPhd_Q7Gr3Oh%NCGOG2C)mi*!Nr~k)E;jyFxNv!l9#*{pMRtSDgR?Q|qeqI!t z`1$36iP-Tc8j)(RC8Iyl;skn^YBF+56b{c=#{&k8e3M;Fmi`mDo0krOM60YKP)nes z%<7vV%P?$^*@)9I3D+Z#l?SoQi`=a}m6%Pyb&6u%)|H;coP%iy^@VUpTSQUoepbwi zd2gm^gWg^kOU{cuh6_|(^HeyuLu%JqqF?ECdk!KE9)2S?Ew!(_{8YHNo_~xe+XL}v zvoVLDx&$e_Nug1Vd(G)G2Q!Bo@aZxa`7r;G%9E`)|Bz}8EZl?H#?aE2V*Yi2J+ajD zkGpMaVKGSBcuc87l;3;?s!jsOTqZvzkrkc`jAL4XHPJNn@H@ z^p**=oQ!lv+U_7bCY$yHn#O*gi){nDUVyueaX?x1Brtk?M>mmYz?Wn~`;x?KD)uaQ zu#IipDhhNOyW++zz^9=a{NMz=UjeDjn3VkREd3!6U>1*Kg8nUmFtB*srHVgWd!dw2 z!kq?#sx#SL^};jZ-ecH9ihO1PCF>+Rqvbpg%rVN5$PoUFbbovB2fGB)(?J`_d^~l@ zf|PQL)@^AR|7DRGs!_&LjJ)R88|ZSq=N%4(GaLiTeAy;Z_Y_zZ)JcRN@xAb5^{4m3 z3)O)OGp0A3{(ks2yG?H#>LVcA8zV}faXb(t+EA#mj09tR9Gt=o|1vzrTZ#TxHC*^* z_zsz<#$5jeX8sEB09HZ(zX~|R8qo!E-0q;}20}JF7;@JkpvJ~Q7CRHf*<7&^!p^ht z)j6Qkeg$dlxk#OdQ}zNl$h#2e^Ti;ZFTn=C3AE>Bp#3%jmAMj#%2hy8u7*0<79fV# z0HZ*-)DG~<*-31ZJrJs~r?^E965Hi4sBn+O_CE%{Q(>F>m#@R$Yu@wZwin*B9ozeM z_58}Tn7Xk`c2M2$Z8+4B{%yFyu;ZUR6u#f2PIEc^GRWIjzX>@#G|cyO?w(i;A?)f* z_4%vNGMv`MNmG~PXZzIP{Opiw_i)VLP#AXBWNMc9^>JsD;q3S9;z*Y+VbivfGv2{W 
zc}J+vi#;fzkK?GN1(sMH1s&Ku{lg=Vq4r*)fToS@ z0$7Q0i$PmDSl}v)(E9R~ohr?TbIMn)t5Q3@foe|4J}6auHWs$BcibN7jWew?v^JZ{ zS7lc^Q7Kf(*RE_`cV^Q&UAHRh=4!3JdlXDeHZ7~g>mGEL>4D3B6|VDliFgaT&YFWK z)|m1NxXL?KLJ9?g-8{AD;;5}Z&uQ^_r)0jtv*xG+Wxj}d^lB%fo^Bm*GHo0PejA&q z#Z(s+|3ggst;uZGcC=5me?6Kl`>W4O+#QZxwk7h_XJ0{_#MVbzR1edog06EE+_QA* zq_;$(gV0NX+9pFG68WWu9GNH39uQ;H@FzX7LE1eDGH_Qp9Ry>F;}HNq0!BX6Yrs>! z<&!9W=`fS~Q9zy1FPH^yT1gdeETyK;y+?h{C;S|GG59zK!);PVrPL^tlBmtVnhzll zz24FhSteS{1Bw5TDT!E@BUs~nW@IpXu|zS@aGuo;%Zj7Nu-r8UlpW+P~3pQSFD=*e@#%A;wd znq}fJbBtVRQmtPR>RCj!M7oW%lS7lyvV&VjxEHElF2w*YwbXqtM{@PhCt9@Th!urH zxmN(=)SISNedrt)k|{XR>Z-ct7l>D) zcYvui$*f>mFHk-9L4!QC6ZBQr4Ud>5-KIr(D8jBhY8UBtk2_Z$tEGcyO&pZ0*{#=p zS^<8rs_wls0GTG%+_9TGNE?A?AFHw0^`sb~c3c`T?Q7ejeJRkG{-P8`Z%BSOc0a7= z6;+Pd94x;EC$MP=xK-u*oTOwb*SMH7tASmY);7-(otY}L zvA82P)-jw{Y2N{(#AqAmEF2E#a8=`ebna4mliZ??Tws{@t=Fm3tv-x=6RcV56ChEyzsv6GZ_R zq53-70;ZX6D6@6YZDBSF2TWV?kz(vB1BC5p3ENVjeFJX=7Zd6tHEF~wi-1o1 z=XSdvF&0I@Fm4DQt@;4O#RmqP1LP$ajJG(DmqZ6(FN7Cc>j3tJND*=cjqNUgs&{k9 znOs2aleyUJZuko_rW}rL#6nE9z6T05{h+r=4sTC@$L#4rAYtjEqaV-+iVz&8NW8$m z7TD^D^2`knG~9o&^R7ouR99W@oaCR3-G2&7ovIF9?p)%Zf!mq5oyFUZs$sLU(mxm7 zoChGzSKY2~)@h{Vy)QciZbyr6chQ4Nv%{gwwDPs})%Df23^LRbA!$W})hvr@E4iDu zeKRHyY|N`5fsUwGcF2%=p~M~aH@)cjL}pw~%80_+1iM7X#ursP4d@*$K4V4NNVqY;M0wPKC7CbmR9&n!HKDkf7f_C#wwLn^vrDd`CL}Y^9xl!a2)No`B~|(KH#VaNPMO6S2i%$%&$- zO-R=h1E}AXPJfF&7-3PPK40R_QX{W&`l!>da@wf}u5zxX7skF$2E#**tX;VhM;vCN z6x~$#!$?$}U+ko-Ln|Olm2m!B?#Y$8YF{~U^i6ru7?2W(fhVbH%E?v^vU|1jMpbqR zjA%C~XR`X&l}>B7tGbhVO*!WV>fxdXmeue|r>38VEazP96gnC7S5mvOYV^vgRaG@; zWuYqf_;S?2wZ5Qw%;WN_D=v?=hH^kZQ_qakLRmS@oJ2y~KDm6Pn4`)s47F2**9Ht$ z+#Et`|KDAaJTIys{v4=i)>o)(i(-ue#2eQ{72V^98_S+ms}KG<^;n6!ryD1-y6p03 zE6bn;{q?ueX5TihH1)zIPJbwno`cXP&CbfG1^^Gt7rH0ugXUE?e%{MH09_mWkxe#s*~>9BHg?%7;5{b~q|B8zwpwLltKDLQ zn{o{=*HVsuJ+#9;0JdL{U5&~t z3Fn}~8!6YhN!yU?@K9Z#F4*V^6T0>=FlSA75YOzQ{`D1R6gQ<1O``f6rt7Iq3i6W0P&)+Lga=!%Et5T~)@$vB9bwjdf+ z35fP--Q>d2L=`xrP#a3J$q>M0r|`^8+IO-n!{(nI^x3D1) 
zOP`FU!CfQ+0qeFh5HOlr>H)@*d3{3oyMfminybCNgwgiDL_D$F@Ee3>^;;iUM$nhU=4WurO|v&Kg?JUiZ-xh{TBT)vtSldv%yC-Jv>LxVu9#Au;9Q_xIuKH?S&Kb z{uWMh+=YnZY2NG<*$@HdG>F_ic#y~OLC>4|hPg^abTk&g7+_mtb*D_z`X@Wk)L1^D zhA7(#9Jwgjw0US63`LVodl9b){GKT1$5vb;E8PGb)6ypZcel&$ku z;DZW1Jp7;%AFzKJ!llQe3S;pd_6{=&aEvIH;Vm5O^H#UDe$_;H##@U9iT>So8U zp5EbfNdQ&ICHhW=4jZ613CRZNG6k`e2o711zo7v%hgVq)oTMF`cPO9uH*a zw1#I;;Tu=oEk3z+RePJW*~WBgqdv<>Z`~OlyDSDZ&?aXXH&_=lhF=AIT+))QlMY~@ z!Z)Q2P=D$cPWOOxSgtZ}_>I)>B^dcW;@JSB{*B0o2-|e^^NKd^JcZ)RoHM1raIX=)j*fij6 zAjn7dWNaDf4~PsubZ}s76qWKmGh3CD|v_3e7`eL)_Cx&C|8H4j+B2I*<_+o|_1S*6Z6GNh< z;XAUcyWlPtIgD{~H{9ctE5x)NjD3}7nj3}K`c(7a^I-B^>WR;=xB9+jBoVV;YY5^o zfBDbrMn_13La7c{15t>d^wCAD))VoR$oNRqa6;2E06@V#Zdkq3xjIj?ZG$g5*=8Md ze#yB#plAP$KRkKn!#$u&g5Z~*xRbO}$w=u}hh7J&PAy#Vo=?2*M>qfJ6Cd#RLq7b7 z>?>U6G7N7hX^k2XpYbv5QA4yx4bdJoAil6@kNTDPnz?ZaiEnuMmi{1NlUgXg=k38D zB=kQ7#bKb^xJY~{6E0recr86?!_iIt3sAz!3+z%ujLaL7ev5H=JBjy#685IBMMYR% zr2M>OLJW$cVZDk^!VvoTV2lcJYSgD=^jb2TMzHVsWDf0Lb9u?*1r%Y=gpBG|@zd6< zs0}VK%f)Tmv|(TGeB{Z{GhO){tn84Cv$5@&>JVg%PNq6DZx>$jd0W6NUA3n91l?^} z6qMa$cT4tQTZ@?os^%ftGbnqBA94tZ-T~Q1eC?BckyAiZv!4Z4O#9wetl^e7oi^^^ z25{&KP5>bZMp9O1d*okIT{B08FZAR&zE#zg(`u_pCZw&ZVlFlO{EW8h(XXZ7ZGn-t>s{voki;wh z;dE};`VVJ=R0Z!lZC&EohKcVxH_5_su>{<#5#y(5r$^vQ&ncTU8~mn|#6nuQ!Yg_? 
z4PLSFr~}pMowLj3AX^TGX)8oB=OJdO9L8{=Woqwr8SPGl0yPz8ud*R`frj#vxX|tu zMz2hbkx+-A%_|k=VM#ktPO@RmN}E@{p2knMh^^3?x8)3N>q-xnH}D;_=85FzLBmt4h54clBx`Y5C=gIAW< zEGwR}Zq<^iwKi1i*IKg9mi2NS*ey-(N2*m%1_HTqJs=}bCd5vWr&{tf+`wb8+@NYc za?VwkUL6QGT1{4s{<98F?4?0IGLxWeCoUgIk$Et_CdG8^pnF=7{jN3dCyFv;NI2e?@PI6(-z zUKO4jBquEBVue_J^C#7~V^P1*MSb*dsv<>Di$s(CD z{(P8XONlk4s$s5FH7oF@LY@6Dr=zaMw75UcpaN_@U4b$W(8U`(-DxbIP8wICg)Fq` z9SdczSHe7uQCkV7)w(xpbT!t$o%V?)zzjw;LjV|t(;TL&A*w_jW&;?eQoRTzd%sU2Py&S(T@L;z8+vzDQ)hqvYI*#XHnC2HY)>#N?>9Q7G z<2zG6-&HquZ5U5(xF_IQo8x^Dd#g#se}6*wCa3jykc~p9Z}# z5)1jrbvhdD!@hk6QUpQw#^!X#_ZyPU;UJqbHsU*z%QTF+&ZLs`S$Lc-TtCo)^lT}= z!NfGcI0rAOI&Oe*0%`#;gjHAslp)>lbvZJI7(E+}J*U~g(=9irDk)fYp!vY_ooPfa zUv9)pQ{%67jpwmOwe?G<3_ySNOQ&OkKsN(j(pYRsvRL|w{}qT%3Wd6WwM;d1&M`B+ zncq~!{H)33qK*YS4u@t3MMp#lNw(k;tWuJ}kt}Eunn&QJkujd+#y6F>xf$Hj8atSloXN>mNd7QivNsYu6WvDjr@uK_ z&02Qlf7r4sj@>eE)2N!BUiBj_uh|CvriZC zw{M(LD5BlBPWH0Yw2A|hH511MHhPA(5Ea-Z&B&4&e;p=R(+oEGIfYEFR8=;qFxw4} zh~H`S9H*qN^SW-97PbDwbxx1no(Am7=rn>Zgkco9HAVEAxY4At$h(9MNO zV0a-gSfqhLHL07Ca$`l!)NS)NZm_0|i%^F7GSxQtpE-~s(^9brM8abMEE!X!M%-X##8 z*%yf>0#k}W@3h(}V!K2a%>sI(XeYv0XvhEFl|zbJ!_&SC8dUfyYeMqz>a8Ld&eA-U z&E8c+Mcdj5UGGDc9+2ayu#&G~+j|{wJ%ID@Ef7g>M0|x~+OWrJMCcUrhIzCSsHsSiq0$8}l=+zK-S)-RC6<9v89h5~tm7_bLrnc>rRE)vpK03|NHqO(kc#i9k3 zav$P__(hc@fp11^Jg8Q)RC;46ao^xlfj0_aPQHy`U2)&&4AN3}c?@u*c~FizAI;~G z5s@&3n6<)BQ)>lUnW4-dgrk>957}~q<4MO+9qqE(#q&7?3i)OM1P0jAo zPZwg^%}-P927BNK5z?(bB#Lmdy1&ubD=>Q$S)sBIJNY1ih8=c_WvQw??3}B6qJwTr z2;yW10hdG2IO!q^Wmi}-6k>uFA%JcVxc)2_GeFzTmc7M1*+-lz`@%IzKR9^qFYb{8 zVB0rH_m#z@5z$<=myx2xkb!C4bJDPXQSTbQ zD$}94mK!LPO(y{#u}X;@I|aN^ zddUFA1N*8Bu6d3ua!#`%eaSvHmB=S7(M1{*R}RO1w2~0%Cnu?=T#-(R5iooharL!O zJe~wTd{TzMLqId(B{VP%UQ68TR52$o6<6Y>1B% z-XQ>c)oB+5h z#%(|RPJYug*JqN1Q$@K38}?dJB-b^9IgZssFna*zW8+j@<;@Yh2B-=A5+}>)m`P{A z%Ii$sU>>5KlP3y8KbaGS3u;{Bycm<*pgeH9YBs3l*r4<|Vm>veC_mu1a9(KA4m*!N z?djY3DHy%!?ZtH&)_e$AKEf;*w*~QsxKhX)Zao2)Rs+2`-d4Y^9caNVLlTK94InGA zD9EFNxj`N4OpQ&8L`uf*6%%0fGcY|Ty(^C72+Z~DhsF3Lb|w1JBqocNF3F7Qg0x8M 
zSn_?YUFzR{_)?TT33h#IcN*5#lwD$~3t)!*3h2#gbI-|u?*1zzP|QJn5@x)R=i#t8 zABH{`VT-#MHhq_%X_sQjZGtlY<(Lbb;Wh6H_=~e1u1N4n= zgs$;T;vspncwXKjUYECucai>3-fl?nn*g8r5Q9B~X_Ai9@B}@b7>BVwd44<(;6DQz zCKCriFA(U@VyZocvz7#UI`)o2VP5Jk9q6|cD@e^)wkosc?hia7g1eMZxWn5rg`S1SFA5=wu2UP6$5Gnlu2;}2pxqL!*h-le(2!3A0*m_~G z5QgC=G3cS-@VpLm#e?Z%XfSsC7mXeN1tty|PLw#liInL$i3r5JV{tr)O>x{1OmQ1g zoGFBGVd@P)C{;*x5<+3&*4k;ez@w?&GKPSL;hK|}Beh7mK+j0fGbtt+)CqLKJqa$v zJOpeeV_Y?C;z$_SOeyS6f-6HGGX`CGt!dD;4ubrXXd|D8kN5qJ`*#~GKf-}DW|zie zvS%Y4dZHaE#v3OG@+a`YMmCZZ3H>r0(qp^qo0D5S| zrSrH!B6=C*3RwISqKRs`>_~;xjYT4sw3wGC(l6ZIQpT&R*;KB zebLjJ#!#*9)-9!HY{P(?f+3%Vp%xlpAk&l4+xRa8(OYkHA1O6_U1ZbJ()E$mvPRvtJ~CLYQ}6TV zRMq+9NIxwQdwNG;#igBuxU>`9!A4cQI6GZ;MNKWq4mNB#IWom9x2g9}i(HT$FQ{G` zpE^w*F`2Vq?g0_a~8STj^&L1JPn5 z%R4<7z1P;ydgMgUG1BdaIz;!I5jwwXbcp%lsZr6}wM!;dKPmdUwnxc4bIqnRv7pXW z`wxX-k1{`MsW#K1>FSYTkxaEQ$L(#nc5?J|BSmjLBYLGve|Ba4kS8y|XBVjZM|$*U zKK0q0sK4QzS<$T~^O^IbyB*!AVJoA3ji|BJuIfuK#Rhe$r80NLa`Prvtz8Dyr}6bh zjl9#y+Eq{jD_^p*O6}{JnT4QJeoPB^VzNayM(OB5_q8Q(Z490dl?9>c9uR$k5P!0-7}=ExdVgD{n^2O4aaj2+-X2tV+@q=RRCl5CX7u}_TEniT zmV9ff1je$A?j1HGe}#q(A~XYM(Q4(#gI?Kf&{KPFMleIciBV4ef7l>(V6?Y`x^#U; zOrD|k-Id-CG zh_u`(ZPdu}n*4h8!*)AcwY}ZWkMv;X5jIG#QC3n#T`kV=W*ym;W>Y2wRvSA<`^La2 zYQ`OQZ}pEa5WVHit6kp4IZ?LB%HF)OP)~`*hi4W$G4*V&9o$@=A5&M~Vc(qK+#`xZ zdm8isyAQccA)IkUcLa`IqjMW1OI#Mh!J!t^yR`+4QA#VuOfA9O>|1bx4 z+{tr6ue!i3YyOTp+I*MD#c5z1uG6QvWZ#lC(*cSLnfhTfl{*Ejq#3;{nYxvyy?4Ho z2F(dNaOGnl1GIC8&kDa=aNqSbYh+*Gq5V+yz-Eo?Ct`_YEBgWQq7R*>R^lbjX#V57 z)_B(uX#l6aR>60PPN6cimWGRj6E(2dT7O}YK0JeLfPs?X(RXiFvA+>IGs))3Pgb0w zn@z>;WIfcE=V@0+Od4P&BGsL=L`&83y=!0lvJb|u4=m@j-5c!GJe;fQ(~~@TiEI5R zrjn`%RhmOnENUcIF~>^2Tor_@t(KS7h^J=U9!$x1-gLer|ZBIxXa9=!a}2H$`*P0iW{_v{WlkSEzjloGh>I ztt!vQzQ3_grlT{vX3bJlKaBKG%UD}gQM*?C{j!W!$J+Cb+;7wwgJP}v9jish+VC+| z^l;JiY}OzshcS%&e}w@08AU${Zy?F(XV7`O;x;hZG2Vj+N$`-m9=$MJiSv} zab0}Xy1M%KlB)RXwYBT3E2}E8ulI+q9a*W`l>5$+0(x-PZfa~f`cP9_AE$Bz^|C846Don9#OrHyescuV;4N-L^0=Aq<*r0vTbtrP>X%%t z!W#79K5FrK-a+b^-rA*0tJY!oU 
z-e^FZ``&m4a5cW~hb;wmkI;N@;T&LmOCyAu1%`?XG?1HwBF+`he8-EJ_Iu2cmsCoA zh$_8W(Uml%_B-AV3GH<}9o}(vi;Ve&xt<1**&ssO55I26rcyM_ct4(kt#&H5@aZ^9 zW@5{q1+;%QQVYNXH@<>KVA~rE+`)q`bq2ma3>~EBa9|Pm1p{eTvUhyQFjy|ImyE>S zd`kAL`Ctq`(D@BT!Te*9e;l5|5yD}#!ws6&(#o3jCj`5oNtA{fz)`W7O2x2y#0QLh zDp^0>AA>a%PI*Q#XDqvDj<|B&6?2M(lFeodo1U;;Yy(8b8N*pB)`J-g0hbfHzsqop zJ37i|1Z?yu4|miV+ba?oB=xgs8kYTp7{EiHNsLL38xqQVIv3k5^6*hB2a%F!OeT#B zc5-XPjzL@_vw@MIXUojQy3ZqPS59!y4AEa7I<#TL_16JXeB5(K1r;H~9$pBZ*2Q{!(Z2*9rem(oLd9?r6|wfo|}Mz zT!z_vxo(=_Ddi+v!KhYjqlw|$^p34C?>H=+j!>D2xRZmh5Nre!(hSnCQPHePP4hdy z+5GlmYH(SmKBI~l_n-;@!C+|_*r%Yw+5$^!IC*{4o29A7#6{5p<8AoP>!K5kC*iMe ziAGIy%#7QkFPqeRcSZM`)b9JDxrRvop9iCt7%Fu6uILG-lqa5!mTPAU4TY~n(+t(U z`rTgnTQCE*xYcF<@+DPY_kNq6(@^_X^iET1>wiRh8@IyYj~@Lv4>Wq7PnAvcri==| zjkZ&NAC#TdaMy>?PCm^Ex%h80?@5EPJ^zNPvbUl=)TbYTuwL|SG_9f47tt(34L?|! zG3RCs{mm{F`dwP{+E@P&-D&(J7s*&=I}MK^@mP05(68zeyTiD#itXZe~LN-iEB3#*M2 z;Yy;{xY*@}1M%dv*h-J4{deE{*!JooA*vgO&yGD8A9pUM2PgMfEE8-@P~Ao%fQwsFvrzb=A3Io%}UC;GBoR1s8}d@sFA@!4$nOFp9>G3QqYzl$*w|KTUmzGf zaLG-BnfjZSPqFDRC_#LpgQB=$z%OG-KieN{jMW+T+2Si=b!OaNzA`pBU4!+4iI2bk zoga!Ms_YggPu;&Y7FQp==Fe<6_2$_3hH-TAJ+WW-bmrCjKKSIXsQs@>-WttQH;%~8 zZh-HPw_Mth%ZaBv@rJr&h@EGgaqN9G*0y2HuGm#(g1oUOw$O~}us_DWFg(6~uf!}f z-G{##t2R8MS3VE6ORTS0THIoVj+pc89~XX-Cj|f6qo;H*UbnP(b#+Dg+R9?)C+b!; zr($VcO9NXu#uB%ZJiCpT?EygXHcQ-2J`SG1y?p@QNblzDJw9=-Pu%Ad_k;ZQiXDFB ze;@#<|AS0D6o4qZfvKIA*hLx(66oEurMQg`9}QAk%iBGc*h|`K51;G{(svFoe?S1z z0G)AK;*X34_7pEqBPe|UxW%*7A%BjSKUv~=UqZYPz~;W+C;kkN&y4Q$TZB47zP~W_ zGMoKBKJiz0qzjS_&z8OF$Km)|0Q}oGEb%6P53pHp>DW$8z01@;Ea;F!h320u?1La; z7k>nQbpdRdpYZZ~UeM5vpYjzn>PJ(fel#`eBjVqDMvZ!(_{tJrEA4Cg{y^rLq0BiW z$0V`ZrPV9pba-qqvv+ocxbG?sN@biG_=@(ai_5~X3`Xl2!VJ}j+;@svb7de}6fam? 
zwPv06T@3E#$Qh%{W{qAne#)rP^M=%~U02l=%=X&Vv|KG;sTMD^JE}jt=WWlJeLYkp zGqcTArP8PFtxUI+vnrZ3v&EDkp;%cvw|4CcXoeSb9S=L>_2p}!T3(+*!=)&kbo88> zV=|4bt*Ng%rC#;?t3O&?IH`8+vf?FcYs-P8*TJfLU42z?vNSyr(b;h@snOP@o~7zQ ziLd=AtHKlkO6$0voBEQOi;t*&^%ZuXc%ZI))#{a1leD$3lO$Z%HktLRHJ{2>acjA5 zK^>eMB%7A(Ly}@wXMUI->p~c{tRwlFgdP+*47k;6RxDQ?xH~v(p}Te+)bhvvj4qtK zmQ6gOmbv;NYRKgLNU>to8|S+68i~YxpIDvL!FVJe>ftRY?404pga7dW2&;J7#lt;D zKX-QGc%=|7jibnKrcTnj@9XNTR~C=1S+{CtRsG%%bQxde66P^d%+~oygTq!GGqrK?*I}`P zq=gRzy%+R|kJp5--ZB~~_9PL!`0Cm^?O~xu-0UkPvYVB?ynKC?Ca1A}s^Vr5#Ze?mo@XkF(}KRPN4%`Y z>`MCUcy(R80v2+bGDmJ$O=FX)s;Q__-+!HvURoD#+_cb=O5^;chYg_kG;aaH>udzf z2vYA2A=+%4CPKE%gaHfP&%0}6So>%Xfo8$L+0qepfHNXx(&#&3!>3>j<7mTmdbW*t z%#aKo5x_0amaSxKSU4X&axG^_YO7=Mt!>!`PV+?J@jq&HN`2FgV9RzA9Ecj(-j*F? zN6m>zv?s8W@T1**brpvh|EItox8X<5Cq<`tiqBC&kwSen7&dw|^`O*ZwAa>wEmI>q z+p-J$n$HM#T^ZD_P(q_J*fd)(WPRhy%j+gqGV%a#O={UX0@i<}tlwZyh|(l6??rPxdpm zjk3QSfN?oqxy?5k*f!a+6vjvZ!eGQnv>Cobb8(g*u}d%q+#~&r7Pp)Ugu!qI@$=bG zZA1H$7EQHk|6Ts9tnn>RUoL^Vs(SU=*BRMPinCU?x~igjX*F1DoA+PbUA4N?6IOrr zxU(1!kD`+HDm=kmpw=GDXrsRG ziB!mquzztkT5|vXD5PVVp24cahrR)7^xAMI<$f;OE}*>$Ku5P=yxM=Wx0ib1I#(z4 zr$^IAkN3ke&79rvj#1F{xI29Fp~DG5E4Pt)vzk<0Q>FH&$s9Gfqq}vwU0tWwke>0X z>@hHlXWZj=)b$lIr*n!G2RCQ<9`Q9Wof#dtZhXzuwLl@@im9Nh+BU(722e-JQ^J}X zgB{f;ZeLo$UQ!Fc8s#h55GZ;E-Tchu~>%M(?8s^OXuV7B&XER#@Si7d4Fx5n$b1Av~zm{uS&gV!}$1A z0%>hk9i38`VTK8aI`a!xKXt&0MH0s!jQ@DxA*(3`rHn}2wuM;lvpM*8O6b~Q-l#o zldLUMwQ&tG3+){W2GSHT7H5Nqn$xTqLt%Wk2Khbk0a|C8;XV_TB=n@^J-bDO-nv8K z{1Eu32(l1E@3JQI7mtlrAXHS~@H>&${7I`)k1X|?EI8#&w1qZgi9bW{={Uppigt_? 
zuv5f$i}v#w4xs~M1i*p=zO=pYEKBu6Di3yU5M4^W+eIHZUgs^Cd0WY#EdYa7gU9I# zP~;{oL}?&kz}h2V5Ib-W*ty+Vr-Sed2+*)g6w~ZO^ftE-i;^P>>dAsq<@9PQ2k{;d zsh~GL?yf^Nu$03+(L3j10iU(@1ETNFgy^?j_#>kKPBEY~4etiRZ!Ldziqh>+{)F4_ z2gM-x1c$QdVD@yk7&1TCKI;n(h`+z~51O#b^Xb z@G&sL7?=eyD#BP^#zD6dO?eruN{9)d5)BufbM7oKUj(LoCOD$mn79R)`K6fXL-ZV9 zg85v5c~=Q-_9}2nmx3p|TwIOG{3xD2j`{c!CdHcw6Y4>T&@&PF(=JcJ1CD>lgS zVEoKMl+YDW#XlKQLK83|Oq>r_LYIqQ$!&;s!V|>V{QmLReC(Z65Bp zIZzs5W*#|%1E19XkrNZ)e|eI2kW5Y3XcMI2qO$WKrP> zj9LR!m*wplJ^>NLMjyraUU3$=K9Jm>BYwryxd9FzQ|I$?L4YF%Hx-LD^u4n1@#4l8wx_sa|1uUk(X@&Ow614>=vd# zk>2Jb-|Z$c-88?gP~62^Fx_qv_pttZnfpFCt@nu?kOe+qcuS3roD&X-jFwC$s9VgO zGHub+*|Vn2hOd#{sT`#EC^$WNc{T8VWC49fc`8uY<+bZpR#Ga=v$?(s4$y3(OOqN4 z>3i%#s@sD{weoo7{5Vsqfr#V{{+s*J@phKXc&^VjP0nr>8yuoS#`vZ$Wis?!IVM=GAK7oS3{7X^wbs*hW zuH<5ZdIdN%73C{`ig6-EQHpgEudLQC2+P-=O5*aE92BVP|Bj{t{VhbU7k_9k9=Bop z2+8ad^uqB+o0b?h&SCgPdd7w(|8tt3l9-7&w?Bs!GE+QfGd__G6M;Y5unBkxV`n2Q z#>+p8zD`k26Ag#YW`rUeoO8796`MSI8=Q4e_OFW9s9lK158LF`+rSJ^?E&!?Le|>g z(BlSr)Hal;|E%>$hUbofFFx{}%s!T6nVh=13aps-bOa(BCJ_MDOo7mTpD^{Qm}etE z`R9ysIP1S9>4*Hsd<)P?$0qj^|2O3R&l-s4#EOn2Q+I21B}c;P7g?`d8Qez0HPS8e?oi1 zK}(OuIU@kM(ICCYfC3wf!5arUb^<7ui6BNNf!3Q0Bflv)ho|B!od%B)GjJr%L>M^3 z)nd$|i=mQ5XO>Vzfu5Tgi7fGULTf&OorjOf<8q?F?7BM4UvItkZyhH&i-y0uRu-Ad>gx^+GZSpd5A#AY+E zO|rjimZn=>U@MSrT_-006A99qcYr>7e|T<7LiXj)H*LKKrds?fLB^vA+2%(ugy#1T zi{vIi=5-O3K`jIm%Y0%vy`EQVPuywd`HCQ!c)ZjA3<2<5;D&pRKs@g?VyTaM?qGZP zp@F$R2<1OKIfcJ~Bq-sSIE|ML&}z5D>HGpJ?YRQcLf{jE&g|eoByT3RT;_bY->6f>LsQtq~P8s|J4Rh?$?FYX0hPrv=|?!$q#l3@VY6PPfD* zU|p*--}QEZ^E8MNTFn4sH(>B|iX5cq^B-`TO}&OO&988t1e?lGYidKSLm}Nl-8L;( zPzUC^VzAn$*u2+IGtZB#Lbdb9x>+ELHbIU~r`S3Ff$aUrLLf6IBOAmmZMD$ZlYi!s z-TcH(92Zc>J3$9gcOfEHGcz%Cf$BWnd_Tm&y?Lm zeB4c3!HDlssO7_z7S>FolKfV+Bg=^;t`XN_F=?;pz)?W!LiwJ8>B@Crzg0lYpVBxe zu2{adc2#X%ErfBPnRQ)8s%;vgwFMwhzJUSkwOwWyrc9oZef3s8koA} z1SH2qt61pWEDmuY97$qiZfEB(! 
zafQ|RlJuau^v^!8iZ5`-HvjgL>{jVbq$`AYT00EejPW-gKFyA+yGOu`NK1(O{vo}i z8nG|EO#PuMt+SW#jw7c>>i4j(7Q09dHodLbYN&+xCGxTf~DSWx!y#ZbKH=3 z*}LFWhjN&|LCTW>HjOUyhGe0zoAB-_?6y;c9ui@r=v|EJ3n^KEjKPC*HXFD_JJ?UO z2cA0&b`>*_DhK+s0-dTuw}>XtynYx2g){{5qI+F|48a47hdlveOqbhC(F%l``?H}b z-xWP>50s5`h8yHS%21KH==IFxf-W}-r_*8&?I&EY@U%dHKxPnzb8Kme5#Nb|oFp(u zYEV*zpP)?8t)&;=sw6C(CRhT!Zh|Es5?|*LK3>A8AJ!04x+iuauq46VEMJ>rgS!{7N&26p5lS`y+75s$1t=+oTnOkb1H9JZ z$vJ@4C4k3O0Co!Or9l7aFyDirC;-^|px5ZL=nFv`5=_#iJpp=3671_RdgFmj8?eWd zAB+WVoytY9n_QFdmIKg1c$)?Q(&R%77+j{|pPzG3bg|G_Go@+4LJO0G(|_d<5d5`J>9AZMF!j&30iUZwI{WP)S*x(fBvi9zBJP)BQJo zFH0L|7@cU-^ng_3tfT}C3vRZaV(f*F;RZ_NI9kC_(4}Vz{9~9X17pohpo<2*q6Q$L z7gUT?7@dG;r5ew5hak;(l#PCHg0#PZTI$;mKkhNI#bOY;Ng8nm^o3_)ksIry(WsY$ zN4F&1kON|O04NWg;xM{FUJTX4NKyFM#;%P`jv^O|A#(YIBgTHg!I2J5J>n3J)fZUe zgl_@=@6azZCMSle5H~hg)g{^jJa|~lw6w!?|lOw&c8R?*y z7HJ5lgJN2QFH%)B+G3;#3d}wR04-U)`MYx}o;Hk(z*2ZbxmE>k}(sFxy%zVY#bx;Gnpe%D1H6jA<@3EkF~DtgWo7kmC5ND@Nl) zX9XVU$7kcT?k+Wwe;j=qeUz$k8nhI(#!ykyXT?y+TLMM9V)L%9xU8?fZW`^)n5H{k zhrWZ1(K;omlk~r2o2vbvAT;7z1+l!T7y+`ZG>-khWc#`6SFKi$oS)f22f~u;8DO&` z4RMc8ksOyGpmuOzX6?HFSKD_$M^$ZqPkJ(wWM)$Cq>@TP0wkf=gg^q)Nu)+80i;6+ z#U}`w4A`Fvim(+Ff<(GhDN-asqzIZ=P?25~0fEqqK)&BT_e{Bf@A+TreRr+Rkh9C# zXP>rDA3u0VJF&w(Te(H%%dk8B|q1;Q6ib|Sl7*nE*>FAV=JSMg|- zFB<26`Jy^K=Zgwn8&6i@zQ6yl7120RTb|heZuKOJBYa79A1vOw;)kxVITfqTF1j$R z;XEUbt%{Ac1I6;J>O%VRrA-`nk@Sj-`RWC$FLoEaz^^)lP`@1iT6j4qMK_(7McVK| z+M3N9W{D>#qB*B)C1FvD*R+OA8a@ziGan()47Y4n_2uPlas0KGcu?>kto=~T!k~6^ zp$9G+`Dqs3S1s-n*r%bMD~PS$HFp|XK4NR9%9C56x6su03{7dx8AAXSw{(r=W3eH$ z?W5r4-rg({)HJZdfbF(ero{;F8I0k&&ic+L_&*MT|6_rJPo%zIQ4^g)02dInnX1po zzsPOTPP50TK7f>hHrqwC)xUs)oAp7-o3P0Wo1$FdD!OgJ{wUlL<$=FF^Kg%=*Cw6! 
z2Hco{%W?piw9|K~Q?4`2)g|y7J6m;rVDb;(moLx(oADEXEfglad?C(-qO&k;&;_%n z=msYR_xgy*%rK#>5gua@yu~+pGeRu+?KSWDwB5A)*a=+tyF2lw&btp8@6!piVP#J1VsM4eX=dQ*;2 z?J4=SqK5bj!#6Q`&Al%hjfL=pY495SN0p%VeYKsKaUqOk=4nY)1HO!myJtS*>9Gep z%T!BRU}ikZjyc?-P<~!AjL*?}?ACd?V{R5(jyOJkm@!1!PMtxnhp>hStq)(I0lc(E zB62=Cslc|lt3>eCk!tV}Q-jVUpbc&?@zG~;BJdat{^dlm$IE)f7h#82xEMG@56RqZm zViviwRTrIC#OHn?-`S{bos24T1YSuy_@S!ujvRs0_i&DwoS1Dn;tEHGqu{jy{_Se> zJTNoLo=2B~_i&~Kw=BES2|s;M)}vjY*a-T_Bhtz28d!Ybo`v|pGr+4I;kpPfe#NN< z4NO;R(Klv&Ln>?-92@($BGGN&?gJN{LRSLZL!vcZNQkzvVm~s+@IV2KRY?v7EP{} zYg5@kXl>BALsR<|xWPqyvey#qEhcW&FOZD$nb4s^<>uTzu;>nt~O z-uUH36TX|13%H5Tl=De+*LC8K8np?~1EbD~r+_~d`{~kncP7fT;6ZC(#9zFjm9N@? z4}+S=1xI~<(j9e>%Q%`5-lNJRN3=1HGQ~ye{;X52u_1~#MTv&^A7#fw8@_OZON1^< zk9ytKTh~s{em$444tmrCY=G$OrR&l|j}&+1%UIwa9TN9 z8UOa;n_J)on{<8jNMIiPf8MACb_=`1c=EwZ*WXJwfO13el6uBLw`}s6Xfn0>o3e|N z`+wZLo%_Ek9Oy;WO(?Ubi+VR8IK z=jF>^L-`(+L^%AuSU^ks=ZzUoIr20fDXDtJJ4!c#@2ee2jnni|9xQ1Twcd*dwP*A< zH1C<2C3XSweJkp*uIPc7d{s{#z{`|doF)j1;2F&EVm_1iRHdjs}v#7&XX?2<-0?v?Y0AFQ>MRYuFjR)Ych%Wx)$$o}A z`dZsZe9&He>b&{6IUl^RjY9#3JZxt7#fpBYJL~7S24P0f@TKNvcEAHM z(-edsT*?AUWe1eO{1SWwwEe37sxz5_ZlE3}lj;N05H@Kf(!r^%aw zFCN9a8Rp7()R1LQZI(T?S&r1o#BzvCY!QUI42Fm9lvut=M|~f?1B7y~YZet(lDdUrH0rHZ(0lU8%!Y`^UHoYEN-6!V z49{t?BSbc7RAwo##5okXIJuS23f-pT=W zsQ6c$!#}Z6@utl6N*#e8yB_#Chn(>C_DkT8GJe5%q4cbr>lVtcWlBrfxzaLpQ7V6YE{W z=_8aWA@8#weGabV;cIzBf9{aCX}e0Fwh0Hd?M4yuayLWXw!N-&vdlG%Uf(R!a&q(Z z#+}f`GSs&T!f&bdno8S%iefBkx_jqrxx5^73|hOsyy7(dB)P=wAO;1{dE( zaV_~=>Oa1!ZpKA)`wZHyee?+jgF8*DFEc2JIxVe-!pfkHcP4vWfGMb{M z`yXsX6UtCWleCD7l54>hMcm#+e;_i>y(q6 zgCTT#hb%5(We~9Av>Mv2ObHQUM{?-y-#IjtO3GxGzuze$=nS%lL7U~x&Zzpp5l+3S`05Kj3tbDd=E!rOjt%=t4S0%+# zNq2Wo>bF~pTk#ddEgNKu=J;1iO(7|7t?618q%df^_`;ze9#_|2p;m6#E7g0qG~?*q zkmNm5MJCYeSENqaiPm%bQIi?8CAQThTeQ(*+=Iw}kJKr04&BT7^v)pnc&dI0DUaGKV@cVISbDqzr1>@X z1d$FQY0zt}-H)LrGsJ81{JYsUl`8e_K8}?C%95_kvmyQWDYZml;IWoELOE?L;i5W>oA6Wm(=a_l8%=T+9hI@I+2&r8qPM^}nO{pCYtG9K zn~$t#(0UL3&q}VeZ{1rqQ0KZIc1^$j_3n-hNF76GO`H=hIJi@(#eSK>`xjrD>z9Qc 
zDu9L>hIdUKs`;R#g+8(|w5snjBoP_Jz)NJduhe>_yI@H2${JhnuOYY^<4S}@5ti84 zufes8a6TEdJY(}z9v{gwWIPQ#paco7C2!RF^i6l2jzP<{K40a^uye)#jXiw6sChXQ zjXi_b9&?7MGJ+j4oU+o9i_guQ+o=*wKEso?#NxNq8aiB+P{T=rn(`sfIE$LUQwb3X zT6nVkq8ucKp}j^_-pEJHkob)fC%9Vg{`%e=7@I-cha1PLTn2{%Jt)`B5-j+XxuKo^ z01ty!@#=|>#Fm@kfz6l0JUl7-TUijRCyd-(5Q+&IfYx-qr#!;{&ifwN!SbyvR8Jqi zzBwP!GIY`;G`ITZL{O?{P{Cr zW`iO-x*knJ77sFGg&v@7B`g~n+TmS*aWrn5G-$8F#bZ59n5zZ!*Q6@2smf7PaKy`k-eauz^GWrN zfOccY%W#Jd`R5u2A3-f-(As%+dsSknD)BLbXyHypE!1byZspqim0XQphdZfKBK}qi zT4}1`Z?TBNHr4No9=Jz=?^1ZEF5Y`e6%_EOAhK8GQ%&5SzJxA>LF={YhpOzz4|TfB{ap{VTO*g{buTT|K?# zbb(HPo>hFsyzhdVQyNpnkC^w}8m`i;dB)R+Ob2B`>kVJvH!=V^1JHQczfx;?f)jo5 z(-0c@t5oEpH`^TaMgPU1wei22sZ5n)5Xt?~p59b>P&$0a55n3N=}vh2V6gd5K<(_L zqB4s-&5kq=+rGv%0o9Ws&#oQYXiz_u{=`hr7G(Tc#!};Fh^iiDM`0S{l5}0)VAKv= zfQn*6MV~>z%AchbHZ?sqX(VbJgVqWzRy)1XVSqykEiTw5|FkaQ#z09)iB7R;sA{6P^kUz>r>)+yX{*Ut>r4 z)A7^DARh7y{I`rR+hoW4ItJc#eg*2O9V#jfgzKhhc8`=u&8ye zb=_Ux1;yiCaDdi6Q@(ei4?QAd{^F@P{Z2G{46QXW|8~<(A4o4IdwJ8_M`ZF+XFa$7 zEV@4i?OdzpeJ7>`Xnte>>_j1N9?JRL+*S&%%wFqidAD3Rn@WNXA zzX`-^&VX!9f*babT!qJ#7}3IiT(R7^IT7n}0Buh@c({Ysix$lHGSQvmQU(7LP~qj7 zHucVX1ySJ%>CcUKJ$ZdAnnMPy#G;jgp$k==kmawM<&>@$!Kp%oNd3T*2p)p>@q`_t zqP*j1-$@zuF~)`8a^Q@PhVq&5F9&TzU&8=T$NrUC=lM~+E2z%JF36L#Hr}2NreQl< z_wVhCr#%4L;zvoh$(qhh@3&&9lTphr7t|AM2SlD+Qc4iH_!qzXi2|o zq>@&(N6*n}I1*TKQfhVO45AM1Ao0CCYYJlud9IUIRo8?s%b@ijI(M?u2h*0{WGX6t zgQ&gvD(cv8Xi8FNAu4;%v!qX-P8FzT3|jqu8K^Rqk+*@y{w~|o?Rod_O+^F3pzR=T z4s)Xa_IGKj4GmpRwM1vaprzkYsM1HDm7R9xS*Y6nS-TV;N7FWsQR&lFRHnttpMF0p z6ZjXSieFMuIp^`BRm(q=5Me9-4~8b<{usVP=Ai~GXT6=?mrDMSx^C++KP?w_WzZJJ z9%U+h^eu1yIp-vKcDHU@e@2OA&?fE1L6yAPf>xlPjedUq+LuFNW(MtiP3X@m{pxdy zMU>cm=a8t$N2HRIJWa%{vOv{757eonDypg`8u-HVQlS^iUVil??98Cm|EDu{dJoF1 z>w^x`#&qEF)R+-)I~cS+o7Z`jX=aR%Kc!ueYq5Q=zrSrcQox{Xo69fS=_9G+f>f%) zea*sMMxBm9JLmlMH9NikoIe%u5GTd_iN@l*o1S8dJfkS{h`S$+`co#om~r|ziCg21 z*8eNuREIzKwV)TF5)mmxUnF?fR5Drz4MbtsQ2s01H_mbaqr z-|*UJF~YlaDldoW7_@EC)dniB`Js5@od)sUjvqO1YxlLr_i3tf8=SalVMB{}euaKz 
zOC)yIlX7m{}>eSA?G=y2WsXZN5YFm%-zXlPF<2Mn=P15TB& z=(EK)uBLgR<^Z(ogdYsjQ|=|DqbQH_F=BO#A1Oj=QO;#75{{YVW2C<>Nu~6cL69`Y zK@cYR#&B?}=Vfs5Yd#j?ZVTKY#`Stts#T*(_I%mbp7*~G^%=Ad%u^(3)0X;}9Dd=- zD#%$|Dsingpd43M$>bH@E?8e19UenpjXk=r6%1`DsvP$Z7kuZV4g}QHUV+5hn*>2` zp|G%3%)LJK6_g%^-Wp;5+Ae5YQ{{e{Lc!qwXX||tAZE}yzb;=&hPprc1U?jd$~DCP z-d;&i@rzHuoS&t3LaOOvWzurE5DaZKYRsw-Oqq1AwOm!3;d}0u0@s^C+mV!<7PN_U z;tDjFVavz2uOo85OAN7ImJz!QRXdu@9H`;y$!4FyGanqg9g)QI0i%Q8y7`6P#7f3D-vp zKUY}6Bfm-DTsW)@ks2!~W3OSj~flexE`MdK_7E>V<}yBuukAcNCx%a*<4Z3MqK-az&4z)z`~qc_p}x1}FOqFB^qLgb$R<(`k>W!wSPhn`rpxx%=f7!t_C&^%- z;eSaLUib?tbhz(8(_02Z2o?M#8=Q;4yG3AusQj){oALvIvv1PscMrImdWjnwX)rtF z9$?%~v%n=&$paZ((tQL!l`MiIXhu^UVdBjwB3FD~=@t1G=6x8nJ?`*2zGQ7~Xdt2% zwC#^aHf)GqfQqqAqw zc0Qp^1sNVGYG(*^q$Ru>Y4J-JxDd+dgyi?VDa|&HabL9YC6{G5uXQO0e2fzjIjMj{ zH@1(9)xM6tv){3VQgyW|X12d2+^t#$KK-;IbasE+GXDMC7sE|>!4IHa{5_k;GFou8 znOy<)t|L{M#ghycio^>9qC(Yi0m?Adj`E{I7mMmHy}_t@Go4ZM4JJBfW4*}uZSSl2 zlFtkwtdtO%>k15;VmEr463$7ZB zv_D*q-e0zSUtdfHwtv{qo$<%ROmico_2zp(`6Fq5bmax#K$aRu73sd7lpHRr&R5iC z=sr{%hK@Gf@mp%$)I@R>xAv`1BR!?WB2P&CqnX65@w7A$MYGMm=Z~4Nw~RqstUNnQ z+QNKah3b0A3>3x*)NU*PLY#WseQH}2UL4=owAj!@KamBDk! 
zgo$kno}tR;e1oarn7@}}rhVk8Mib7W7Bgs*Ie*08Fnh$~&Wv%6Dcy0(lHNF$!bejc zY9lX>{TmVF7bH^~m;LsSpMa5}wZ_aJJug#RHPJVPw)n|>nb9$D-f)yAhMF3Ng)a++ z1e$_+C2U__-Z%W187sPgXbo-W_XTYrt)A)YPd>JG$B*;XuPT-Z&CE4hW( zZrb?#^uSHN<%~P#;pa&M^wOGZevQ2S90oEB+Bv52cb&P)^cHnEa}p9=+v%yb*%Cw| zvbqL2X}JA`8PT0XG2;S&r(*v~ttrVWeb2hs0rzjvi$s+eAZl-x6xGttOcQMTdZza% z8J~%6mq9zpc`I9`pP7N92V>KbmhDk;_v(A2?j+5F^UIKEkDV*vv>baRM*1OC>f7#> z(dR~^P%vn-Vo-0D{uf`1x}WN%oP@bWkLlP3uXSSU_foAwy z)Dg!@bWN`R{?Lw!r>g0Gn8uzBs~C;4m#JX{QRE^!Q7Co$5cmQc{sqPlJ;nH7Dn5){ zD6^p|7*+k79pyuMlT1aF38>0&8W@UVkYlo_yK~y@zgPQv%%d=9v+Z1zAe|HLA52?q zLMmB^XOa!$Pe!w}M7(KF=OJj)8R}>((X^dPo{c;hhB7D&7ZQ79YL{9%9YeB){A@=X zd5GANE6BgOFzw%`K*pdgC{rg0a=-Q?CQiVjV26SUyf1*q)q*j#U*?1uWDbMYJy|sI zag4cn#y~ov$V9o8HU23attNwZ;vj6wejeJFE2ZB0oli(b%akYcA_tS-ap*2EVYMp z)0_8$69GuD>&;cT=1M1~j{;@$H92eOXSnr)Z~k~_F?tb(&GmlwkEhiq{6ihRN5>8C zlzvka^U?tAlvS@F!H`TNgJjK_9E8SmWOEUWH|>LDtvkTME8`#;wfi>V|9%6iBQi4!8RsnG1!@ar*=jEpwx;vfhV4H4B9pz%qkdyyvI(M zn2lS{2GTozmN-YpJmAZY!^=@H83t->w|&~A4zk>KF!TWZr-MaWcb%%$$KE#{UPQX{$!u`yK;&hHQ<{d9xlPw2@dG zJY?QdhoYdDTg4KUh)U)g!)cgeQCFgi`lq|zMN7`0?TN>KDjD*p@EfJmjOz4UA?F%JT}G6w_r#Zce>qsaI_e-CA7L+GJINM^cTI1V7PeW$WJeU zm?2t|+MoAHhT_g>wNue835HicZKmwP=#c@X)c%!PKln~ERINh0lH@O)mIuEp)00tL z7~<^YTxm*$3F*E+!YqdUe)#(_tnqCi;v{ko_28s`tHKkcN ze_LLW<5CzK9`Cp5`k zo9WEZKhhNEmnHX*CXT+1AQcd4X?}KZjrLg&%KpXe;AU_pzUu5zb$D~ z2bjE!5AC|6G8Pv+OOQ06RZ~rVRHKG0eq&zRb0iS?!=UvDx4bVHI;+lG4TwmdCqfeF z47f$`9S$~M6~SRtxzl8($Y5D0a-)FDz9w-q>zZrP%-tqW+7u<34={nNM*wGD7zu(p z(Gu4>8aR_l;viU0X}_5ix_rXqO%Hrz6l<7jh~*I~yO_ur5R8p=CKwQ7sUviMn{nsk zB`&AJ6ii>-fPPDDv(Z&d(5tyKL3*s*z1GU)vuqbCAcJ-Y+$F;~Hf#qs3+GWpxjhyV zM`St(eCUDg7{|PXlbM2Hv_2N54H79td?T8mRmo8oK+W!&}^ zDsE1^B}~)$_AX*S_2@y?lpS0W| zv?Cr(assAR#~m^!JEkntUy9p<4Zb=CZRK2n(KBe{DDAk}oqE-j#cp+^>+~v2@G&H6 zlp0_b46(EVv?9r6HKEUYwF2p#1eB8ncg!a8PLR0B1mIq)D{;jS%s$i~IFWFrZ%@B_ zuqH#4CXN>M1%p}+${@mZVkVWRSb`s^IxdMoexf32O-z6fwM?``2(_MO)PNkp$0!qh zlqg-l0?Q{}J5e$i;x#c}?Wxj7rFj}CKOw-Ayw}OukeLh9UOWS025qZ*v$tTHlNn&7 
z;czcS^iS17^wD{$xaise5wsu);xfLG(_z~ge|4}O???A)Nmg?bP=8EPQ8Vv(tA`(i z+t>N|j!g$g>vRm-!s7qBN}mzq>qDP3lqvoC&5vJy79Bf-R_V3NRQjT6(E3i2q>rrE z#`gv$5E+^vp|-EoI(oZGpP6hi`emyRtm3v?OeHuR@`KM#_gD&>GH7QqHkLcM{WaY2 z6y+fyOE8R>0F@&<=LT@3*ri3DUDS zV)aX1p(H@-nT-10PHObY!RvhDQADwMO>n-qvgNf!=yMsgmExI$D(B2Eus4Y=pwSl^HUPYBG!ISS%i%>d7*Oo2k6vk)&X~&1s?%L8ySL13X zc_Vumv~z5IZrbTXs7@W3rNxFR`L^BD4Q=^}U!3>6qtdTg7Z6Tw+vrE``6=fqJnzmB zr=h>3qbp4@EPeY6y`GNS=ucm{Ui}N`7_^n?^>CHG>|g-8hq@N^SnlHArWl4{fWV-2 z*%n3E>HX;Cx>Em=2Yaj=G4sZtE%IFxRr<6noIm6PnWEyIugj+ErJ|0qGa*UILdy=>b9Bl+i%;PiNBGhcwk=Y5}0FXx|PJ3~GWF-dix} zWfTJjEm_YoGS1>gkb4huMFBN8M4U+z1uo7p3^;V=haSVx@-b-BIbudInNuwhRMs}g zU9{~&&K8yvM7SCuFG!(F4W;X2w2qr;b;sHcK-=Wsu+c{51o=6}zkM4)&bAMwoSBU* zp+fvh4(_o)1Y4-8SC9{#ZzPjY&A6CSfs3T(jiog^G)D08jltA90oO*>9fMSJPbBg^ z{PPhraQQsH%k|I%gr!_%2z%&x<#>5BIWEp5>lH=ymdeH-z0nY}qYT>ARqs{B*q^Sq z^PG1Z=CUgo&LAK`Q@1DoB6Xejub@CWYD>qToDS#Ml{Ll z_2mS#@eEm-h|c*5h8$tnU*7ug+D>FOgLc!xcC(DGavc)Kaz(yoG(((+6p1TtW~nda z^lWtM#}G5_k_KoEJ2G6*#?eii2t#v-*jq;gTPU-+Mcoyh&%t$CC~EeX-NEeg^5!z7 zWsGZ-?Tj0H3N>-W0qGA#w*Wzn_L3m;_n;)2(n6N2HyGEWiw$=tD2fiZu*7n5=ClOr z$DR&UP3qlJrgSvp8s|I5S@$-!U}+ml6!8juNLf1HxIHFJYQX*sqo3)~LI^}p%B z#nNlohASc(b=-5$G3>Qr=%GpXkGC8QF*K#MMI8z(Zw+Db9u~ELAtIc{#C{VIks0Ua z+@3W`T&jzYLV0Z@u7Gi;ED~4J#-bjL>N{!o@o8wa8MJevuHlllY+G;*QnKW>sMIXT zIU3`PE6S29F(r&kPj<#tWl4*^`b%ZRmvA>3v=(jHLNZ)B6pX{=w$jo!8^SU&B<}5O zOZ_6gdces>;zm66Z(fjH-qvEf`lb-?Cyaka+#~zYL_?ci2ln>61h*W0USbp*3 zb}zhznlhNKuTV@aS&4_AP_-#lYV{iUI3;W^Dbf{OqSA&XJp6WzL&C;qA16V3L7`Z< z6mlNEvDzVL?JNI44)+i3SRcpzlMm0^Sl|8ca>QQ0);^YCA?x9b2OT;k&H0D2*iKZI zjdErD*g_|>Ee+fn;XjtPmd0-V@t>TwmX_{seQeQD%$;BhEoo~>cHe3L^DQ66pYFD` zq`B|-OeLvH)v4G}6!nC~MC007!rWuGsEAs;FDlI~?vjNnQC+fsgAH74AOEYrGw#UUe>h~UDO#_n> zT!H7p-JEdr?eN2PGqOJVm}W+LBn)g}MsDr0F3OA~4p|Ztg~B~A9NFf0#7o3~odP|c zNK@UXCL{-5j&}J!a#LHH? 
z)H0p?E{dS#oU(#bR&uaPr`2ve=WwuwgBv+{t((@-O>SCGH@oQ;+Q4bkH|lh&PMh46 zOmjG4vzu3~58=|O`Y zqC*B9=HLjwK1>hlbkv|@RA*3E>ZVq#5e43Q3L12rP8igadKuJ@`Umo=inC7AXoF7C zSc4v+Njg1h&}llO(^-QaqsMhRXV4S$q(KGLPQ4nf<$4WzipxE1&@=R`LC?|ifnZgh zontQqKC2pn0x!}bonA8NWqQS+S7}n9$E?nIuhHuU4W>5?DxpCJy-DQ;RnicH-lDe+ z`UkzE)4K+}M`sLrpFRjYc2i=Y`|~k2ojz2Xo{o!C=a)HB1M8~&>V~l~O^$}u6iC(l zOFd2U)q5&^NIwkS|5EJUh&e2W%VI-&% z{yNuE7d$XWB&j>{CH0^Vcsk4i{Eo@GlGoeJ^O$*(`qWZiZs8idV>S_T%|^KcICmgw zW5meOQ34Bz&_5Kf`^b(cQDs(;M3z_9kuA%s)sa2R<9L)DrM8DjuP5h-Qq7FGMwHqg zCbya5Fe6COAEq!f#c4)dBaV|%N8wr8qZGjnL?XdcM^Ra}I`SUD$0Q9y=ugA|2}9Q~ zJU8`0r~A=xWKE%wcoG>_&u9eRiCi>_Mk9rLU5qBU*TyM|E)7ylRz1aDZP<-mqHz>P z;~TYO(6uxM=`mDFW4RqaUh)y)hU{n|?jRyo*LdQICN`>V5A{MryW_bw@?S%r7&*nY8fUQf-u%@Nf5sl|wT%0OPr zr%2cKk!I_fb&6V*A|(oKWhNsMjz2s8yg|yUqii0e&73`J@+MRPDV=p zs;~4QeIye2pF5hOk5ikY)Pz4L%-OX7fcIk!(jc>%?>*e)6Fqg&i>vl_Q!~-WE&7UU!bCsOpIbsOKn&DH34dsio3g}U&Ku$uL&Y$+ z7%oO|Pe*diQ5=lc#kFoK5M$h;RE*_P<2czro)fR*U;+mdIhe#j83&U&n8Keb=b(av zsr<2Nx|r@1mE6t@x0ora_-nH`sMbY|oBH#d_7}4`0NLy>=DJ0#n8z`Y?fzncTU;*| zy09(8BDYv9Zs1^vE|$7!6jvQ3(l}VgfuHl1bK5K2Vx?Hc!D^>i!_OO?Vl7YI45wJ< z6rkO+#CoT=nP=#hFtI^wbc$Q~xk(ow+%v>&P7%<>?Og5-x42W3s4#Jt*b*kTicuVl zcXTfW?BO4O`ECyG(FN%E^y0}jl`>ZJ3Xd7)eXeaG5=stekM;k#fTeo>oJfw?5hBz#a7_^3N)WySwI4X`A zw1I9kXcGca$FBi?y_0TKMydho}Z|LGpL%b#4HbkWG7~&t|9Yee;-cv`yqml=S_r(Xg z_|Om^5&CzYgO9~~hWIC!`hu*Vmy&%r%;zvXLBz`u;FXC5&ey2YIXK%?b`WNO>{KmoW;tyS1G{nEfB}4qlDSy#+ zT{IYyq%fr9m;HvcNzIUUag4bYwKG=46dBTi(o*NZDP0ErNq=Eo4e6F)hBRcjAkWB} zd1Y0pQ>y0I&a0SPTbydh2pJi;bwge&kBkD&%qy#^oK{^?o;SH{mLa{;1&}agG#vxM z%rj(+j8&;Nk)g>rtVZO<4yo}nL6<&5Cdwv&8G~MtNxFEzkja1&nL^tGX^Xlw!{p1R zhKLn$hD-(I$TaR_ItR@}f+3sB7V6cHy}9b1pFORQwz#$0AyQRYrAU(FdLBJ`$0`lH zGt+GYywpls1$?|3ZxX0pFsLD1i+ARJ)*hZX1EMO((wRwQD-go-fJw+?^Ey$28Mq5sa< zfh6p!P{!rapvG=$ho<*n@Bn0U;MjTQ6PpiDC#qn7BD^1OQ$UfK#H9spygSuzQEnz< zDWIZ6g`md+$PK123fxQ_xT7ZEk5WZfaPyaWuQYlPY&~P1L*VSKR=Z$gxz)&1gFxM0rq}XESmKX(9yjqO)f2NFk@b{rn=N#lpKZ@lK|Qsrr}kX6a3}5N z)DB23j_^g)Q^ze7#?PWH;1f%afr*?{M>?kzZ>5?1?6ieE{Op|N=P&IeC%<(mjO5}@ 
zF7Awt+)mwn;jxhgg`QZ?-PA58)-yg&n=~XB&wMW4bxqk&pJu&r=ezkLj#KwZ9QK&R zc`h#1^V)q4eKBw_zP*_7wWhi^rjVD6_7N2eT3y51xM?&%TB? z0Atq_yR$H<{ffe}a_|T&+1|?EAFDZ_IK~$f8(l{OeKFaw(Rp=L;)}^aWKA6nLa(Fq zUAJ$iIA2UIqU+6s^~Y&&9S!lJ;xQVup2L#0H3^Y5bu{#H;&B?*IDNR!?Te|W5v!xc zzJ`T9=7&9C;78&K*z9NaFZvXXLWf3khj?A<=vp(P*U=a=;;f@mtaLq%^~D5foG;pJ z(ihE*)YHi0G`=|67p>LPb(d#v0?!`j>uc(1B3`_cE~hxx;QO2epe^R_0h-Ialf6^w zsoYy(K8j;QkUF(E&KGxrrh&QNNf0s2rSZjbd(*kSINJ%TlysJ6)YHsRld;|^=c6>M z)P|-#*eH+y5rh1WnB5mUqSWE@$3XT&qC8I35bbFlWrCnbc<+B8^r+HB!<&wQBAjrg><`EN_U9Wq{K~h zm)J~O#CF;$PSP&%IPDfs(!Js<+AF>VOYjRF0C~M%CeT6IgpPnPJuF+%QJGK2WFghb zZWNUL!QhXelX5(rl2hmrzpSE1%cZw%*U>w+dGx+*8GT^8i9WO)rt`Lc z(ieBY8015;}ocBxd0@ItgV;1SYBh`jkh3 z}JS8hSkZePjOrEn!}Z zb^1=H@BgF-oqo{iLIbtos8#lAQLB_yM5iBxKtObfI_UJ1E2+2mHx0T-?gq*r_y1lQ zjL;=YL#>OLD=zQ(-^=Utvkm`1yFHhvn@+!6q=JiN|2K_n0CyDqf2yq0uR8t9{Qoxq z6P6lmD--iT^Lz9C%v@M{iSMxV-(#tN02E%p{QZE<|1*O!HohH;9Am;3i}WcW7K39K zO)#zJ|9Viu8ygSEy>3FYE-FRmX8G zRMqeqTr;z#D4b&@YZL4>MUe@S3E|UXsCH&ils_RVLT#rOoIB9db`QA|^n|D>MS3V< z7ZO~Y;M&))D<=VTVSY}6tBw{h?_{D)Fl10SV;&@Cz>Xm{!yt38i;IP%F2Y9Lg`Fk{ z2Tcdm)M5uO6K+}~!st!``O7aN=zig$!y=06h1ZMtE&Y$nh72+ibdUU!69#R(rZ6^YbJG@!kJoct*-DfNw_=(3#&Jd1dxSnn;;vZ={JiSz9^MM9MbO{9wizQ)`fM z`kDCIj^?@wuZRZ1?QRS^WmB+-x(!N^jIG(wF$O?4*J^TFRMA^n1xJ z4|4PdPx}ZK`V+7nNUV$K|7U=+0Oy|=29P#N7v$gl4t{t zr!B1#1z-W%17-{91<{e-5JmKnC^pw^I94l@@r*Bl^ZMUL%dZ44gR7Vp{SQF)W>{;2-pi5D}IgwY0dx9F4V! 
zMqHB!~YIl(wIJ-~}?0h~d zu=xH6KV5XglyH-i*0Y)EW=HqtXc7PT?G_F;XkAKPc#&glNOA#mVM^Lew{c29CupJ~ z*yI#JcR0G8H>(1I?$p_$6vOSps1!q6IN0i>yZH&rQdio>^|!nIw1akXbeGOHr9E`7 z8xXsfQ|{x~J`VPC^9ML~zZ>&#kaJ-zddNwKoOIaDlA@E2a{e)$>fGQMkMnm@XYpsp~oTuWyg92_8m+2_NJa}R)QUA7!5ZK4W_lr1G^NoZETBzAqs|r zK+U-{UMM@8rDiHI3tE&NF>Z${{Y{Gr)YK(9!52YwLazmGsT-2`Ivc90sfG=zrNh@sX?F7=YfyvvmIa1mjXRoaky9wouA-Thvy_D2C9PX!@r`hxhvlo*aH6u zJQQq`{G9=!jxPN`{S1)xH^8|V^b`GTFn_6{hUw{nl;gv~U!i|-r+%|^S|J-+;Qr$o z>U+C*K$RE76{&9*gf&%Xr-}5Vo|Bcf77+j-2GK4mtY0>m)Is%x|8 zHh#7s^g%K!Qu^bsq!t*Hwf4h98LWh1qsD+*SOkzel17a+Ek|tsvFb8S=#Pd`q?C({ zMV#$A5-JP+-~{svjY>`AQj?HV<~NNctZ1>!Bo(Mo1w_GnJg<=vErwu4b7X;C)w8^i z{9J4fbF(=t46+j2$1IJ>RMfM0rZ&JG?XerX0K@eJ&hHDJX#h+omz6PjAlj9{eV8W8 zzQLbvWC@Fa@EHk7)N{3AOBFL5b2Xw-Lk2oL3w1SyEp(jQSri&V7szw2usiQYONeZU zp(t=P8R&X6%k(j&K|jTMS*^m^^_1X+#S502#1V%v_9lnXk_}8YVQVrI?LH`I?DJb5amDN9PTEelCbwc20>94`^i1vgL&f${PX+~!B~ z(Y2)GriLelAE&4(D~G~iWlA*%g@DCYN08F$DE;bedC?p!p}=-C(EnMKhxy^@Pqmu0 zK51DE=hFhSANj!3{m>f?BI}}1KN=g&(%ER%?CK}11(|~-oZF~rcW5+hVOk7>3SC((lrdKe zmc0Kemur^ohSjp`y^#Igqh_ME) zJ@i$8^A271QG0KJ{{LIFl=W=-aq4JRO4>pxr>F=DJ3t(UP>inOz#;R)4D z*eC=CORx#*SO^1)5Q7P90?&$GK!bBZOcz6EumU=RgAivwq}zcC?*LnUmpS=tB0Ye$ z^@Gq#1SWJn`O2R{HyBv`Q{{@#d*BFr72PRa!@P$SS$%T%=m`y*ttmI-X9GoKyz$9!=(83J2wGN}vii z#x#vzr|VP+s}xP*UrDV4(5en6&G-Dz0Te< z3G6-NV(*z7IB=!W5{@q6c43mcftH2wa&c^hPAhd<71(tyCh*gzEmht)qeMLvG=>KD ze%4kMj2GJj3qJ2He1TJ+``i$Ih7KQc&F~(B)ME+uwA;QYQE$Ft^j13~B0PcAFPa{8 zeK|>JtFfeN*cuTZsQD<)(TDm9`YrIpSA7HFUzY|dzaHDPCABhu*oq7o-dh`3^R+sy zGw3E-Z!qanKTea0f%E6$QVjqeJWaYnc7WkTr(4(z25Z@^G>mOwo7ItJj<~>}Z{iaA z&~0oJyB(i40Ht>tup!(fsC_^@-9FP`M;-&RV6IMgvm;Ed0sFx=12WYPxDJKvaJy-b zWj@*Qi6b(w{9Lm@$8R%K(_dstp!G)?M-#qVEMY)N2cYSge~bZumuZ5ZggV4>ILXFS z@P`m`8dJeMl*_NB*3_mJt}KE-AIfE7n}M1z zpvPL4&Cb3p$5qMWMnZBd5I`sFxxNs72LN>p1ygXD+BQ`y9yd}_u9!G19NrK477iZ; zqB>eViZXr(6%JEW2mpr}-tc;g*g^?e{KDwbsHaG+9CW#k{Z*oX(7Yw=w}LDud{^MQ zGAQQ+L`*Q)6&)_kmZO{}9PW+bT&3R?|09k{k>wof}j{!~fOy`SMy z_)Eec*RZe2K2B*S8P!0e2n%N*EJZjL>cs1?@Dtz{G!bq!QxGc$N}LLuHXU@V5(soA 
zM1oo7e6Yr56g3BbH3IWB5Q_zl3iI6&JlSZNjr=SI1u%gF@yrD69AmO>Y_w)U-%4SR zn{h~qBwN~_6sOa8ovyoxm1Jg%@!v#hfhS8o&9KAC;{*n9QATgqRXoU&PfFbbh0~f6 z@n0KMfMej%M}FGTl+j*rf>(4jodAPdVnnKBhrZw=abmV+#`2qpYlM>9iCxw+?R`sY>l{A>Q`YfE>&QV9fy!F&BHP7Lzs)V7@@zl_5TEzZlbT z1GvAX;Pm{MsO8u`E6|;lXw{FIWlFLZ6M7T4jrG77Hv%neR1dWjPm0^ryjF0oC{d5L z62l{SP~C!U_Oy-yIfzneO>Hsk`KVrLE4->CQ$+edWS>R&KDgOs(9I+ZSXU#sKe*Xz z5#EYUYR69t`z~Uf3*72PFq;G5-K7NI2l%GA(a#lbbaGV~i0c{-Zq(URF_zZ383S?v zcg36N7XHKru0CxeM{k9w$=-?@3!83sS=_|v?hYs2=?2&-$d)=wmRkvK4sI4Jx3a^c ziQ;zZ?5CJbvz;tfX0xkeHp`aT?5~(*`YYCt<{#36%AN`a2Y3hJU-_Yz&A1%(go`?t)FoRO_Q%!W_jx7lKxin%W~ zPVL<&Q`Lo69KC}FbHsH*98i&c#kjyrm1TiHAC6FOiqSdk#;f*n{R~|61fBG0eJ_sPvsUvm5EaUC06YS^EwHerd0}Qsk>I7ekhuNL7 z19jBlT)?IPJ>*Mix?n=%&(4#NDo-2HvQs1PNdr2K$Ju`p!ISiq0sZ|m1{AH&>2$xr zT6z{wk@pgHWT~{?6cW|iHln)?!l>y~@M#;7nedtcMLql?-{5j@vUjE>kDC4&>YBEq z$omdEPrkz<>HA!A;8hPwRo_-r1UDCm+l3k}jd)d?E|S#S)3r=B^Mp4+JwHe!2Y0j= zdxe@JBd4ezzH-O;vDX-CnOLz`V@HHVcw>6PVo#@9(B3Y!Ag#ib8m(L=C&l5`V&?N&V6EfJRM4lAlpE^U92<|gOgL6+1_JFn%u9Uej2F6IxAlL`vAh+^I7)KIBQgD)>Qg9_kU$^0XJFw+W^MhGd zo1wm=bx?s)5-csX0`}~PnKppsSje`SLN*W9N|}I^i6{q|`VzI&fj>3YW`w$XJ8IgH z+X3CR!QRah%ofs);8j|$3H*v=UntpuI1@r0Pk$nJ-)45-lCt*#=iUcCeII3j`OkxD zxDZMuo_u_ZV9YPX5zHPcPGL2%$utmbEqk&w8phIMGBC_^iS(=ovpya4$SqVM{ ze)gVC*-8DK(^PW9t-}r# znFq+F1spSSfmeD!`WOj&;3`-G*TQVD8))P5{H%g(!jd!8#6Z;s6R!7v1b+ zEvSn&z&pwfn!1T!K$Bc_BL}yEEYa&+HN%uSH0XJaoppzbSv!6V_K+Wl*BW}j({CbRIbvo5M zDaidg?xYhsopi&H`v`a9QI0{~d4|sF^q5YM!w-^fbkaF~KEc7096Sa4NLx$ec~_QL z^na#+s=61n;EjpinO-m>^igm2LeQi)Na5cJpgLXS_5{w3bOmZ&Z4&&bx5yVzl!6+5 z=!RMu#08qqIqJ8*;xp|4bfD_|H6p(`SV#j%uQyXY1pPTM>Es|P0lRMMP@yADM4Jgc zbb67g(#r$K8~RrDD2QpQHue|3_QUBrL1Ddyj2b+0;*h=*hYW*hmo=Xk z447+wGGNX8#Z>M7%F52)xW(V;AB|||A9PXWF9by&JV506=-&n`nIE$!US|X9g^zV= zFu;Kbi+TnJh_2B>3Y&pIGhpOH(c!{z`2d2-87Ta!=v>4omDo=7S7%DZ^A;WL(Oygl zemO|AmVYOlOlK^AFw71%Vi&wI;p~e2){SMU9<`v2tYGaO%4PbfzZKYHoh4r&0v4uKf8hXkI-JAt60D>&@Dy-!DKM-3c zw1W{eDb^4s72AE<=|YFCO`#sE$2xXXCp4pNrSRNXy#Sintx#A(E`<-4UNAo24aYJ2 
zQ>K@nL$w|u;`}^xOP@CZM0z5S;v^91$ymK95Vy)fhNl9Sr(-QEG4vTA$}^$yn+{E1 zwK*+p&FhB$m>B6iiYpW3H#OcoL4K62!Te-n)n`L4v!Ug|kj>_><+TG~yB1M9KGO?= z+DuFbPgV=kLGv3R1H$M!U#A7oHbO~d{SR|G#iz2}Rf7x@0&T&5E_PJCaEek&y-mHT zP&PyAH{(62*6@OvKpto1T+TuQXRWru z7wt3vwZ-6EOn(M6&CvfkbMeez?ExIZS|PpV7E0zUi@OK|cLCk7Hw)ZF_C6Zv%>j3j zYjPKURLHZ~izqY3>;9aB*0=@O z_EvLYyJ8&@ft7e+ZOBc7T6dtdgzk$IndmFF0%A&A};rCIZnWbV# z87tPrva&(lS0g%^+{tsdbne_OjHp|fYI9s>D9=LS4cxRiLbVQvUBS<1i^FzD3Y-gs zN1a^&)SkE%4(M%Rj!WgDF=05hrPE*oSac}UlfvMlyTwDo&L|*9Hsv-oX;-qqTP;eLnjYKS=nB-nx=Ww z@)g2+=MIeV;+^82VAHemClS1NkLW9Pl)leA^0-(0WP_~9(s+)Y#qIs3nKqcSPb3Qe z0hXB`U{mV~4i3^71B#HzsL295QXb-zLmWHIu_GLNm4EP$HHbVTK0Y3a3_~;N6v$b1Xwk;90?7j1cm;=!i<({9CHw zpm;O=B1@j#0*X1TELs6lr+2Dsmu9>EJ}zw1XRACTC}_^6hi-82W7 zGdIB#=Zu5Hb6!yb{zXiK-*!SoO$@bGm(V}())|LHwliC8B|V0+Rp}v-ZqD~qxaSZC z?Z9wZtq4aU$W6mRM@E1-ABDk;#ynq(`5l8Crs)$vw3yvkFL9(*SJK z0Vy+~u$chuOf|4u4YVtB!L70)odAMSs8?#@!2~Mx1Gh3)6ORJnU{1!ce?3^!%S-(` z1n}!Hf1JiM$xHn^njVX_hz4go5j8Y)_#@EXF&IEo=yO;YccSyJqg#^!Ol{%wRgPS? zeLRA0jscU>9gsW~u|f|BK z69ua9l$@ZT1oQ!Cdew$Hk!H)KJVqN6)zl$C^o8}}A01laqXHTb5CA+gL(i_ z-J0@%2b`!!UDy{=1#dbdnkmm|85#WZ9dXcOYQztIEv8$9EB*&DwxgLgucd6|Fk?G% z<=kYGhPABuVNyC~DIK!}!=R~JKEFxEsg|=vq*{1fMg(gH%1zRm6t<)(&p9YcpFJfb z)$`M2iY*hH3?{MG>eDfhBqt1!PLZyn>O~86%MkfJ^!+G+k!G6%SDPM{E-5S2-H*x) zCo2J#uikh*`1Mfvf)-u`uiaueymq2Q*;$<%C1;0shc|Ff^2uJ*T=rGnN6YGPSaak+ zN|FO8T@F^KM$3g_xElJiJ683-R(5xdA}L2B`&xC|wem(WUirqz!YKYy>(^H9eH)X_ zf2HckJD9ZDTSOeY#xx6fx;p(b3wj0NbJIkTsEZ~56OqU%NiM{aIY{B4DF>+>q;ZhW zK{H)64-+kLXsR1l(+$V#mRz$Hr)7qTERoGYjxKU_k?)2@sCAfVBieGA0uI{2*V8ZB zb5n)v_Su1hjvN45*YNSpHGFh4R^aGnp6KEhT}3yJb>|vAIOxemdU4R3gFYPe4TD*v zALsVx*8vv=&Sh{KEE+aG&}kD3IJlmJgSC2HR=cTEtl=);?Bd!msuJrsxJef~yWb!&FKp=XR^dh_h3dDOB_)Egowj;pkjFr`ddq&XcJ1(f${^GZ>uQb$UcA4YF+!aTcqQox{L z+SA!yE*{~^kMgHZbLRcgKuCk5#Ms~9S7fY z@B;@IIQWr+pAg`f!#~*J?jMHuS^Q#%Uqvr;^IzgOoDrpWb@7KGE{cB}P@kj1p8{t+ z`HUz|q)LGlDLJrlpmAX5z#(-@S+#M7^f{ac5UyJtF=Eoy%QIx0P0>m9A_m=bjQTR$tda7Wwd!L)WC_*BZiC@N7Q5YL^kOu 
z6R=9sCljIWa&e@IA(J>MS*94WDaTSVGMOgRaR`+A)|`VDGDDXw4cSU&@BkS&3`xY7J~?{f#CzL7vQFApl^Y3 z66_D*I41PKcybI*uc*zjy|6Vfle-t3(7mR_x)%pD;ofYfVN01j^j=sY9xzoFd%@2= zfF5uSY%Y^8*lUg>+&XPz@xy!Zu|p_ied2KF6EL`f=R1PbERqfA9r&w|1QDZIO$(_-tG1ryGp!CKBHExXac-#Um@@KfQ z;LVGr6DU;OzEpn1D+G7{CJ-+Y=`2=AgS_G~&ofEzoHjOu0^i19#rZsw9eUVobNw`N z7~DFaYzhOKW2gM5f!nu&-Q{w8%4&&O&M6kbA_CJX>7o)a!cgE4BOvNA6xjW;B_@cg z=rRi04o-ByNY&hI+K{*5bKG>8Tm_z%%emz&bd1ZnR}piVtHE9z)@;0nHM3^p zH3l5>6FjL_!<=h1%()Cq^QfsarsJOkqVYQZ|2Iw36QRtG2LxTDtf=k+{u}}?e??Ji zot=W;2p7fCjYz~JED%*(p3?{X=y zs@JTL9eQ1bvmZi}QHFgCTQf>Ng#KA@#(6vn$;gu1fKs+S#vB8f*sdO1A%9?{X4DB9 z%z)kzMq(VHHLadoXd=gOeBvy{6-WA@1{+%J@p(=cM%fCzvEJCI-Bj%J#CqEoMz3~> z*l7DMDBs+Oc-*@vadjBSVs?>dH4e5%ds23hV|BP#?S*#k$$bsm@Kt-Pdu^;sjb0`5 zO|M@Kehmb*K#qE3m8|hU1w+fzqAf^Pdr+)SxV50GctP|5x#|aMH3Y zkpcArY%)&+@{i&0;#Y_{(ZnglpGIFlhm83RVs7Nr(YJ_&A$*v=LyS!`hv1$5G{$}f zD9o-fP%s!D;zM=NZt?{17Z>U{az0-t1B4drJIu zoF+h(I}xhfNl@jMoubLm#!g{p;KZjW;S`m#e_3LXDhB0vr`FRnT=XOc?W6F)r>L?t zH%K$icq+P@r{^Oy^TFC)Y>$~Le639KS6S!hX7Q$o#gTXY1XWub#Kj%(IZsfLL|#oj z%`S2$xZ&e8^ElP6D>R_qH*jtoXTIn0*>Jo023PUApM=G1yx z#*@5kL810{ofAh+lEc?_W6P{2Pw>{eff37pV@c-Z|jiS49K8 z2@3qFlwy)J#1t7JX5#veYUveoWVBc!W5kUzR%}B2PU#c7z=b_1n}`!KSv(XxYLE}5(%6uY~bz4 z+gX4;#CgO^gbPuSin}zdRJ#0l0fN|69}Q8rxK z0MvPK>w~@`1Cd>*TqWAjMMon)PS&&Bb#cnsud|81&Q{%*hzv`~0`*ixu){6#+iX(- zm2q6&ZMvujpLkTtFium?J|o+)wyXYGd6zUx-Tl1$-HP4$l00wT;uu_XUS_*YrP#i& zq~B7HWqv3B)XcQZpJcRd#=89>FQs!#bxpFhF?DALCfqq;7^Z5NRKvcqC8_19Hjhf_ zX>+OP(!CzleZOtG>H45vX=zK~3?n!p*;Xga_Trk_x?8c*R2$q*(Q@#G<~EnxZ2G3R z4!!lvLZW9DDv=r)qf(b9xa#w52`Ph0`t}%BoSHg)-n`kx`T4aq^XE>fm|8P;T16gT zd|5Voc5VKwit@^`>nmqg=FhI2QZ~0dzp}c#VqxCw>9f1dtDIL=(PdCsW%b0NWz#BH z4o?g&8f2>+Yj)tXeYO{+*?~XubKlyKNwgtT?RrUC9e`J&Ju3M9{kCo9J(|IdZ`c-C zldivOJ7Rg$o_gQ*oHdwrAKQ9ZX)k8^=XxMP7+Gvsfscx|O(1azLI$Ad{hN-YE=9ErO>de~PE_^Dn8^^kHtcR1J{_0A- zoYb42P=48{uTIzK)Xxo(puLlz0Q1rS4hC`n1y~df;$nk!8luxs*yG_OH?^`#-8WH> zQ9q2*T*@&d44(Ebm+I11yFnEcXxFN7Lp84(iepq6u0v$~bA*5tW`*Fz1HXtk@ 
z0=8!&XHB9qb!3v!E2pwL_4>J$^D3*SrJ|b^sRk7GQ#dF$pr}X5G;X8~wH4G{ZEmYI zS0h(zvFg%jEnV4Ph>KLOta5~@?u8Dg>NHFXQxUVIQ=RIqC8`f@jq9o&Z4sBGUVk7q zRk;eW6k`e<@yb`I-Og8E7R6C2X1@t`d=hYtmj?)x;8jDe4~zAe0yi@P@|mt4+3Z1i zifpBDQP(W@Nk&K#JQK_3Y-?(XuociqCM>#H;M*?Cnm)>43W|R|^9z^_l*>fe3~674 zK2rhS8gErTQ3xGDu)2jAMl&VgZ}sMolXPvKxSy%GCNr$I0dmy1FcFr)C^-$H5Z zy?=_lU{|8O?6;5hq;<(qkP=M8x6K?u2Kr6V<+~tW$3RDbgGH^;$u?jE+QLv)01>kt zc=f_Y!>YY9tXd2=8^gTqkNGRSQJ2!i;)T_c~LxjMCOj}lnM(69^f1T~-rlMiFg{Qb%6(#Hho| z6CCPnRa}^QxfAx`3vLV;+SZN)ZEHuq`WENlIJTJgkRph>+CrmP4nTU$4_WiqiLg)J;yuJCMRun(rr!(IQf+FO>~DJ+=qEE z8MnK*0e)MYv=!%L;5yEJ<2SLNUO4;db)=neA2&V6Gx?;>9w&S1<*?KGa@y;p`<%4T zN&B6qvmSr)0Vf^gY7gR2vd$ODvIn*x*VMAwd2{+=X9?!jxOPWI4zG3Jrr-XQNl5~gMO4E(-_{iY=;TTd+`8Hb_S zUVIXQS8c_d`4!buDpIQ|t1Ao`->OvKp4vd$Ihv+!@2QPRh8BYT(4T^fe4n{?9t%qkF6*P|LdA|W zyz24w_C9wWu;s#QP)jZ92*|QQl~W8ERRdD6ESO%#=_bgsp_)OL&D53>fL*%bxTc0) zqxJJ&gG6f|W}|v0pf4BFX&$gfK9FDm8gGOB&=x7}OwVstthGQX_Wx!?m2bMTA%fT- z*va|UTmoc3=4;NIkm&(9L4q|z^5XHA!S4MF8x0go_{}3Yz%?aoiWY}~=!`8Cb&A4E zv%C>Fs2e2DnQSi{ZM<74-AoSMW$R_XYhWopWE(06JOfrSY*1E?Hz5cVYP}B5%Jo9P z^W}P*{C6vUzT_+pzQ6z7OtF^b)4a*33Arc5ry(NE&h6f%B+AYN^bF&egC zHsw08oDo>eSj=fNEM*RWejp%l1Qu^BR$?+%pb~2^7pa@Uw{FEW??&!JK$?%h%Ka{t zLJAoROg|o~g6qt+>xW5qLes#`?$8lKn>!jYiMH0D#;8eH+O1f`F-U|494Mg_uwzH{ zBs4StRY>r`CU#ilMb03{B^K+)#kBmB`rwKGFHsi42H<9_IgX&SYaWixo3`}6>a77< ziaAP)nJzPXWntRzMh)X?j5O=6i!7K$zaqn|bpIeYwhnbKxu3Fuwsti>v zH=D0O6+X<(TeU}PM_fxDmG!;ZjVjr%e9!VvMByg71xr@pIsao}U?XlBO|Hda^K9f< z`z0S9|0b+n-e)-9%>MQauhuqUfCUxj0qK}I=ierrj~Fi`TyL=eTBqxQOct4+XBjdK z;%o?xfKIN$kxp^|;3OSIfOtc_f*uC#5)A=fX!*B&kHgk+*pYGGwpMp8BMB38Tt*Ro z^S*0uL8$Ci9Kj^NF@P+==$C@&@B@D>2UxE(yUSP(-wt(>V?E71!?9WBp5YCEd27`I zy|%6g;JKWDH=fHZe`xj?L3!IDzzl*1YVR$y;#X(W^3%Awwrj4;7uzSSS>o7i?trHU zJ~1aWMfrI0BJic$9sZ-40w8GybZc{6h7oq-WAK{`eH=gM{hgipeXzD6B)^Q%oEBr# z?ETF{KE-5wYEwPp!OWYCP4_pv-r#d1HQX@7Lg8a$wASV^>gyYReP@-OXqCEf!4{`l zj|I(HERl2RRP7E+MoJTXjPG!?`nz!JYv;#ky7H_^ zh*vLP40i>`&CtFNH-(>)g<7QBj7?ant*|POZg-RWXK<81yHuOEaa*)mx!ZazLfy4l 
ziww57QTxRbc9S+~JuUIGY_qo58u5pBXvq$^JSJ{HH%BnIDcvZGXON^g4y|OlSL0Q%O z3a|{bAbZxPR?n}hQk(bSrpS5I%jTt)&8rv&(8Z zV_uE&Aj6-RnmTfN#atG7D{@jRr=~8dnV(uSt8!kd`IVa4^D1kqQ_HH$Q~Os{&7W0S zUDmsDZvX0e71JQ=Eb2AAV#>^lxv6HOwWv~2ojSQ@-gLxjYAcv4nKrjbXn06pjZSMr5Jmy3l24R1y9i(bgc*cQF}ro-kZ*X zvb_e3^bP>`B6NGe&H_AzXa`e7ISmoBXsDPE71rG}5(?|lVi&G2+6zTikj9FWkn+By z3Bll5O_x1pfrSVCF6A!6&q1CjW2Lu39<{ zus6t!Z%x!Ewu5tt$m7PSzH*hx?(>*NYP=(0Dn`8$$L7QybwPEwuW z{?vuCWLFx6_&C{%%4BbvCaX4i-EFI43RU$SXm_|ww1HQ3|S2o zbPd$cbHqwHS8R~AVzZnl?tvTDZn;R*%O&D5xl}wUmx)*8%HWD{dq3G?Jvt5pw%h>6 ztc}nn-AV;=6AhA^sRHjakvB))p}vo_KV&ZJYdb32#`MDPIrQR#VaaOJ&su!&Ly!G! zo0bV|qWZn-XraETbVLWgj7cT~0>j%+pQsKc&$#du>?^@f;u6>%+~ z>I#NYAE(5$P%pn{IMwoN?P~!1%c{ehsupAHW73`h>+md4&kOMBe34RN@lA)RHxEYM z4nQzP^rqTA#=Zf09 zR|_1t=M$K%++YD(;A%8}X7L&891e2%HID;$r{OLLH%jBz1Tv@qXUBDFuT!BKyx-G= z$KF(J^5_{R(p26+e96^GQ8UJ26Y`uSbl@vAQVa-p(FPPHDLUV$0eR1oWq%lF&&@Ff z<*uQF)VTeDeNKCsC#QAlYo43tV`OM!0HvsY|*Zo>t+E32U$ z!`0@CD*`{>8mkV~Mx^jBxD=@mhU;N!!}-WOwPu2Sw^}*To~CZs^@MQNZ<`FKttlly zmlvHZma5+;+6T{+rf;pukK58I#nuehEi|X*wieXWmH`xfnM>&fp)(0@HYkFaJ4uBA z%1l-|1fV8}xxwwgTa*7?-3aZhsZG4$Ns9xzhZZq$MvNaIOIx#ecok zkqUdZ)sX`;>`ik4RCY>qA%LHxgWw`*r(~U)>g)xO1}E;>b1Q3@goEGf|NjL##p=PC z_H>ndt@+f~l8=c?aN;`NzX_H2K9ED>|LdR8_BT~IDwtw@BHUFIWT)$`^` zS==WZWZ_^CuJ`KQ)`Pyl5iT z>S}uwXl)$e3azOho4De_`122(pAIB|o7d9s(sY%!sqent}^&c3Z2mg)@Hajj-;21nNtgY?>e_(C(FyM&--ua<@ zj-50Q?F^njYjOpyNB!ICB?g?<#t+PSu9Nz$2B`DUJ>f3pxgkuqK&!`Wd#yThuWmt~ z&OPds*r>&ceJ_0YWp#V9_Arn z5re|vr>&OTLjc{3o8;dJXkz|Ql}?=)Z*ry@|@5Ymc`eph`O0sr%>J zSAmZ6tm3*^ASSd7Bbx_n%L~EB7d=>YXv^^Sx3w0K)FrRA&01?-9P1LaPr$W(iuh+H zu!Pnc0K*7^FGTU-6MPQR(w%!r^%$kc_T~DPocpC&@3O#S)ngrda;O_)QJbN*N%mu; zZGl$0p6sylL9b)qskYw@{`$K{Uv6^c%S`|_P0>aw>OwQ&>yC8=pn#`$rAoHja$T71m#axZ^s^o{r{--yRI655#ZT^g{RHE(>@k~?e~BhgLQf3|gA z9eO4Tl=s>1yekCng^;sHuy;_k4 zFZavknyAGKfrlCUL>gFKpH99I_^B1MTumCGTK6LRs6g?@Av}D0>*yl&+#>sA5mdR0 z?XB^cu-M)ek8O+X0nB(iSd<&~#HRTY|Gvldq_{yB;&8|?PR*GC(r1!@^L~4bI(36R z(*HOUZ8$(@^L<)Tv`A;IJPs*x3RLsie4|!-{#CkczS$+4@PlNy*#&3c+>jVn^UE5Y 
zkC;PYe)(dTNLJ5h(=D6{-MqmEy;jk!{JKe}%{rf-PYm56+6BMQ#JO;M#?QQPHKo&C zPQKtJjc(TYuvc^Qcj=t0tk1Ao-p)Jj{_-{&DHjDcbZe5YnHE7@hj}z)mu;4 zed@`B;ql6|#}Q?XapD$_S6ivmAoc4C`xup08Rct~d&f%q2sxvbn)4QZdhV6=QJ$-_ z6VLMyIjv$|&$78wYRW4{@Qgs~s6LK&=Uf59WcAD{d$O#tnz-{BNa&ZDfhOwO)%G}? zWJ&uD?A(QC`RH7TPiSK2@it}&PQGW4Q1)KXq5PQ}p8+DfTQE(W5}K!19h#ba%${jM z=kbPcYsbhoozz2H9og6_z|1Dw$FV)?FFiVxG%8XkJ39zPr)BviXCdUnJ~Iht0cJ`7j<#8Tj!LHoFWlaLYwd z%}Ez2YTK-+G+Spnt{$5em5=_w=)x!QPMd6|`uIdxbM?Vm`#3dssmoLYSp1Tm&*tF{ zSGI#ZxXzyKG%c-XX_iU@FL8A*0s?zjEC*mHT8SPEQY-@OyU9NG3K9qdMJ<~$rJ}aB zd3zPP#F@eq9;b$NvPG+#*4uwoiSryuVOlL1S3$N`pq8`uI9jP@55nO7$OgM%!=(^v z=93Y743lMZ%&b^6qGBG8D4O9SX*ke)Hjn_LdN0o|Rw&QJtND1xL|=I?0ItP!n|XYR z8&f~F!H`L&*b2k<1Q1PCk8QMffD;&-V@xJkq(Picha+V(oM&wgak>RGA(xrp9B4!! ztzqk1W}2>QXn9PWpDUYWPGmPKY`;>tFSOCt_xR=lrW!Z}7P^(j`n1WLiMBAZ6qqeo z1lAF1fm`R%2-*epOMj~^F&GLEJ_328lT8`!oYQNR++-i)&*9tFP8Zs3g^pN9tbI3r zH$;ndv@Otcj_m4K%di- zCW$_@Nc6=|y7hrVp}*NhK7Y9g9pfw0bSTsLkmMk=>7Ndw&KD-wp~y*u;bRC2p_f<2 zawDt`e5#GpMxcsBsVdX#VS@_CnrbWMT2SvXNWnFy2`-M#?a&RjlF8%-K_R_iW0h2SO?d2?@Ohfe;8%Ll+Sc0)*bH zg0dhe7E}ab1VKO%6j1?%1(7C7?}{D8Uhf6$BISMmoJ|3}_ucmbJ7>;0bEf_?^Uq(2 zqPvaxwu1!2M&A_063Af)s8LmRz@uT1B~_sWP@~+&k{#kKjiECNouKE^rP3(W57j&d zXh46orVQq82N*ae3#bMSQ9iZFTNgPSs@!{D#Jw}V4s`TYxjIOYw;Vc3O1Q{)^|8Q8UejZXM)@(S1i&q1DSpNSzyLR_R(rWV;_DaYlI{d zC`Tn*0dZR&e~?w>^C5Nkh|1mJ3)CwIw0h-$Ru9u>6#5}l(W=oXlgtMTwFopo*)(+I z1uA%Pw_k;>?x_c}Ry*DYUe+wK5Re35;#6m>n8K8Ym zMjZr<2N0l4#rcGo2|7@UIjn9$B7kZRYlLe}Tnmw69Nv@Ji`RjS?5r$}Mh9*0*xNQ2 z=s33($dk_-4QaQJDnl0C@~R!xP`fPZXZ$Ly;9b6VfM76->L_}1Ud}x{8R^N@~VV7(k8TB{J@v-x7P96*ld9Ypyb6mJv z_siR!)FbOavdU!Di3OIas6iTWKWlpMFjtA6kR%Fc22tlT>OvYtcICi$HsVwH{t{?^ zs`jYt+#7I`d+dZ2A~ZNfEZ3VmXcpb_jCF-8bVVhr6+s8~FdK&=QHrbcG@?7MOXgx8V+HiROGW=u3DwXB_yDi2_{MNB#{ZQvf^+$ zLsoJ@W20K2BE3_tGu3>da%op3K`F|>hx1M~lF?M4M*=^h~%zZL$qyiR^#l@NwA#`uyn z9LRm30hl2QKx5!5;QnNgxJc>Wk;- z4)*7(XcEVQp{Pj;;8v`~{3$KoOmd*T`wl=>;$p zP80?u0|R|R6C&dtFgd(nYQz8uh=VA3Jcniu|BGcnlso}x2{gUvH-`uiuVQp)Ckr@< 
ziiAmk=w%w%+SNOmEE5(f0CxndB!WjU>c=8Xl#(>o)CJ&a1iA;2Q5s>&3BZ=~=f!=K z8-ew}$#C`{$Y~Eo-vFy}z=C)yP)@G?)(y#>VH6CfU<3st-PnLf5rB^NV80sUCDye` zmP?`cuS}gzP=-t_Hq`?Nam3V=t#xH)y+S-KiY!V;k$F@2Wrx`py3QqI)Pc}ZnB-1z z7@oi$@PhM#Q!K>PQrN>58IwyPF|~N|rjbGx`!i0rbWn?=+sgCp+5b4-PTUB5{Wj=1&go^r2ED6_!II4EVyijSB5&Vh)RP3G zi5c~gzcz6Sv_fzkp(F6YLeY)KBlHOk$P3;>G zxmW@UWaK@7_e(G9K6!YoD_PFS!Y*6;fjGlA6GH%607UxGJ!*#6QO7@$nQKKGf^#aQ z(<6?M$}??Y=p<-{T+&yZ6k)pwCqWzn<%FT(DcD4C6m)c8yKd6!SGmrhsyYC^Ugn4% zAMkjppp9ikN!JyU#ngT5O=VJ5>8Dx;$hPW1_NF>8)=BE&aGS@Do18MdWNK;37)Wu0 z`=()B4k7a}~imWKQ)=$lzUttvk5=DD(57 z?AujZ-@asDne1yDVp1LdX?@c(>R-3GRa*WOm?ixNJ_13KBF&o#D>$RPENAoMH8O^D(hWz(!A}JpW z4haKe0mfk+vqBx(D-Q3l@`LA?GDsotBaun+sKQ`%xW?;xc0&OXg%EZm?o`544t7)x zr|#`!;6o+SES&und*l-igd~;16PtP zgmJTV*XHGt;qxpQt%qAvC#xEiC>wUrN7L~XKW#I^p+u?LY$FdANu(RcVa$j8%kRGK zW)t-a6m=p%2BB*b2qG@kC1Z&}C+>zdk(sQfm<2ZaJz$~FVZ+5-aHAh)cLczio()#@ zV_;Q3DHg$T<{r2H^fTr6C1$gU(F-MI$lLQ$CJeK zyqZ|gYls(kj(Ab^J&^}yLmY#M0y;L2xxh$?JP-{|YrrhH$p%tTf~|iLZT*8C!LT7XG}GSS5@LOg5UU6HXn=atxxN{$H{se6*8*JY z;ywM=67JL zgPoz9cfF*J>?KWO3O+K~$0j=q1Fu!tCtgUKo}-kXne21usM41Ir3WmYuj$P>!ragni=b>G$-TXwyvj#eJeod@@p5Q&AwUy2oQ@=TAkC9#k*%Am96r4hVCAM4 zuV(RNi>FZesb~~WGhr&H;utt?uilwIAnR_7jFT6> z-LL`eWp{=;7o1>)$H{VZFIC};wCjjFkWk!F_kY-blK-}G$p??)6hurgFu#_u`)s06 zg?yeUy@m1YK6&Q2UKE*O^P0-qkqzrSASa*`n&>gI-~^a3ThCNJjYVeEwIT5KIsy&^ za6Afq@r2&JNcKeD9>-7DUT7uvivZ)5VJ#j``$P&D;XcXgVzd0T!nWz+5kLPXUJRBJqM zZ}JW{@5npZyfg1&@~$@T1{+j|PU&^E9=y$>34NlYcu!!YycfU4=Dqo?HdOp?lNqP= zb}jnyekSj4^V|6Vn-AnACLcsMgKa*9m)I}?Fx2G3Y(AWiu=z;zF&{-8y@62PrRd%| zEo0-|d^8_p^RYyvim~F8$MMECAI}?`e1gqkFv;eV`4pQ^<^60vjct_TZN0YlIrf6d zvu%C{zmq5}ah1?QLQ2ex`qZ@Pa^>548*K)^>jn`L-R5^AtNeHLd%QELZUH{)dfYK9 z&S9a%%5zvqof zPV|zI@9X>dkCm2^3LaF&oTNro%m7(%5qRW(AwYEhP@nNPuHbbvx&a@da+Mj5V2)+| zGu{_LQ_Y%GIyuChyOFObpZpm8>7cnmptWEb>&7`wwGxbcD(j&eX%fD6E@6L5q8;_;+t zGm^nl%`CIb!4ui5sR^`&kl4BS;$Cn;IAp%zd|4j<9D*B~690B69%sRoff_sioF(z^ zz>oSKHTZ$`=RdN^{2H4EgpY(M8vw!T57cHJl*L@wF6-k^KNXlwEug*AAn#AuGI@5J 
zj%za1Mjm4GQ6yE+$Fx|-sL7m5;4G1GA@?Irb>MyE*DrM0yi#=poK*@hix&_@f?Q~U zh-95a6zc&w(0PypeF$=(Wz`{Ndzht{hrZOSG@#m$V9`v-=q?O(O(dDqPczvf(m`h^ zorE|ZM_-WqJ4_VG24Cq75}d*nyOc#d(FzDh!)yPEMyxPh5X=LoAdIry^Hws}$ zfGh|v28m4>rJVz>&R}C>}3nZ8*G`_$4-ei+1p}2dsiG} zABaQjTX9$wO&VDXF`r(7Z6pK3-Whb473k^lI3iI0lIYHQm~L5!lG7liPD4VXc8Aql zS`5BtclK1%uB?NdOkP1I&IeD2Iyi;2wZhR~S4cs7t(q5f`xhJZhcYaLM!i}Fd9aGR z;0HZ68-_qph9lN@3TBv2t6DqDkk{!jIHUkcNPhN%UOnq6^}39+98~q9zTQUD6A2y5 zSrcW+cUEjx`7l5vo`%86Rfla1tRhJ4*uk3Un7W+xqwWd(Lpn5WC%zSI7uF|+m5_|N z9>Aec;Na){5y6%Ih$TRORq%cg-dDwSFs_-n-j8b$4k618vd(dUt;dn87p^ZCB=RQe zD~nys#u1?y55n;R6}(=?=;f;oUV9Rfjr40#Ax{O8;Hr6^__i?MT$Q!V_hRT z3lUTmMH4t-ob38sL;~?FlCL|B{J*0ReJjou-U@uNne6zp{&GaPBlXjI+}O#`nJJwp z&z%9W^nn?!FvDb{O!%WBp&0Nnau}5Sb6G<&6oA__R^7&k;^;8i2KRNWEcr!`tv-&8 zw{d_b5;qxHp90hLHq9WLK-gq6Y!IQxEApfU?_CbD^Z8%&l$Nlw=!kmRV2{p4$z;|J zDC9hLzsVk;h2kL_TiwGD@M4deY=K%Zly^eqny?%pc6dGtA+Er<|AA&$!CfgMFE*}_ z%Z{ixXefHy=t@{y_p7cJ!JB^v=3;;t$1$W8hr`Y98?`;?$*H9iCznoaCcCWgD57O- zt@4LphyYK4n5sicpVYN{TPku9gh=F8m)kcX<~G1KI# zpFdfk5oi0 zxklebb<>l5|3QGPJd6L!;jO)b{p?#<3Z@>|>R4dO07EKbeu5o!1^6gJAV3TEBy&+A z2Q(4)I-VxV$cZsg0cZ%r^~NTZtTsX=+5oFOZY<4V9O~X20?1qg)u?QnNd#%9KJJJ; zNd*(;l&Fd(R%#-_E-J-=aWzB;0RWx>M2Cue+@Z@Ep!=C24PyDo72QYn+CZ;^kp|yR z7SmWLfh^mBCoHf!wgcylmi%_0sNEn58{7Y$ zr6$OJCPW8GrSIGhBO{$9R=PdpiT|cKZUs(K0x_)Kxuj%@Y$-g!E_imuwKTz-^1^=Sp-yVLRUjXeahF_$V zKanw9=ro?>zj$EI^f!wA4x?~gW>M@9m^6i92`V1+uS~lBrLpkB0*hbvKqKsm#X$%= z$*&cmm zuahFmBcer&DPk=VM{nZkVS+~_iYgSVO0gtVR8)hOmWSOeQV4J{%NCt8MJhC-MH)R# zH$?`ehvY=D$fQ`7DQeI)n*wOk1O^F+QJfUH*yu!_C2Aoh|D1}1oPBpuhmzH$pq>ZD zOyQ96q-f|BjYMM#nwX*~UGsr7TLLz2JfgWJTELpHQ&>w%UO+)BDx;8s*3>a=C{|>O zw(bBTH&Y6jyy-4r?`FK{;1L}~Cki@yM6u{XL06CHCSc+@hM%PZbyM^JKU4Iin_i~4 z1q~6sDY%t_KBlON=nZNE(_^mKbe`F_su>ig6ycRE)Ras1sNgkN1cPVxk*;JINy^Q^G0qeySy= zS>g^$+-V62ID5nl%JeRJbvHG9CItby)y;sICyd`c)$`5diZzZAxq4+#KV?=u=7drs3{h}4VQS#5)gLY%|AB$SjM5X(ta#3kk+IORyxTPuP7p*kgRa<7EB4#=ZPqxqYwkB5y7yzhS%trKOpB zp>CAF&q(t^`YHX 
zhr$oEb)j}Vrey4$h$&vGJU`yJ#;3~j6Ck$z=ux8=%6f1jGHg{PCk)E+r#`JV*0ey@~C1Ygqi{LMwpK4TrmR8+C;W090n&Dow^4>Uk zT1WMcZ*_H&J%y1df3|@273)Sfxj_d*aV*piz)(}4B;5izlay(3Zu!YcyP5J!t*}x# z_>|$7+XnfIbuXqru@q`Vk*>TRe_ zUVhtXB}+1)u$DjFcuGzlxXNp-^7fi7uxZ{kWaXv^!w(+FObMG2C^2lMAGiZ?@ zkxVWbR$MZswCA{6NGDxRd(Swoi$$;mUH(Ce9QPTFo)vrw|1TTghncJL@8z#g`ZE{R z`8Gk;t`pWsj{d;t!oQW9!;Lgq*vd#;av19Vx8=E0<;F%4Lh^Nir#JWLLwD6CE|EzdZuXDH}U2+hd%+E_k_?Q8TNLI*Tqg->=L_Wl@E)q+Ht68Jh~kZ40LSLF!xksSDZr#HPK&oO4{Y&{IAe=<#e2%Oxl6o{7(_(U z%07|a6 z9zFQp7C%7iZ&kpk9m&(!C3^NRn-_pJ0XhbESxv!Wxq^(ji|e-dH<9x9@V%7my!4$l zl4{xF0#)^*_=(sRHEb9XZexm{ZSjlv)dtT69lG5Xzu`miyZD37lrb*xr?{llC?dUW z+jsBLb3o7bor;>%K>sD5k2TU$>!L8A!Kh9=1*`aKn?J{&x5Z^~#pVwKJjxGZEz3dP z=3$gL3W2yP;QO$r#!QXdn$R>`bAjKXL5ac8Ok1-w)70Fy=Fz+rXG$f}L|@sq=F`G> zncy8!@WbhOxUEHCM6^hhKl_ZHE)T~UBRsa|$53cdcvui`G?!hAwHTKct;NU#XN`g? zTCCU!c+E{QwK!Xg*Ai?kQJk{1DoBv^iE$foXZJ69c*pJ;m3imSk(yv}BvF z(Naw<4fKY>R^3~7X;Wlt=~{+-_fw<33_E8ui`%7Dx3x?y#@4csC%>D|wD~-KKYUEa zHHosd8d^5Qx{Bc!BV13r)1?DeD{y&+^p)plEd|sHfGJ6ZVIt_)V>Wtu@jb z+gcN?sjcN}&17mXUk#Ublh$0GYUxPm#`TsUV061`?c}M?uve}8 z!l+Hmic0*RN}ML55R?-bt`oMbEDh}+dY$s}Pk-XLe$pi(<+4DWSH>RWF~PVmjS?>B zoU+p8#vOjI+;~k(lj-CAb>-P`2wi*T14}s9P|uKmHIDY01_%WmOYf8e&+4-ceeh`5 z4k7*V8rYZj4u{c~m8nMDqUZ%#iwa(=Ca*Mru-d8*J)Ssw(7177lW6k(8+{c_=)E zwtc0f7UH2N$_}YUQlwr=Cf*cvl`X!Baz$F?ihgJG%^p*P}%xf%dlH~n)3W#Xw+6j6@fPcE=q=f#gbhXd*3AQW z?cv_O3|D&+7G_RMQz)fZr38Uv5)S+D$yffOQWlX^fdBd&u*Vu&{S<#d*hR5Cp z;sB?`QY8UXAVsAC0yK#yP?|)Q=K375f2!o5*r?R1Qp(`ByLUO5tDel({0$({+=@P#*`p`(gP_k3qW~efdA60eWKkAAdSY3fA;p!(Tf%5c4NRO?y{QhRp$*^D|T_UZS3CfODILc*_AC_6-7um`4&l9!bR#;M zKMcWE7QO{eXZ?{|m_NrKwFN%icd(c~=1vyd2SRayIQaP7%;Mobd^1acZ_mvv5sE@p zf(***NjV^bRolXnb6ne4$`+QI<4@bd(vd+1im6V1d3Uf(M6hyjvv>pY$Y#AE@9?(c|*QP+h za~8N!3&D?CBi`pEx#sXto+Swh+_^XeBtqRW37tW-#&~o`9)7j(tG$kSI?=V}ZfyhJ zy1@7&q4aRfrEp@(X{tub&93+fPBVUm*Nxwq4Gvs5uCe0JN`=Ndg>quUz@i&OIc^6- zj&Mn*2AJGrG=Z@RL`K{Wp)9b2AUo5k=5|tHS;|@>g3WFZYtT7&J8OvbZx3r!jD7#Jn@zD4hKEV>iEsTumH0Et?Wt-7A*y8#fd 
zbEziffEva$@YFPL*ff|&(+t)PQcFEGH|wXtTCfHSb^|pp8?X6P8;Mw%%c_%*iDJdo z=7V6Aidb<)BxZsF2ErWEFg{$t+=NhHj6ic~NYbDNG>DHxcTn#{kgj34$wyscZ91Tv zBJdo@x6@6U4}*Yi5`k1Ssc^rtt6R8@wZ>f==dR5*R)o9Mt*q@*22i@Wk1NR9b&6)~ zTj8!_ABiBQPoS`IJSs!QiplfeKJ~5OJTo3eBGov4F`fX5`=$YG4BImN4NXC(r|JIE<{04}RAr_`&mv;ly}U=6&*e(RWfE{8_x>-0+d`=={e zCi!tFt7oZGPh>jOKg_9VkPRnc+7bJeoU2n6x<|QdFe6mHq5HBzNn#egpNYpVPALlU z8;Rej93Et&gKP{uJ!lN@W}?5xp&!OmKM-(F#1H*Jwv}N$bTZ6RB9@G@mSMTs#-?m# zQ(^vM9j2l4csZ)zJPth=NU!0Yn~J6HaGRqD@3?m=zl}}X#_qs#MTb`){RpVtOpC&4 zA)1A0FfF0Qv#MHEmZ>GN+FCVMUrT17eX!12P4r?eyIZTpp3&;CEn0oHTWi4fYmL|$ ztuZ^VHQ|QVln2~eBOb2Z#Iv>LypvYIhia|(46O~nM=RpbYB%%ewPJoi>&j1Q-S{V3 zcmA!`gJ02l!Ou`{k)qwIrVX*yYeV063?GYWP#;VBI6fZJ<{+EPC#Y?OiPTC9M=bM& z`RZI|)A?TkR-j*EH`U748dL)Vnog#ng`^OO4=p5&K`z! z1**r|aMnv3!AiA}Y=kz7P144w{vnUe+29gQ!u|}G04!TACd7NL79hbhJVZM~%ef7X zuG%sobW;-&Y17avWh;#H?hf=2tuWw$z}b*VzWlJr)VKy|W=Ky?R}W@DfQ8=${W&^m zA*8+XGS(cT)c4%M?(c&g=>d4khE&6Y=qD_&4{=C%WT`HgudB5fo!w?Ddl(y}|B<|H z>`_{csdMOabm#-59**9p?-N`bk+IWLZ~}pzn#SDP9V}9t&Jwj5tc5n4wbSlFFU=`m zXi*W~C!n@4ib6ULolddYhzV>X9r-c&KL( z^Dm;{kz8~SB0x5%gYY@Fykc~!0?`6Q6o@`fJ+_rC*4MD?P?iD0SNu!xExqyiB?V7Z zWVP-H*J zxs|PQtzunk`d4pdPcLQRdH&U*xy;7a^gmPf@R_oQ*byprg*E6o2W*G>pRMRc7kO&0 z$p0CZkr&!W0{*o~7K6TDhrx65oa@3i^mZ;r?KuDex!HXlvt_-q6dhzQ;C=)2IiRUQ zqs!wdriS;$e1pR{g4zO~VmF1_rJi8Lg4-at2~q%&N-+l_0{QL)Gr=G?KrfXoSsx{s z`&oE`>3?}U+epq*;4hMS{F@R?Kv_Q7@OM!Lo9PwSyo%Ppf~Jt?AfNwL1fhNi!DLQ? 
zi46^<0VxA;=6e!6RGR?uNT+BBK)ng>1TX62RrOI@-?b00?QQ9kVECWJ!ywy=*mA^h zSwR;!cFBB?Q!C0zeRTCpda#`Z3Fo_R@zG~Tf=H< z&$6c2Nei{-Sh4mz>#eN^&ai>qr@ai2-^gCUb-VT|+pBG1N5KR?4Xpj&z#uL{ZRV2p zI@h$F@JGCh*Vo?Q-L!ptfc7Szryb;vYKM8?CG7}*Lp#pD*1&z$PJ!=w8br9aMW%K} z%_TA{`2f&Z+Q(cxm4(Abz$I}0z2LkON@&5nXC(Z1U4ky@6Hxi5XC8JDiU13E4UAkj z=qo;kw-(Zr@`X4{x%t87AeZYW%+X(VvlBPb&JAL__LIMg-Lw|%3bH-9Lx5w$0>%Vh;3yD z)?GWWgB`?9c8D_B%nom5N3e$ITiMYa>=>ROPcX=Q{)vi?t%bdXBzFTYv?x-$%Hn|Q zrGf}l1ev@pE{%d;U?EHMA@NGm z(iO(S;4?%u%sJ4*i!30JHhXycb@JjQ6huw94}3TiJUg<1_I9%f?z>KHW46>+kzzvkD19lWq%}h|&6BnR3-YG3?x1*@t`B zM->C8W`?T@GhJ07ij<7;OJ$j^G*;i0&hlLutd%R9wR7dLv94S;#g)gV!S@G^p#$Fc zg~lr`0KuWx`A)R949AUAPj=x6LA;l_!Y{-A1>c=(LC|Io-y7=W&gf$5WIw76oZ$?1 z$<6Fzn4mdJb#zD?%Hp7_5%ahjqpzD*>S{mnjj$T^CrT5Dr(qDNc!Tc?|+cz{_k=P+{V7x!@hilosD8A zx3RBYVa5*jHKyG+udx1r2K?U@a1GhazU@I1kgKXFS{J0qlOc9p7qaB}>@6q&e~1I$ zM;Pn3kxuY;;7yZU9EY>-L6#>`<$vM&Z!Av`8-jmLaz}ZRUBvY#Tz|&(E?j@X^;cYf zD@cU%zmx0_Tssnw(cjl9v>*ZY9nUF9fHaoigAF$*drSG_jEl)YZ&Gl5f*sTv79?CR zdfan+MyrGH8gdXyVRyj*g0f+{@cwxJlA*c z1__=R@-DO#HX8znuAqAe@Whahp%}J`g4Gl}O~Er>*sgmP&V@W|9ea+RJa4h}7JI=% zqV^QLL;-|zHn5EzGE6|RSKt?ju3PBIRu2oZZFIZc!(L-MJgC;|9!Q4o@=&mo7KI;;$}W-W93K zI%}d$`O;z6a5zPbt|?=WxO%7TH`xL7F8;%w>J9l!`X^CZnR3+C&g%)8OzbjcvXYz5 zv?swr#@HcUhITC(0U_vvCKMe>v(;;i*r;+=bS>h&+c;w%k^>jH zvI{~EnBZxMtj8dLKqmTv%rhJ{*)b>!liQ{OGB|O99kl`Cr^tRoD|D94PQ&xss?_LM z`S6QTUa;faGAISU&E5%_YiP~d*x0~f85u6j;i2U{o4wCIfM&9dx%ZLH;ANL=I3V^Z z#m=$M$dChqFDUqug0Cp}niP${v7uJ{9mw6c7MtvQ8@_OVv@u2hjVIl2Mo`!qT|ka- zOUPL~f{PUVL|WZH+nA>}{c5w{*zZW$x6p=r&OdG72!CPZZBj2LMa`=w18bg917SSP z1T%;Wt||F~3Z}lid;*@P3a0oX<;gM8S#tSFIKG<%7PIX6o1QCMp91t1-*0$>znyaB z3%S0~m?gh?$8}Nedn-0tcHSHn9?W>xRSgHRRDg36tPo^yh#WnUurfwKT}djI?j3he ze(Ha9kB(PY^sKs~bJZ0ckE;tiB$@dWtOHiQ`M6iO5-6D*Xn?>Xi4&8HEC}j3f!r-H zCYVQL%sfTdbnC0IxvvAkYYV8;c!ndPjiW{Akq<)A+kxR?;t_*IYoy0OgjhX!npMT7 z96|RnI2%U_po!r<=IS_Ae-g^vzlBAG@xeolyaKg$K{7fVb&p2f6Y!=Q z=yCO+`d`GlLRqOBn}P5?(B~cl6^+$ zsN{?lMu@z+qqIN|Ja+J~g?Q8(k7%F2jTzT)g3(R#J$Re?-xp>9NIk6oHN5}Z3+Rva 
zhZxL%dwUtAz)N8G+>Y-B*cHT8C<)+#QBv~YR97#$&~yf3gGxu8Dz>W`Y!U931C9t9$Y=yQpd1EPBF|C<11q6(HB7<4O5~rfP6aDbN}v@+ zoIV+eDW$8}D7-dMrNt<5&1MI+h4&Y&|Np_WBMN+}Vcmj$E1Qh@L7LYLk z%FF*DNdUkFnmMHr@sZ%eqjkj1qnY~|3a9pbijLGVgf;NX#*au-IOM@jCx;udSD9}q zYlla92UsiIMeo|lYULYRBQu6ZIeHi7Wr7i762~Yy#w5!hfQBA(L0!6bE+JrSV|5~G zMHFvkb&(k*iqa?zjEf*ocwwp!>m)(eAm7N-5{w?8bRh|#qq>vUb$wdRNo(c-IEecc zvFj-?PSpo~91ZE)h;6JP7%RR6gKh%bSfk#Ap=k;tilK^0POB2wgOKNc2&~chATB(N zv3Udl@G$1(0u0|G=;yEC%B403o&p#@<&vkKH+R5$?PW}LGZ51mgHw#PzZ;7~kJiRI ze>02@Jzw1%#c!1{nPx|lz0EX7supj_&pA>r65B?k?(mS}7{zALGsw`#Dd(NE zLBkukpRA+J4%r7cm38n%tM^6Ar?bqn^7uA*qPW(=JR-X+i+4+Xj+rHwWt$JlbP9xt95v0b_EURhK34W_8_$ zg3AM>ewC{7Z9}FYYOn`w>?YO8K=WYo|Ixw56hhFIbyMCcVX)jLU2V8IQW|VF=FVfv zoU_fKmP3MT_2U<^MK)&8BIxGAKDjbZPUaJq$!<4=3JWIHF_T0OvJ6S61sUiknV2Np z32e`aI89aDr(m);hVz{F>3EMiB7LSxPYT6lu4;2+t@&mZNWz44P_g2>vDfF~o`{oeOHzjny2WGIln7S14mAlh*jgu_8@%+=im5 z&uRn9GxspjsV0z!&IsJa8gW5#u?6tm+8cvM9BnrevR-(k13uuuB1 z-ubc8RmB#Vg=-!pRm#SNd@r=c8`?H&hiYUY)y=@(;A|E9x8psORPO+f$eniP1X8n) zA-oAIA29bW_^+13Dse!+0J8WN?GQfX>qh$hT5||1_G)O3Nwda;5N4_M zP)v_8RZM=>&}^(AX(qstu>@uYW{h6O`QkMk zFLtww0qi`#s&1N&CMBUGx}%#67%ph6I!s4K)v>JgL>Ko&*H9Ups@lTrkYMV@3aKmuBvZf77aj*AhR{lU2*_K zF={43JX~=hMl-OrYNm#!6sRsjI{=v&I@c9y#vpzfyUryReTnH2t!Tw_-~|xq%pG!* z7;)p|$*wecbO%%;+F+3w68e}91yne!sc&VGH+~&(J}*;vFI$TY%n_^*sun~4KWm{T z+3?WE_0&{?Xq>(iZnZ)Bic+Mn=>Fi7#%yD;AZ;lUR;A^p7W&Cq1<3L>EyQFh-9a8` zUeaGP%+s3;`#R*IaK-$>Gd5D#E^{I^8KFDnL!(U#;{_@IkZ^7aFo@;Pebm)>V zJQ($;dD*MdJb3HNW9MP-o);^hg-DYFQ#pCLIW|VRODT7DN(+WYFf6V+d~JGl?TT3(fX-wl zx7-k4aeGL3mSQ#q;0lC!98+QjOo?$nkD|xXUf92nF*!sv;$U-)^!MWF3R7afkSQ@B zRCJRBYit}mH)vJH@)Y_u6?R#98pYBnmVvASyt>6REuIAvTcnSRg0eZZDSf=A#d9s5 z=jE^hTi+BdN{nSLVkY0c>6O%!WRWwoGW zD2fa~0deLQ<=YBcqwRR1m!v!?05hts7yb@!HaV0ir}Or7(}7ZUw0I|zclN+=a53F? z@j{@p(B$2yH@ce~I+PuFq!+TBcnGz=>HHSht>wMlY%0H%()6Ks-RaZYDCkSi`gwT& z0Kc6oI>6)usfH!gu0b9?m=AHY`5XokS}KE3^msU|&GHc*b~}eMWh@^>Z5!<&IVBH? 
zD8c%ZhYjTu=^YG~0S6r6+)Ob!Fu>8y%`}q(6CB4!D*oqm&^jk_AcV2}E)NOUm>k-a zQ~4~5&$jqICZ9u{4T@`L4vK3_ejjSg=h5x`6oB^Hl7sfznLk7^&|X{ehbe$IWoP~< z#THQPF^Vmu7)Y=!`Qs*EjCE3JDK2_`pB?<;nz@$;-{w|xEn<$zm%wW{xn&3C0lJ1M zy0uEyzvhYx{;XSmVLoAkW6b)>k~ggyvRMl_qyn^u2S->|bxpv^)_kw^z9-q_%PMr=e;NCDRMYx{iY=t}(X5A{skF{cD(-z2N{QIUU?}@ZpC$5A3i^3QA5}PmQ zE6B{5v|%o1DXEu4BiE;FzLKx9**d=3dKoA2PTsP;5m zu3U(+>O*2+gUpP!TKd;&Ws`9A)p zGA^dQg~)_wVI-}NZIK6+pV==q-_PH)Iru*E@fd4L$~q2>uEYF@$&Uh6s!)G|mlB(U zEwf~kD_X;VDBQdjsG2s6M}2661A}fm#i8|c`ox(OT29Qj?G zH705u|IOyV^FN`$PPx7g#9#7d?lie)uG;|ktmA*lCGpnmo)B$-($rP~!1r|MXQ_Mp#FU3Gkwt+={C?g|DMe>9h9mMkf2fPN~V@`Ex zgGUSvK9B)84@_4(D0ylk+dpKTfy@DM5Ep1aT@17g$05Z11k~yZ2v=7Cae2BGpi}|~ z%kNhrJy}Ph?*LWh&)47{;DqONcmyki2!GF`Bs%t*nBx5vc~<$$12Uorv=BunVREBS z0%&VyP+55(qAbWM+aY#>T0zjth9Hy=B;7E7I4~f>WBh(Z;P^fYh9H70W;e2k?Nbvr zXXCaw;;@uMJP0odTUa7xhF>CN-yPwmsw58x@uG@+laQSs!WhYYh`5uojbxJ}LU~i! zk$gdrrI84LKOJv+Cx8>@)Clyz2+Djr&_$4EBJq*7lVxsWSzB2kqW5N2!-d;Epex}G zq%pIJ%^goAm4Af-LGaq5M0+7$mrJcigDPqV0w=W^WT*%~2wOKayiOl@bg#RG)guhb z1-2gpVhwH@n@EJ`NjPduhETv%u&1X%RP#<8Bc}tSo(U4rY~bK?**zfi%m)?bRrUbe z1B%l@9H=i~yAhC{OJa}mYzPi^h0%+CI77dHYzh6vSTZsrg+Igyh-eA#ufPb{CSot) z*Z@0i6nzr5pk~agN$*Ldwcy+f0a3 zgLMHC(`%Q>L^wnUXw7oK3uj~pKPK@pBIY?rF8Toy;8ZVEfa>iWFs)F%IZhM1F$n;l zl@RP#8IFrIH>ek#kJnWCSPy;d%%45Td@p+=REs+JiV8Dw_*T|H6YOElAqYg)UA8k=THOQPgdi)RyH@l?A<2DTJ5d4I#d-eL0Ic4S9jpLr z$cX+OrYnIoz*0f<4%UW%xn3>;GRTY>R7CA}Iqi4Y9^l4cde}I`$Djhq7`!ZuKnIM$ zNOa*8^ut{^^R2@9aubfWG+xB^=zufkn>ZB`*`ooTAH<=6PKhDdL+p@RfUeWXCUo=HiSntQry* z&>1530rH|DYE0`DmNL*DwnG3F%>h%6L}JwlB$kapyZnGlnXiU@7xeZQxvEbTNI={M z`J6)Tr!ghp#*}UGrUFjsZ+MLF!V;Sg9UpPBK;mgaOu>_%op67d!NgRHABZW?j~Xcx|&9fsqCw zQHXpxAh|n6U&;W42GotZqK7G0LfqOd?gp?6jQ^x{dnhyT#9MDkaLx$oBD9CNay7nf- zx(-3#>m95g--8ira2@)KrUHq%AUCxJWgo_=aiVU91M){8-@q*g2m}Ni26C(hV}{ES zdlU4wPgJ_Cps#$&0+etSigprWAj_Z#G1|caMOfy+jK+XRf^PpgAWK6#E}}&X0Y^!o z{=mTippsTgRp4Uh{v!(#M|+N^0`j_QqBVuX*KA80s9zNiaBHB1Lf`l 
zmN!z;YNiI37By!hbKa4NjS{3(fXI-fdUj-W5;8@!42L=}hO-vDAWma|avOhh)dcIOV&Q#g2(ZeP%3W2vv?721l^jZ@I2ZU>Zi9t^RzDy6!RI7(x%On z7=%Lsfxkj9#0B{J2+PlJfRhig3jBeX!F^bsNHZJO&Oy5U3mVxLZ;PGjDj<+HfxF8# zUBqaS0pK7;+b^VAQRQoR;J_!SiO!b+PTQ1|Y)v_+x^IeU+O%mxwbaXS5RTa`D%|hdS=V{z^dDm+Agp*K7wmLmhkylc1cRFC2;u13lczvs>R+i+z=Cp#qPoQU{Gv6 zc5yxi*Y8w+$8yZy3bN>myIKZ&H?*G9712-LT=*)Ca>6ht*^A4N1LwKJgCI$kIMmNI zMb<1$5f`)CTMs$0EuE|f9BO2%o>p$OdOLo&m0Uw*7<;~B+9LSRSX4z`jD_gWz@ApB z9Jt(qB;PAmSn!tx*5WD-wQ_@%=47+!6>C=`y#*W2n@-z0aI?n7G(FRgm8)C4@5 zazc(B<4|8~&H~$V)pKwRedJ?zjC?=MeOZ0=Ub_1&hl0AYhWjR;%JY+k?p9HX32S5? zz3U`$I?0`;%~~H{UCv4cHMhljWTonNZ+rJCuDXP&+`R^UX@Ev6G;sp?) z@m-{xwF3lTPl@||mH4X?cNIewva`>X%XdKv;x5afSgXWcUx@bvn-6!-w^Zt*I)OyR`B5|?$;cuc859c0;ft3&vhTGrE=*1ikmylE7{|I zDn@lwn=V~;yoYMOCsUGP3T4(FccgszUAITx^sf6G)jgT-x%=xXcHhVDpR z)yd-k?;V{ye>fdBsh6jY)7-U#Joh_9^|~WH(0~EEKGNc*~X7KvHJyz4F33K!|F`tXxlYu=p2GA1~!E*I)DGs)9ba<~bo$ z>;>Un;$+pv=l$4eOnRjEbRG4| zfWw>1bGcYPD9sWt_XcCudhckihJV2q-ut~OR^`06NtlY=a>aYn0hnx;{hkxky!Ig{ zb|TVV9Iu)%EXVc=^~IE2yPXr;Q_IF0M}W|}o_)@N@60ClMTggu*xY`}8O$Svwxz&H zzT3gBuRgFl+FymKo{hfJZDy4S*kg{6tB-@p)AdliM~3&eBZ7Ii*)<(V>O0bY&0+Gy z-({ye(9mLmeZK>vhn}#9C#k&VZfW0Ya~NZr!(^*Cc-W@AmK?%ROxS9_?zFegTlS02 zLUrzteXj!z!;jjzPCMiYdx~3CF#H4ih=LG#_6s{t}pQcHc;{xDR{}`kjl<+sdpxX&$mVar_aIfR8Q(|2 zo0Q306zn(o0m}9uJofNI6gzAxO1TGggkvT@4vW|vs}S@Nyr6}jrr>SL^c_ljhGOqh z@ScZA;uinV;va!UQ^Y^E_*sigxEAqGEq>0!>rk3HWWv>e*-^y4px{eN@D)A#nquEj zN#D|o@95e0o)CT9;{Uezd5i40mhuOcK~|Grv|!8?wDDgo{wodaZ&bP8(f_UZX^a0s zE&9_UGp?=p?=a&^j3bETPlX{@6mrE2>x5{|Ri`=EJYY(544@ELkcG=FbYXZ9m>yvX zw+BzXsGtDru}JvnZ5X`>_rlC&q$T_qR}qC9l+ObvM6@MhFvQBLt0lmIED{N(NVG&1 zv<*gGnE|7&2%v^v1hcLp1*TjPq|r0*9u1M<5!GSW6G0Xw0q?O$WK#g%V-XCwid>QB z5w*y6Ymul!;Bdtx^Q~)Pz7<49Q6G=Ui0fL>kWw_FpfLqaC}>In*pNk{83kZN774H+ zi^!sDk!a}=1!TvyNEA}gni}5*z#@t~qOG_Y;{UMWin80Aq66J@q?=9_jJP6oF`8W@ zx>zveiZ8lRtGWX=V8vDR6ukhR;uaFz3af9S!+# zX=Gj5b&J1_tTi-pQ1JU-?6o|~6t{!(({9A%No~hXY(I7qtai(jk^WeDZCymw;M(8p zC|>mq*o{RxQw*dDAWBRz$P|N3F$7b1ak!l*_snq_viK`*$qSqOG4h>1>^wb@vz9ev 
zB=|f#FyM7mdG>(!if^_Vm)%cC8G`XoMqpFGkv8lo)M`F$h4uJ7a_1hu|Ao zYY=Z5hJ>Tx-iO0BJ7{kX+wi4pc7}H>nZ@j5i}7NDJbTS<@1H0p*hK%aLGVFv4rSH7 z5|-Kn1f0)pF^T;|N&!FFFe*ERJOYig#Z)m3LurdU#GOj1z-HgG??|HnHUFLcq5Mjk zV!AD6@H=44(-wE32ywTVDK`xWON|n+qic)V{4QJEBj(s*uDEycIbR)c{z#Dk6~B+H z3$~XN(qO4>#hAzx7oN?Nk8)qV+5>HIzjz4aY>W9kV2g*vBer-{%(HQ7jkm=DdiI!@ zV2XvdSR@{o7r3vPwwMQGE#aGJEP)jS)=&-;zUuOb@I~a?LJF`sGkB3LmWpMzc#?wU z$b}d4F1A=f`8_38+F})6kmV&)thU9|;u*)|(@mPMm17uRYCfN_#um?twQ{}YOS9IA z=WMv=t0Ldge7WYsI72g@s*L6h5bMPYHhN(L0Mr&QQgvUl#mi!&EjEeGu%&B@SH%|C z$hAd~EKF{v6tCH0hj<)^`1CxGPuKHtk^I2xO}yZ4DkqQwtV=9ogou+U;IFEKpZ3wQZNrV0D(9p4x8eL zEsm1O%;V@;K9l;SJGK9WGJ&b=U&8n$bgk3^dt3;LHLdq8B#RW1^iBvy{pG{Fp)+urGD*@JT0_rL;V@oI3o;0-f zq>@D=)9eI-uX|yu309h*3e(dTmoTNt_@wM-_*zvFR|&wcL1qhrTf$W4C1G0<6vY#O z(HJ@Yyj?|pYxuUy9!dTvwPMJ#Ccx>WZAQr+L$M%5JYz%#qb%PVjSrNwH~6yU`b1Ya znA^Vo+z38m`wnsW%5h{fXLeYmEDiIGmWB0W)8v!+;qh{1f-5E1KHT@Lu)VNv3|9l> zFl>^%Xo@J=)dw?$&)LXq&o6e4H13GZ3dZ<-Kk0!{IK3p`-w|*hI6Hx)~I_d7wU=?LtTkHPu~zhn9>nBn66k6)N?qBRrgpN)D=sEx?-_Hd0!DvK!8x~ z2W{5L5UNHh0Tu>z#o9m#Rl0v89aa<-2SHI?vDT?8mOEViw2Y{GtbFPosz$iRyIizJ zl;?C6WG5mK^?+4Byl#XSB$g($4HSY5RK{-`2~tp*F0D``A>1N`0fGqTCAT<)nkf(Q z39$o~J#z3=6|B=_aLBqZ=i0tuba0#ZW@y+{?1E(8dI(jA@ApS@-`qRp&dfbCbLPxB-=oeb zjHT>TT?=EW`&75WSa_f6ZjN+K=iH8Jr$RL|fD&ptiiA0+apt2&*p8FZB2F2)#np7A z#4}Y1+K8hN9gXA6eCQ{ zvx2vg=zYu?*`GMe|AJHef061(^?)2lAS2w31p8qm*pHwdzsG#3oBeN~;{wviKb3Qaa4p;2~|JQFHq z_e@CW6WLLmppar1zA`~SJozJ}?ica}qJK7(#LZK|Y8sX1;F?hs+vIzsGjqxPg) zYEKegLPm%zm6DZOPpA82lw?CUJ&vjLc8{r^60B6cDpj!%ifstRScY?~>Zl`}AU;*E z{A?*-#y^FCN6Bz9-C--_!WuC$(3kS2&=vF;+uFd9L#nrkxf@ogKBEwg_mxBE_5}J> z%1KBG0J7Z$CUC%16jKFVj05UH`#@)@QiF?wnQF)(HB|Crr#0-58qR1Lkr6>BV`Q7B`Owe_}tfOkXD@bo5lOH2vX6#1wN?i(6q-#@z5-G-sZuGcHH6Fjh^~dp9 zj9n>#sAl3aq*)?evv^`eOXS2&s#*O^lj>!vNk;KwPfK=oz2fR6o4iT2tY6xcUnbMV zOcTLly0594ph!%m>Kk#n>ZHj4secX{8M)Kl&lmJxkEjcmAsL!FP$^JIJGcF z7fdf^&Pk0q#i_ASu^S6!VedAhIP3>z7N^C+;3~wzmFk?0DjG`@=$&{PcQrDrB$g}f{ZhJqY7n3Hex1 zHL?QquzK|`k@~ynBuE1)I6R@+@_JM2IaO-ze|x1V?YBNvRj7q@pc-7)f&10GEvoAw 
zRaQHP&zFw(`?i??=+zOz0(5fxDSe^9>9yS?W@A!RDeaR<8H+6;6uHgI3e3(pZ`md$ zWXuc@*`F`Is{K`B5ry(JC~2;$RExRISi%KrfYz2){M;I9#dDJn&_HDqEIaLfVvjXjmbJI+4Nu!N~!6= zxZj0531%hN^xbuShtQ7e>-A!FA6XuODU)X1Sg(x4SGb9^tfn_-fKMi$qb)!gHiRXR zS2t(eT31r-DypZcxEP?lz=H82%jioeqP(IC;JjMEZ;56_U%0EOEJkx!h!(Rftzv6+ zDND^drt@1^OYUMJ+rl2|H!MK!tFM?ZzvlSv8$DM2U7w}?rO#F0>kHM7I<9`wH^T7T z2FtRMY4u)ElkR8!-5rO4*~jVrlR6dM#WW-?>H0sYQ&;OuE36wJ)5x~+bVI9!Ze(@Q zF{_*U$?Avq@H6oqK2tZf&eQqU1-ihB>t@y!Jik`AWQWkox((0byL6%Tj_zpvUUz~y zFS7okK?c`dtbgh=V6S`PRiT$1@1lF#-E<$jx9)2X(f#ZtdVsxB54P9nA@*&0s9m9l z*$?XB_HjMJepQdO-_~c^U+YnUkRB7r)MEo3^u$1SJt;6ipB=bJPYnQb7g(of1P-GN zEDL-T*5np-MCY==gfwp9K{0o@ZfAXJq8*$q>HF19iHEy6?c^cP_Ng_xDWR}_QeCL? z38m?e)Y-a#j}g&Vr~|qgpDV?hiB3{;DbN~#+KqX}UZtUKLAqqMm~E7-Y!3UMkNqmx zU!!E)QnzB8a9sUH$0a(|M#)GESTVdp;7DTKbpv}v-JEqdJ@AOeWwElr?b*ta>`Sb@ zEduv&?Di52Sl_^vYvtorSYcMv3?L_6xF)T@M2OwhDk-m<`S=7%!Ux=c5|5mMjP_2H#j#l?CDA7OD zXS(2L$s&;k&q@NWU7~+NGTOj$8wlHydRmcj4Pc-ol%*a`0gDoK!vQ?_33rvcKl&0p z1=Q#tez9yxxLhSq#tk7aUhinqW57=$YZF|s5YBDury>RLjDe%2uP}S#$ zbSbnrhc7WVZWtm_3rj{8jcaH6h>~hqugrR!5ek6AQ5xsn$T$-3zLK1hLndc`ZWr5M znaW2Q>tk-ZSE@Ux*|aTKH1DaXeou?akk6boQE_2z7xC)F2)UQ2CVD9Zy-c;%%aIQY z+PTlWX5f-(Bs3ccS0fEt6H}HOMzq*SVk!{h& zI_@w#ZaD5sp}WhVpv&O|1*dv#ZLd&HZC~Wz7eqE)@~i_Pyl)YPUu;Yep-Z9ykMl@a z|GomhRoISw*4$X&7&^CTzY&Hx+_X_5sNLP|Sp?F_9k^5RpsXj?zRxzvQ=FuL3*=2s1fbPByQSqgN1qr ztg;1hZQBktj_@`Up4VM)$jtMt^TeEQi)z?`C-LoEwquGW@IE{CtDRX%TU29o&M08J zxnw2P#?|$SQKHnFnH0BTpSB$r>1=$ONNaz*4Cqv^mv5O-#InkjvqfJut?c1uP z7A0n`U^?h8p_5AH6VcJiQMMWOGPzJxKx9;j?j4_?kX`LT<+}HPy3f~_G(1Wl^d$|8 zvdf@Q?iIAk`!|ARF5BY(w+GY%qOOZ3wXvnxTZejg}1A^j}l^f)Iu z&(YT}0^|BBz5!oT^YrT&(!2r0-J7)FZFQx77jET!fYm-woAf7YJ3guJ!ze%~Q0SFO zUQdgoM;&0<8EBRtiP}}$=ezv5$2^-^+vg3H^OH=I1AbCPfNf)1&n(*yORazi024jD zlk(Z;sw1?blE11-wZBpw5K7$!!5su@QhwUe{;iOgn9C9cBbJ!~l;NK+s_#!$ zsy+0__U;nBy}Lj%&_^l!Ox%btWv@B|CRu(cze2@ex3akF>Obsi9-N2Nb!`Tv3` z>MQ2RuTh8ohOYYu$7KJ+q271uDimFA(Lbmi`lNacB0P*}NIF&Yomw+(;Y6D5?gm>? 
z3Kz#L6^Avcq46WN{e1-HMJA1idQ{IrpGJ^*FVgc&<}qryE;GaDDo9|y89lcEG1j!iLKtQBMvAD5kVS ziUx`$QRRSM#Jb^6`}H&TI6ZUHuV&^%05ORy{FveT%UEl^1=57D>`yYDHv11mTEei) zM*kV}yI(z9Hy{kY7zQbhl>+>Y3xo0HU zW&>^yIWgo`(_0Px1K9*bg8eHUZ*;orf3+sR+VPjw$W~Bl5120+H6ls6H|9$vrGx!) zjBLmPjn!(V?a4g$Ng$f2wifaJuhb;%G&!o(@>go2^%}iaGJ9h?g&TI0Mbdv=TNuH@ zdf%_UC`#nL>|#KyHt+QYe3A`Fp5%o;qShUL^+i`EUUc~<+1cGqYX414gjToz@+OfW z10Sl_`Q7C$D+=eAJ||s+CzzV|siAb#Roq$nYBqAr9!uIPIRUlCA6XtJd{N$#vQ)q5|^QqVT)EBG$9s!1>S9CdB?nMv3~GXgch9 zsZr0Wrai-NXrm2rWXA5M$J4RDPxo&7ces%^u1mO}s#kINopcjD@cGrhk2XMEuYs== zVv0C1A)H+`YE(GiR{&W$JABYF%5Tq<-1A%|^S1mA@Crt8Y_VoB@4WfdM(wejan@=@Ba*3aAM@@3rd8wr3rAxdXcSg_ge!L_+sTn9i zryuK{=33HAzZAE5`42?%o1c0#E1r@<;?Z$*>9XMuRcLF*hJJte%GUp*gSMM@&2aVBC($oxuZ6SsVh?&l8^O&_70bU+On0K#ovx5B)ae?KYzh52V5Y z73C~LM1|Emh)rj?+s2uUeL!Vc$e=ROgYF)vP~8%7_o@QfMG4S8Nh?*>Y3agovFgh} z{M_iUsboy1f`F1<@j;Y2Aj2u5^E%UQT~s@D1~xHWk>hk@5cD8TPvFdo`PjYrynVnA z=%+3t-IW+rY>+L$SdDtB>Op=~=?(R@g*H=$YCO-JJLY z67r#2@-R&6%nK~41IjQtY+m3b@`{GN%;kQnZluESp1YLvlK@S#F0pTKrn50*NM{G0 z)A}dXHdzkIdXmagy)f?)V$clI$pGV=ikMXOo2$b2XBhRbsdt5A8D=WFG3Sz>P@_?) 
zM$gyGW+~pb6OD4b{I5}~ed@um>uVCewm1BH+hl3_*Q^h_qKenG@A_>5#JLw#gOB%z z2lq8axBgdnhBl$A{~i99WkT^E!mmY4-kV?e`An~=QeIP0D>j*vyrO@HV^z1Mq*XLE zse2cuc`=iE(yod&f1yQxQC{bF5-Mrl*CW%SUTARI_a@1e6Spldfo~{D^-y$eB(Ygl zeTJlsGs=-ww~R~s&{s`dIyr5(-}olyrFHb@;jG9r`(~1PW~#5aYv$n_*Q9m!(vC(W zqwwTgphkhzBExG;GJ}rkutVK+m}F)`nN~;H@#^YoiNuosY&AKBv{NKBHI-wx>A-z@ zvo8r}_sCMEl4y<;I9KB5$)(Iu^X2zknZ~i%&ru6gWO8u~bQ3shUgtZb^SnncO`FLC zi^%v?y_x?y!Je9I{n|-wy{VU__3~a%M~BnQKf5kVdmv?zf#ABp0VR9tkw?M6Cq3%|!;<%4P6e?!rK?am8}f#iH{{ zGejNVtp8&fTU{AfqZz#22!$A|7c-iC zp};##i0z;(w3?X&dNN{SShNUGqWDkeOpA0jBXmDgLRZE#@C?OL9<(ylbvXyYxJPrxd2Ne{C)Ln>hhD zrPN@!sYT5j{U;+5Zwx6WfX*!D+>fLL3oIP*PcLI$z=?oRI+bskd39PaVun@OooPjm z3Eg#1+AaQ|DBPJgw7(fg?cGIs()fH~Ma)jHrtuGw)r2w0upcqhJIqK^azWZM#=9^Twyl+!E{QZYq z{^_XiQWU7CjzSkPqP~}Cv=Ji;RYVjKk~#i9Dft-6VZ@He>NQC;;#5??0x9g%f^wgP zK4n-t5+8CvL4i4KP^rV=?uBc6qofuDySRqHUo>2qB`n^ zj$on`2_{NM-IyBb97ixwy6QaYV=z%dIzL4h=w^;??!IuaID&1a$9^ z?t^d{zwjCb#IEWea`~s!r@!DF#Y%7TF<@X& zLIQ&l)-aS|%?ujWC?baS_>e{ou|R-O@OCWaO_IymApxCogo_Z>~lwN^Ouy@n%4*}^5~&YHjQqOy4%U45Ot8m_D3 zoYKWE4m57?I-hTqct1{#Gz;HI9<^EDOiGQ?eh^*e|sVcxjIJ3cki++I(0zM2|o()b#^6KA{nK3DJ3 z6^=lrJb;FmevtSRTQYjsAM&&4N}0)Zm06bKaK_V(jp(cbH~-jd}l zJ`v9M9$uE-w(7x|k=_CQym$VyX*;W~n-jTNd-?mLSzh_P$bD{-4|fgo-)8#y&dr;8 zXLLz#;;q>n2zVcVmf?8zUou)%EhvlJuj7}pASA){LY^!TYt1rYu@d{6@hkb&LZZZ7 zZQ`ymaZ#S)7D`=LuC7-%l(QOZp*yaU6YURTZHWaYWEvPDBoPyHzRYP!3Q6PDrn!qf z3^Eqr+}wRh7AiMc|GdV zpUHYLs$P1RO?(Bp1E}siAnD24o24HfFF3zy;)2NIR{RnXpierYiqR$^gvc^Ye$VC! 
zqI$G1}3O=EVjS=`%JQGd8 z_!k@TbU%r&Ixqv<{wt-3nKAFk`H=>N=a8ybh01tLbw_JJ96!#ICme&!&f8>Jb&pHE zljlb|#pjrV>K2@oNd2&&-K5Wv%3wRYQBP!plqD4|+o;}^XY)6ywzWmn=p&p<5faHe znpLV6ylz2YH#tY4O|wzY7f=T&r&29U;7XaIc9CM=+!ReheC7KbRu zF}TJ<=x4Drn*g^q5x#R0+}_z7xlBeRF&z=Y46HcMLG^AHgRKfoJZkHD1H$h8WzK!T%;SRi*;)-!G>_eFiBm8O!{)9r*YPaHORl#>g!beN`04F z#~IMI`XP?f4`4j}9Opk@a1QaET5koEXW_iSYN2*mgVj!}3}35@RE4z`7hpH2hpgMw z!`3$S8w&>o)?S=)J*ghG-c*lUe^q;}Z`3~PKdRC?sjBRN+HdEmLw14st=(2VX}8De zVrLvv^@*#e>@n(Dd!{;Wm#P=+GWDXpLcMHXtzNMoQm@)is1x?{c&>U)y+ZlGAbA6TqD3|y#wAGlk69N3~h4Lqg(6!=Vi#`^o`V4C_O*hPI6?4!O8&Qae6 z=fT}EeQ*fhgx<|p4`SFHq`%`uYBzIPKsA8%ei+RRo6c(r{LCI^%17CwJ!)bP@sS=Q z6r?M|CUdgxg!DvmfH=ozd7OFEp@W;DnYV#w%PDama*1R*+7|x|dDP2nI7l}_Q}mE z#lBkq-c;4GAL1iE!!v|}!TE`4(xUy(s$&pfN^k~%4l>2B4t$A$Y5bV7tt@-IUmIOv zANK2N?Y0hLpZx;8@FFCNlswT&IkxcI@p5fDUSWFloAD~Kelv(oyzGRs5^o?KA`K*@ zs^v-5Ks)Mn$5L;2MHfV}<8R6>e{b=>0EWN)y~?j0*594nww9cO;>diors2lfBq(w z=a5u=M^%q+hs5Oi8pd8J+c(L+k2ypZ~=wv=Y3 z3`zmL)EMY@e{>ibw5gXl7Z5o;vzgTOF9!X)F`YMw_vi=N)|6f!K=Ci5Rw`!jCckiQZk*Nl^`@=mw?yM@= z9?1#lA{7PCqh<60nO@{f>m?l7ysQRVuc&d>t7?{YLM^dg_gd_VtRVHqU6HfQ zCisf!%Syjs6a0nuNpnogsbvRW#5Smc4RyGeupabjZ{($Yy!j7B7U^!@8xKWhm^RLO zui2%y&VUeS_#13;XT&?sT~u0L{=&(F0>#nAeip-IE9{6BT*Qi^|8A_{9vDsvF z$|c*e8e(CI#T++{ zNT_Q_?Bgt}yJPhTVH-zdu#K~<;*eOzSyrEr)z=ZLxOg`!9Kte=Bfb*33LCXnjoQx z5}K51osDfDv6EA+DaPt8h0lycZ(Q7(A;Fn)IY%zDq_Ejo?WCy5()P*L94QAm-;Y=Z za+w!INND~^$}-17>NnduR|;O>SPOCfAe}eYI?u7rx7A%6)?o7mB8RnDD!(MvS}Gm4 zR61^1idr1EIQFruxNWywRNLhZVC!ZBmXSNx>@?0Cft&uZnD7d zoNHmkJJ-5BMQpYt>z#7hkcxE{eL2^K@+L;1>GF_72>sa^k8@F~jR)wU#KZX8zz_A`oK?~#|$J(6&%kr??{6>iH zkrZo>^=Qa?Onx5^S$pMoU&yMI->RgNv1?2_Le~C}bpXRG$2ydv4#`mZt=Li;dVeCt zdeS;9pYw=Zjyl#;bl%g}-Nx`WWIZi(aN7hBxL;|Wc^Xd~xpex0Iju)dMY-{d=d8?yc``Tmhql5cuSg{*&K@A#d3jDI=SzlCD{6SBUS7xH~A z3z3>-{V35t$up?wXQ{_Yd0ZW`S(ARWwOlNo*tT2(bgLbdP!hysJB}T4>|`UyIp@*H z>N+`&haBf(J<{9oSftbXjLS2XC9CAw{#cp@tvMZ=An9ZAwDQA2uAUz#`Z znaB}6zbfn5$W*QWQnmD0B!HEZx8k|T^>(SAX^|j^?RCFFVh*o7lV^)mqoimry$-m$*n;QM!CZ{?hsR 
z^OjCtx*W+6vK8MLv*g7{pHP9_%*Ef}9BwrBJQ%=+3g*>12P)B zlWP~*o!4J5wSi-I5#dfp?~?~3`QBwOMWU@s?K51Ic^kWSSG%=~3NN88kRuUz`crN3QbOh+^A0ro&wPtX&ks|LCDV0(yH z^m3$iXei~|!|dU{d@+7Bk`Z+65q!P$VbK=eBQHnX)KYsSy$$fO9Ttybd2#zp>GI#{ z4_$i{A$zpG-L=O^Xsleu$?bT#oMlgN?THeaB$u=89LMH6ce1Cr_EdTCG`pv3Pq$}4 z!>&EkKF76Z*|S}tJ*_eamd#(fyexm|$`#92uE?LeXlZ$wYtPYhnsP`zU_K^}r$x=R z=ec&7J>OgVN~BTfTszmb7udOIK16zSUm)LOp;Y2LJ%{n(+UMJgTzj!z?AlB0rH;MK zwJ)%jyLP#~!nIclv6R}4jNNe>p{B0A%5Lx47upy30?}EoM$+ty?bhD1S0hc6R@*2I zcJseq(~3wIT{z6X)R;pjL#GH!OYO@=%-J?TC~oAOQ{>?q*Ip~-ULlpb(zVywS0P5t zpTE4UY;@V;rOPia&Ufvr?Lkm&@B9@@SClTQdwh*N4)@MqTzXO6!)xt9-a{uMd$X>S zQF6U&-(Vxr9jnKQ5$-roPfTkS4~S`~Yu{qu>e#or_U-h6enY?M+NI3OmCV$c16(GhUnc1)NxIb}VYd2Zk{*(zXH6282T3BVUA;yW zOPbz9uxQjawXbPRO?^nv&lDmx>rYaamc+|a`DG_4Wentp3_x-e5&xTXQ%wlD+x z;H1I~XP*k0YcfIwbkh zy%8=BM6ftQt&F1TpfHq9R7s6^VU9Yn7f0orojfK|L<84Dmzr1FD5qqY!Jf6l=8x z1X^hX7Oh8PAke0w`mdyEyI+aLHdYLUg&A0qcHky}T)iW5^-c)7i}(bc3o`&=>_F&@ z!i=t`q&cLzQRD6zn8}K@Y!>>@y)ufqsZ_m9?%L97Qugsv=3?)PU0J`3{&k5HUuP6A zDjD5=uNp80o$7%Cc=?zbRH+8@0@pOjhZ#4Gip7G7Rt*89GJrF>N;MQTVA66HQE^BO z1Nyv54KEgy$`QvA^w7H__o)%wo|(8EwNIUyc(-we*Yp6+6Hp2i)LQeBnG!NxH-s|; zDT%{?F~#^o=S(B{0Dj%oWaOVyW64!Gt1&|yIA;unq%>7F6OFf#}y##b9vJ|xf2ukGE&6S{3cXXS0>Oz z0~zOLEI?biIF-+Eo|JX~3@UsL_mI#cdBCqEW`Os%d|m+++%A#5Q`6S3&zfQsI|+)|iwk~2QJ;^edb zo7NO&tc4Uh2wL(L*r{e*DWeAKPQJ(il2^k8zx|psyz(A*uhZIL8 z%sI4tzw{XY(fS7Jy2Z5Lzq`o4yNLHTibWHhe?xIbEaO;QiHptLSO(UeN7PNlnXydq zs+k_k#D4RZed?ANX!^JE`T4o6qWVH=c01D1ZK_F3YPnu&d52s~z2;*3RH-(^gcK3s zdK}is)WbBXHz=9kZbe`3Hm1ef(IUEo6NEeC%$6HCWAr$-xr?Kkjm(;x5Uk(LIl*S6 z>01!nZ$%KlUA=$=^mRnLKPzlF!9s{AZ2Q1NXaW{OONm_Qh zz8Ks7D^#Vv0SNm$)P8+8CX72c`o2#c!T|EOoYlk+>nGJS`lxyqqmkqKNA;X#spqYT zdckU-UbOPmOB~v~Y;{tvT3ys@)*$t|HA1~%O;B%Iv(#JGx$13enR>^%OucJeq29A@ zQ14r}s}HQr>Lcqu^|AH1`h)ef`lI!l`V86V=hmm{3+r?Bwe^Gg#tx~!+nI6oFFQ~D zo70>hILSF_qk+gi#~fuAU#e5>b-+emqa*gsI^DiQXV`b^OxVtbaFC7c z7j(>iTQ|1ZsOctM zdLT`8f{(sRW!NtO=XW+oXJVapp^ucB^KC&cDxdG4e@q%*nU)$HE8=Y_HNm< z!d|>!@6tCD3h0NhX*KoNPg;-ZTQGu4(nqaF(AKz}cD`?I(CY~$TZ7QVzeC?i^+#B( 
zW%Hc+tRV4P5+;hOng4glK~K}4;YaYqnpTa{IXYE4eV{(hp!`Jl z5{LI31$5S2ESrpl>%Bj#wrGF^1_rQm4G{aI8bO>u^y>#4{oqN|GxbA`-tFjzvm)V? zyl_gE^Suf-Z2ki}$Um$0XoZlg8ci}#JZj%q`@;JJ6dIS3{Dj~D`h}rIB7o#+%Kfj| z61f!9i6sgrK9i)s?-vji2n~w?I7yfAVut!I^H|A1GaXtfJsWb%~Xqqb%X&un0?g{l$nPySTQ}&BhVf@ zlc70^rDCkQ4HkV9%zAt?W{DN>=f@cZZ|W)Pk1*phQ;Q<@nQE-)@X)DeVeBPiB1x6t zkXYu30Gibom;q&T-r7W5A2DM7h-TdeYQsSc#`r!Lo31`V(Wfb6tQio#2h1KvMuHsJ zrN>R=!P-dNXd1e1rvfL1f_`iM-*Hk+ck~mEeiD+dhIh3MWk{q#C)*NHR)SAvJW>zF z*?#q#{pu0<&IwjE@u^ssdS>k5(q1eO<8YUc`8SUXr@7Y;?=t`^hC8j}u#LwEb|f~F zrGqWoFjV5CtHh9*7bco3p`*-4nu-k2PwA&=COcmyO|x$auI4%4$9jv}!4K@ao*Fhd zzk#Y;dUKy&R5k+x!k#&3u4Eh_P{S{0s)L8rAvm<(ipgM=dNPNMhxe-^`_xf(LjN`L zsXC_lka}7MNP4_VJyRS=?g4XYAmRfV&t@DGrp_@;9m+5XrtTvNC73#atbp+mQ^cmR zWS{vlEYET1I|c^}$Ce?#&sDOPZqt)0^ftL&vPq9D?2rv)dzE^jSoq(k;=W=bTzDc)Ad@3pXrAqa3rFtdfRoXAnC;WZ@N+Te*@&@5)U$d2PpvPfnYF~iqYRuiM zUWfC7A$=o&>e5TGl)dWB?qT_Pt9x2j7{n+Z>{Jc2!X~s$4aiE9&|BLTd~6on@9X@) zmA)Nwv)Nj{6LZ0u+KcYQwggg)RO7uW_5MEfZUx*twV*_k&6HzuO6UWVSxS7B+di1*bf|q{*C%_bo=6Jr(O#$ zComINt0Vdv^_IR~eaMphS0^=;~(EV0!rv>`Mg8!^rFLE&+%-h>;!yRpK(2bN?j z9K$yFf}MJczK{QQ>D{`5DeQjzI-%d8s`#1yjs6>|ivQMo078A#iU;*$RsqPNU38Vz zTR&k9V1++ZAGWS$eZNjWZ5`InSjY5n>t+2M=zY(d0T0|3oX=DGS>zdEy#{9T7$FXp zsorshekv2}m-;#VJcswyRPzNwsI1_5P3#6i(X!r3Ocoh&>uLQWnYFsmIw!$^Txv~C zFd)}iqx4JqWv0^`tr7Ya+8&mBmw(=sUA~$Fon0p6Cg)@}Bq^>u6ogq6!Mv&R^s)t9>FR+74)Kzvk%I ztNGafm*h;@l3a(Y)v&q@ap6*GDs|cmYI6Kv-$2vM8+5Q5bA#mn(gb0LM#_|o>{kJB ziA2QeK@Vf3qrN(%4li|5~9!^85Kd^;T_+6F|8U@zJ}n zV7`p#Aa0)+v5Z$q%3igR{FZM50?HKjg(>;9fne{mfrnB-eon0-m#|% zRip70G`ztsr~&5m1WHi)wQ6#o_E@M_k3-X+NL0K|&0`{qPO8CXSiolN_(`>N^n1dT zo|aT(p2+PJ+()Ini1PY_w1Ko^3=?7_xXL-g-Q6HUDp{cT46$kEJH+!lEW3LrPs7N(sF^wU$dpERyqnu8N54FxmGr&jt$K_ea@^w zZRa%5pNM#zTn+Re2>HAwcrGa@TUp|*GSLIi(gQ-6MRs3C-M_uv)hG$qy$vzFm9KKF z0@a4!j#dlvHnw&uhXnzDmk>Di@cPgCFYtz`H0!Ur1+7jqTj&<1bBk(gRb(2?goQ-=s+CNS;3?yWD*z8Q6P^}LDc?He>T%}I#gX15~{wY2+T|LKMN{G5f`dgil-yW@G z9)dCw^Uw?lnR)17aj^Yy#CO5b9oi@MKdLvNG{HcXW=|Xeb+6`e3kHg8i|$2m-=Rud 
z9S&$4tE-v>OqHW6s<)XaYQP%RHH`+nM-ANmKE5rW7pzbNYU|6%C-@wN zxV*V=6MepO9to~qv(QFRx@(rJ}C8J3DE(98(C#0u5w z2=YTG)x#=M(~(Tgw9ZfqtnTU}mNL)ksqVIVq0Uo`icTL?i2JG|RtXZV0Z6e1Befc$ zes2v`pCZZnt2I*n%{o(kXN}QG)>xfvjnl4$CnRfvF19A>LDnQa**aSLG6H7P~xvpCdV1=_NuSi^S4d4A^0Z2H@B%gf^QLgYn#d-_%6YBYc>jE z^1V^D$lIv$iF%!=*Zov)6MVaVs=Ddl>^ZH)>I@4{qJ6A#lqWB&oldDl?Ibb-zqrqv zscnY-m+3Hx{WpDM!-KbFwUGXjWzto_BEctX!671(j-2i$X1%ebwMmgFMAx)Zr^r&3 z6`0Mbn4v$*(4PS}wJt$R=~5=z%S>&2#F5O}+GOeNkIp&$!pDnQCIe4)<|_CpkW-(jZ$B zc!xkm_3MT0s(9*4518*RXP2rDDi9%P$q`R>B-f#J+c~fL6L!2j8P?%Jj=l< zOH+zn%g#vpYW;V>7wpF%nFjlqThyC zNhGY_c1c##b|lfFuT3*q{6K1VwEVIBWwHF_>U_%gTaC6tqSX&TUb1BG%Eg?>CzD#q zUNe#GnNzQWvPzkGm_2vECzFcm4=U7hyPqx&B*+W)Ke3WQLbZzqO9_;i$Bs()IABV- z>laHZ4nSVN$O!sr{fx1|eV{HDN*Djg9n51gRF&O96C2A5T zA(Cxt5(YVgS#0RhR=w4Eh8~4E+g1DVb|vUmp2?hR)uUF4tNCIbRO(NHPq?E6WXY6T zN4U)m-6sEl)`d^FE&o$XxExwr7>{vvbtbOgu1cXWp^xz**98G+BYR7W4RHvcb}%uKJfQXInMNHY`v!?d!;|8 z8a4KyccU9^>y<1EN6w#*rK>=>3{V5{>*)xFOJg-eE<+^-CFkMd(Q>4t&IHWOU>3M) zOh^z5T!nh`a6GvYLcKXvO$dp7I{M84h5L_yIyRKTZ^`Q9B=5U$^x|~zM0mT4z07on*<1DTjwVO@V7gzZW-`?XdP=@3aMUapm4fMB z$;xm~FC`KkXLlm$%t$mRW44d1GFK2)xOBoO{8X3}9IOMB;lijksWtuYjIB>o)ud`$ zG&)ZAYRq>Cl*A;3!e6n(@FYXtOs_YSucewt8zUuXNjS*1td(j_Oq=zcGjg&7AtQ-9 zr7k8$Pyi&RG$Xfl{h0Q|`-TmvSZa0T&XQ_zZR73Eh~`a@#6F6)fCa9hSJxtNuCcCg zG3S>`Ng+}#o8!cq0_DjWJn<_kfFIpdcd3Mzn;9K8zOLY_DD-LyxTdyXmrOodfuc_S zD|%4??cu8zLxa6lA1R7w6Acr<{>$5&8O@)MiDE!pA!ZwAg|P{bD=h;Y|0P`ghO9>C z=MpM+ac$-LBC3~`_vhpdGY(P*7_PY`jZVQB7`XcTy-!V|Ig^_CFdbMY7}AY;UV~^P zJ|G31lBXT@jA(#7odUkXvl2QMQpZE;xsZB3q+SRq)T1r+QZlhG%jFfhyeb#8q;u74 z*aU$krk<7?6s2<&iqg3XMd@6HqV#680MVoM=fne{k6{ z?1WMHo!2qH>h_%I2(3TyzIg>V7CZBB*M-l#_FmV#=$NXtdC~5c9_a0F8tvqL)->7~ z$3Knoqn&hnZ&-e`yUq|h`8O@S&YM^e{mwh9d9mb%mU$-zxlwQ2 zRnf$^&TbujLVvRUf=7FLZky;Z`vLW!cl4e}PO{F_4P2e2v%S@AqF3m#-p_5QMVq$K zH**^j1473bgA*d~AXf^xa{(U2*wVLWHU)j@RzV9{FPBZkqLjDQMO3rZ)iJfxMO4G( zK7dAM7e?De!KlZA-vvAvpr^%Zj8VpI=W-(7)K$xPm(G_^xr7QNv`RwFBm{N|=nA!t 
zK~OeDe-DfWUsstCKGo4}U0tZ#xf(ami=-Bsh&Spo~iV9^chm4t}g0%-Cf-S&=g>RF+}f0t#z?H>rMaZ#W+HItzGm|yN}xE zjcXtMGPSRZrd@xRvx575L-G|JqWyJ=*SURq`>KySpvvmC?Gn8isvp6am17?Im+M&W zWZoOcua+t&o<;ecz|fmmj^W|isWeYS<;_ylP0oCiDj=z5NUE9YoaqW~e^xo| zDV?U+>`UwFa>Y%yyt-_2CEGm7rptLjA`hKWm*?~>Ws+t7=~?J#N|ol+tNH>-iYl6X z0TYV~W4-9}C3+DN&_g0Knv1mCmKH6M*rhd9f<%6q^UEYFB+1H}Byp(f zmuW7PG#AyRflv}jYN|@?B~qhHc@8lqVrzPl*to>5Np#8Dnt}k?NhC#PMys?sJ6?{7 zzm^eodQ3Tsik4CK%ZO`v#K=oT)T`7D^4c4D&SW=MIUzljjVu+S0w#sR8Yqa@22p0=lfG>C6X-0_#=eZl#yV{P4?HZIeQ_ z^IWVSeJZM#?_M4;kNh%X7Xyg_2h9bEWM<>~5O=RbK*CC_K>J+hc||>S(3{phIzBFI ztA)w2VpT?=1KuG33(y7P?7{cx!%!Q5GSN`R*m%a$*^Hg(4362%BIjZWyacbk<;)`w zlhfxyR$xy22zSXCEI*ofc|DEGTn+{DP|a$x-`D&E5>wtSODcH1)ZmtSWku9um@&gc zVZyx6FH2v*Cdua6XL=NS>-=M)JWwD9u=!Q0nE|bC4rYMBFt@B!t>m|LC0=e&^E-wQ z9aJp~g}G>luVVb^bf}2q2WPKBCM!y7>}@(4)y>*;of7E|sUmqtXa2hIcZQTJs#$6L zffL(}A1R^N~pM)BLzGz-@WV01fa_%)?{;LpFGH`#>~Y2Tc&L+@8lw|MWbd z25S}Hk`NBciG43Wrk9DSiRhby{E(e!Du3xY00AZn8)6FUE?Nq0I`}pn3R0h6yb3JXUb4#B-57|!gf_S zij1fp9FjN+Ls&La1Pg7Dcd1vb3|l@2L_J>p^Z$OqF>Rq%l z(!zUpLs|m%&Aj!*$_Q>8yxVqJZN0y3%*s!4IPn6RZ~XOmJ_Db#nFT6k+5^!ZUiU@e z+1?9}N1JCgP+5+`3HL~Krb~~GW{ZhG?(Nf3`kdWs_=x-!rRU}^URYi(mRQ&b9-H1H(j*_ujm6XeoB;6;bI!jFhy zQcuQyLDPP*DJp<(#qS@D7~R~_@YRO*Wxcu zFK^98wy*b|n^NGV+?kc;rCuC~-T8-b$6hL*PRf8rWKS>h9va47Hg|Q9$yEwjhY(Lg zu7;q1LK^pvl&rWQar&uI=r{YUA$VFs6PH7$;26AmC4#`dJkB#Xn4o4^c6wSpE%R z_ba@agbE1Z%YhZA8M=~EP)GK_0tzboU{^Hb@*UNR|MQ{)nB!zmEG<5hv@-LA3)>w5 zrISBURHQDawA{SkpX6GGYu_Jcui4hTKg@_i(<#c)74s$~d*6PSnZ2l9-*dgbt z-^k^WklGVck4o?{0eLj3AD4ITm3;f;0>*EZ6p9n!1L~l_h+vBxQ5YikQBO$3lM*_d z!aR61q@I$SyHc5RpGjrDJthf`OZw-M)umbLd9LaOxx5%sFFEQJN4@IlRpG{7;L6lY zuV`Gz^`5vgS`>E6%jPa!GH>ksaST&$%>B_lX{TaGdd1%)7Z}i9uh*54SM&+*m2h~l zH)>rZ@*&58=1 zaBBGlE6d7OmyKFjzQWs9l-|fHYU34@hdVTs*o@NUWdjy18nUR&Oi5!_E?OjsvVxek zCDK`Kr+G`Rio|S3z2+UgB$9_I?6*@wiN1(j8p(;KmY0<+8Ml1lV$9pnlJU#;dR`q# zOtzB9J90HP%pK0lY+^dmt&d>r6_kOUmfkL-ancz@hz5W|yeXO`^W3S)j((S%>Ge+v zMf~p|#b#O;bw-v@I(Kead3lSjl6&0$kQ+0&Lyt$h2)=^UuoN%oV=AT=17 
zw|d>|x|=f@PcU=4tKLxK0VQ$OTk35`oe=M{%+>Fz_k4u6m#&G7Xgox{FCJ$R3F2=S z)D%~JB#Ay&pLm^Jr_h^vZDd^txDr8K`_%i>wUHL_KdG@UJ4u9wpQ%3!$}*va=xeGG z1DCE?kYBcB9){WT&9afdpmbGPK8r|s`}{c=oGw|qB7en#vi$PW#bx*6x;f&044c5RiT{^qK0Q3l1LXkq!{ z(iL+Tuuv`Yzvr(lSj?gSaMeG-3{?M0_{aTT{oo?%q0K+x@fMwDF4gL1Z|QZBjwlJS zrV0|YL0tXvx=1r`%^F}Z-~KKb@xFO55;RL1pR-YbX|Ro#?^unz2d|IZ?fw0$>^5Hf zYS-8+7J8rF5NY861GjkTDm)g;k~iUCbXG7pcTrjC`l2=sy>EUOZ0-$Q8Sd^iIfSq$ z^U_edp_OQTa$C`xWqtdD$dv~*4s`JDJDghN-L%BY^H$vyiNd?2KY&1p+7tu>W5$jd zA^@w{U@*@JAgRra47o60mvCpnvtY&GiQA9+Q`hQDxNJXb+q*t|68HXua{7od(Ntp5T)H872 z$LQ|kaAMD^5#C+5M9u&h;iX$5ZR67ECG>#wi8zC#N|{CrlD87ruNs`vLQ z*EY|?^2n#+pW)SD{M4!(fN*ml<;L5^V`UDUquJqSG^tcg+1BQZ3)-d*)@8z%~WPZq*&i5{ZN zYCW0C20}-pwMISGQs?M^nvXitQl76whArcwkZO;H?VOMJ0!3%lu(#=Dp_f&)J-b1pN@p5u&nF+c3RX`!309~XwlM))0)}Ym@=fYv!hZK}Qg8at>l5=zr4HEYPrud^m$wjl5+A6#D!Xq`-DtXAzC$lgbAWIleqI~8mUGl3T>50 zHH=V$0{r}q>xzc->yoXrt1GIn6?;_L(-Ci$5<0*+ys=TGX>lGlm3Nqj6>Qu6v_Elx);eUUvaAWt$SRdhS~i@7}R)&>RZUyC2N-X_0#0x zU9~gKkw;`VQLPhEe#|;tbXFft)JpeDR7-acqE`dxI+%U zbOuMXnJ9E;;VUg0b;E{ggw9b@buLt!hqVr3U3HG`jme#jY4KRJ;bX#|iH3w`wGdtomiT$LsE{Jja zvl^sz^Y(IV4U46$xLJ}#9YN-rP7n*4Zw#m%s%5+OK6OLT2T(O(ig(-R=>w`emac{> z-jUu8)Gb7M3X9lO^ed)uvOfhm-^|+iz~b04(JJ+|D&vj{GSb+C|h)~xLE zgcE$zs1#+3*TI%FfR#*EbCKr2uyK{uO;m4X!EMFrO63Fd`sUuPa@yr4MrW#oWN5Fm z7><9gt9Aw|#Zbv?^dL*A_iRk7=9)UoLLsMgQfHaCTGiIMm8qz%S5bK~fH_Qh-&7Pg zrgas)ko+*2T&1ELNhclMl8Un6k8Y23vCkLK?LJ>1+N(i(H}wq2vh4S-Sh7%F77L{h1GAQj9-l!&A0J%PT4Mqc&rd+m8l6$?k;H>VlG zJEfQNNjaKG!_1xBXKot z3=7Q%1XY7?;w0L*hU9E~xEki@BdR5hoMhE+sV3aD6d~ruhnB~TRo|^)st#lJpuWs&}Q9$k+_;x zr>THx^^a<(v^&x4bttOfbs@oE7~z(eJRy+QT6jrcO1j5Xz~`-l91Ou*lj&MNlQ$s8 z$&UYTiTy((q4r|xC!;3`eKv)@kPO_@bNhT?lK(1GjT>YS!Ao!CfXQctPAO*rn$`aw z1@X$8P5N7m)K`Q^eURCoNzs?`ds#{2Q-w%R?kR0~18m3*L2vB?L0|Qzc2njpyCia% zcgw?}Xzjkcw>4sV?LH~Tn_ZrqTN{^yqD*FU@6+FeTGV?s@?aoI?<-VtG0Pyy;UG9fVwDNkYfxhUZDHJYKy2D zZI#Gv5<+ao0li#yI_h3=A$6alcBLQ-BsE8B5`*k4h}bOd%gvCj-fF~VO+;+g#E8x0 z7r|Mc2+pzY*4|I}guLg;R5ou%B;irHeJZ4$mM2Ke 
z4y$KF>R1ZA;B#^_19{mCqKEQ=dP$PM98#~O;3N*|-U-S1S~A;3kov}}H&QVveM_Fb z9a8VeFO34FZ~-O}OBXLIU0%9k>2j?G`lM!L_Ln^p-Q}HeXS8+Iv^~+V3uXiY!TdcpY7w-l?_Ae(juJb@(+`ed@Bwn`3-> z;jo~UH*s%tX2D$bCn@7IxgdO+tfsi|Rm)tMqvawsy1|?}*nd@@*9mT#o+3j*fyl+V z_m`qZa)S{o`L4gVc5TquyJcUrsLwYdR{BPLi%2KGU%spUu0EI7{TIp6ayq6~c(mMy zGE1$D_H0~DjWmp$qftNvv)X0z#gic3`6g9Hd!Q?lB=?REp{zVVU5>#%ckwX}FI9vf%$(sQjDtUJl6J;JGX zf`c;?aMtLd{Dn*Am0cufKt_;elAQlP=QWx^pco>PM6c%xN5@>?3qdK%MT19m{NH7A ziK8WHTfWE}+uP-8g@~=R<9+^UpupSqmKE|!zRWK86_J~#8wT>dFSlnTdlRmAjJT{> zvdD&gr)=@6=xySIp}p6?B?9opXGh0+ckhqh=p|j0-rCE`2+#EfPY*Xi#mH*t^*#_C z_CLyaw{?o9d&`=+NnYne8QG~da@#INMc($*hB0r#N12Vh{2L=(y;Vbzqu)M_p~I=x zd~eU8j0R~n$^U#DOD;Npe#H{?*kC&kaF3FhNt4BHtqj$_8ovx72UrxyPIvn*H^fNPO&YaT&QSHUdQOun0Fqg7F90HWmW;7UPw7PUa>Jl~+aR}BF zvqdwI?haT{GN!PLT0+_WaD35D4TiW-7#%tVZdY2ezzA&3f+Zb;H;@rL2O|iH zbW$`kHuWxO3#V-8z#s`qWpqNhJIf@Mm6usfk`onDafG>=5{JsOBOJ=gtsAX^gSiPK zK>t?|82l(JmfgByB@;3GNdh7k$|ihNNB(x4?n)tdp5>hG%;QZB0-i^i~s-%33Y^)cvL{}2Df63XUXLJR+7WDVRepsBjgAD}SmwbxAC>4!?N zEV`#r4I4k$5F`}U74I5hZw*F>nUu!3>A`P=1DjuBJl)584fXw&;$NGSVRCrlWep2x za6x}qXxwtF=)OpMq`{8D|_>g|O~oIoSq!t3hwL8T=;8-G~cl5%}G} z+yU}vyg~73yirQ&L}FXpM2EFqOBF-Y+C~*a)7r4_jD1wq7!#hAVvI2+xv2`5tQm3w z7H@O>sBNtBcd64${thPlXpoUuj72{S3}kgcIhtS?48-_ZgwePhtdi9jL+db#-oW&X zYY8~Jnh0+2BpljI#tF?79*KJ_sj!t%8JwV8+?(pg3ivqaxSGM9;q%!nsA8JUH?TQ; zADfFi#q)L1P=#EX&DLeI7j$)T#i0pIW(>qVw)_AtvQ1^HbhB{lU9dH{7qL!`NV-+l z1j3WeH4>$YGUPO^Wox*{q62d!MX z)!6G6zdpl4g;{Ip~|CQwN1+DfDLJ zqvQsxEQk_d4W!T;uyhon7Xc1mD`oQ+rdC;{Vn?V&rY&DE^%R-56zFNai?w-+wI!O# zCCyaQuZEdHnN`Rz5%Sg;f^B(tNdcWBgLoZK?;Y{mDW?n;<*nuE42j?2PrKzydoFqA zRupwBR}AzP^mispqmMau6O1~8mBu{z!v~I3&vipfTobWwv}fbcZ`^9kn88mrA#MDF z$A|6PicSX{&+APrI=qV1#-v3`JtNf^?U+$iIslZ@p^m6Fd@UgKG+_`23D@EtHST9S z==jnkg^RBL@y+R3=u5LqV&_;GVJOqK(D&V~&?hQfz7CDTg6izP*0IF^tPM z-zCSn^78G^2xmWseCy$LZqUm&UdDM_r$FPKAC{FV#r5nk@-8OmyLyouYx0Qv->_zP zbay&MtIIZf#VVBLvh+wK;ff@BU|R`^6cfyEpdT{##Qvkoi*JTH>lY8H@2tkf!_I~{ z(XIvT`;BVo^rWQPFez8Eq12H!RD;Eal2^82!23)r%?2?LpJFNOVm+s)_;^ESEay#y 
zzqLIsicGTyr^eG{R4^d|LkZy<BVR(+24hD4ImA`|;Fa2}4pR z2JqDq#M&HZ8Cl{1|vj$kBLuU~oQVViV~e&LnzFCh!y!n`)u4Pr@71O%Ow8P%rOh1tbbq zNXTc>;~4_aqPN)um_v{8c;vFV81l3~o^Qo)ewO?vSP|?w6I*Czi>Usgn?HfJrwO#X zPhc++7`uCtf-x~sd_C9stUd}~%S>`t&sh`E|3h=EaBPaUicbbRlf1Qr2Z7lJ%4DPpb^Wnvqoa63ibBKK^YS~@dRYq725v(1LpT80f`LU_DJ z5n=DsMqE%E#0!VYjhh3avL?2}hLeFeN!-4_B84yztyQKLd&@@50~>@h@hGN;yv(#r zR2%JVT70&Z6ML&&Qkq^gJY%EK7C{#6a zg!ihl2W8?LnbUleNm7QBaWY7X1tDN`<41?7i|M-Ho6g$qi$B{rKMg=z4sM<^Pce&m z2M4!%mb|KJA{YGg8QG)_JGSP_n>8st`*G!JEsQ*P&Q^Q z3s9vR&=*3`r$bPb3S7u0z(jAbek$qQebyXrBuQ`;tL|o~!DJ-e z5^tE#vcR8fg;yhTL`NZQKt9?cx+3^SNZN&vvJXs_2)qW{O|@euneAvWmC*0e1E)0UQS^A(9XR zJuVqG>c|JhB_EtlChvG8g#0cIN772xrREEq6kEZhATv6u5xtHjWiUa*K%RNV@lz)hH9Kn5R@tjuS=lXWXB_s zec{LfbRVV1>WFZtu?vwFW<2Ik$9|$hIu6fM)8m7h7I27~P8gLUD}tMGkc@~&u@fM? zSyp0CQV^)ffeJg{fwGnSSicTN0@Rwm6sAM}9P*Ix$qtJA<6x*N#%?}nvlLvMzT}vGkSauK`|={icwfx$kWk< zJRM!g)6s=I9Xm(izcI0IVGx_0x3CNBBE4Or$9Gn&moJ;>7M?D^ek8A-hz)&(L?vOU z1VUUe)oNz2P?F3}z{szO!9+;~cGJvm(eG_~{6>%8&1B{GF#D4Vp_uyBvA<1p9WRUB zr7YaTIJKaPa3QY-dk6t7E4FhS9~{TEdMg`Mmu}@R|Nz+UGuklEy3E2)+TuM5@y zMWepX21~L&iVz$6I*V#r6n<8xr^aAZLlDsfI|42G=a0u%E2!y*SoOfcoyR^o0Y)wA z4<9ppLPHZzEFRs@xl~tdABbBhJX!P~OR)#Oe1vmQ zi)!)~8*tiqO?FUTWW!Rpv^IYeuY+4{ysj$g2iYFt#^Qm#PxpTZ(@nP}YZu4K#X!PP z4qZR>mFsOdT+g-f#=HqKXTu45GqBjA%HdXDbKb(naUX-MmB9Q(8{XDL220xGTA{O} zDMefy=^O@W)K;UM32}M6qmAeBPGn%CE$Xj|4XW6>$_0ri9OW#p?*gwn}zVv#Lsp@4~4UvXfP^HnIU#~?RhnA0pk z!(EZvB5s=wM9>9dMlwy|Z#0AEo2U`RBU(gp(B$}~%T7ZrEy`&L37}p9W zI8c_*_H8*6aZfv~fw=4#KdhIM7zs&FVIGs>W63>SDaEYZK2kJ>_z9a>RJK>X`Mpwz z$nZWZ-$TELRlr6#)m0I#^c~@jDi-lao}i7 zh6+e1eOLV>^=n5Xvf*J$jSnK29~+iRz}wS!)5*7Kc+<BIz)Z!^%V zaL$Tw0lZ;ri#PNfMsc6#JD4;LI=XaaRf37;oeps$)(As87-68$i0g6R5&&@BSG z=ZPi3Vjg1db(`d2Db7pf{}(y_Ol%1`9GlAjcv1e-K8vvg%RPo2j`#<5@^Gn1dAk<~ zzTIWb+5**m==mX>%l@V#eMlu@PbR8CT%!f{X8$nDLv)+}=d=yMquQ1HA$TLD;|Gw| z|DO4XFalcmE41VlR0j5Uc!ENpZzzZVUNDy-F6V5UW*~9jcUdNR(7i<(112m{utCPV z!jNCl0-n$=1f*tMw!;xJ+G-8v|FvM}L3|ZR&G0S4HzDw<9S;qGI8-O=qJZF4wq7(2 
zI4dN|D}@kW0wICb2MC{OJ;WRY3V;9^iFlZxSPRsngjEN}7}U(9ylr}TbPuZm#l|%= zu;i^prVVQMAjODS64z<#kAQZvc9H_r?Ln(yx}4}NX4!4i_p`d-*pgD;h!rd%eTSRX z-@_X8fUdoUdsriok{nm=F4j2E9-LoD)-)kH)gl9LO-bizGjLvG467eri7~jE=dXB3 zf{&!htP~+?0XjpHaxGo0@TL+Q*gW`m$kw5dZNedu5@teZb#B*#nC$H{idhF*y(*Mx z1Q;L;+iExV(&fQDtqmIVIBuK{z*c$;WUwb=zxXunjLt{RF2#BKDsWfV-~!(coSyH3 zOfpW@vHLi}&eAfEeGH!KC)jI#iaq8D2#cQNmD%S!4{B4pIY88?ZE1%&04=%!hg2|N zwGjhWk5q@SO?WdP9XBrBZUN0$iV;e-d^ysjTZgX`mEnWbuCN1JdIh=*Fa>%RW0~&E z>XDOq$l+k%3l978h_?}C3a0fYU`QbVB+!e2z=V$86eN(!q+yTknCT&w&_)b<>Uzq+ z7PNt^OD1eF?%{u*_*CzhaFgdgD`SFQ|GU`0DuZPglEJMW;HAOx4T=C_D1}em;P*!8 zt*=RKetGY*j-vNWXKbL8R0hO4zsb66fYAdqJ6^kPWJW2k`#FhXyVq-Wj5Mkgjf@qs z|EL6kv&ztCpN5W>4l&58veIcUoq-+59^5`P;5SQBchG9IwMx_R_CDKN3pur63%*Zo zP^bp{%w)w57bO+H%NY7?R^; zESlD+apbSZx8dp+UH@W@#JFr%s<`-z&s+Rntm}QN3_C5wHCe%ar@U);jLcf|^o~c? zp(WPo#jOXBL;`a)M&y}Yk)kBlCw!@n_F2DZ$Xci^k@|c zvD!8!-j+Vv(W5>2bf8Bbq3Rfktyd??MQ1bbLOxw7!EWUJxQTZ+^Bz{{KI=(Qd%+GV ze}W*rO}r1R*;;sCiqVgv_eX~K0Ll+m^Mh#nSAh~490{#cjVVrJ@*5h->hOW|mPgP| zCO*sx**nzXaQYoVkCA3Rihf6%_!!##RUq_Zsc!P=F^)c;H1YA~00+aiGk=O66A3iQ z%qJ6ciiuCPLc`_L^yq2kU=cUx)2RqE=n-Y$_pG#7TTE z6>OfF&$mKy8O-5<{5dOM$iWt#%AdDFlKBNI&wY_Eq3@UI@iIM@nz%4?+=-mPaVN44 zUqPr=n)xadf5prp<~5bCG4Ud(ayIjI(1RxWr`uN+e|6N=#ulG}4j?T!)I;k^)y~b) zZitBLEN*z()kepsiUns}e@D$k?W0dg(8l7$XI+nTUL>x3?MgO+RC2yujQcCPmKb)< zl_C!P73~&9=Unqd`ZuoWV%s;a+ypEOXOb?Mr4Tx!$7Yhp*-npl*izBxa&(N?{+Y{) z{q80kHtKjc5`e{PPyygk&0Q#Jf%th6=_Vy4^ibFmBp<`oorr4pK!V?l(ag1kaOR?8p(M8v?v zlL{UzB+wL5n6>anW2?|o7+xjl`d}jH0sjuNk%DXr2En-hcN$yRJP4aB%35CgH-^^< z!y7>iAR7P1u!Dlo#swqk-w56!=-WXAAUJA}Nc_%)0~8W6E+RQ7yahhW@^25m=r#jw zXR2ar#M4|59njNm$v`&#BIseHhlw6$*vzo(VqmO9Dn|}5DgwR%9TCH9JdS6qIQX{X zlBI`{P{c6otMaiPoq#nQ6PYKX;{v5r#aOTjv0+OVwTnfEu@R@0_HWTx0U{QL1m(?+ z72OB;HDI;&5qywu!(&*n(ITTf7N7QA%&~_#NrwwwUB%1|(IO8Lh=D$4X{4}y8WB7+cF?Kk#j0Mw|XhR1->{sk1KSp2g&C6joc^Lby7Q8<8lqayk{~SEDlQ?*=^7nB< zNPZSr18M-ZOo+i(vu@IlL~P1qmx@Cpy80K5EA23>z{={33}+SS_^JE>`wI6|^|11F z6MMeX==%0b 
z3p!V!U>$ofY%Q-N+I`ZDg3v!_{VjK58OQ(@CwOA4>LReyL+RNibyT(wQ1@QJ7VN?P zzZk|JNr=)9QnU|sQbOS;7^+h!R0gMxM`H@?1?pN-J+16o z38ZCKcCF6Y#teTy*+u2(dw5}!wCYLWrDAp&KASM9;lXrKatXb*RiNkA)ur@Mgx+v7r3NCb;CXx_vrw-LO^^0F^Pfc5n>_z@qYiQi|5^34bmUP*?F zK)247;lQtCmy|vu-+n_3qKKDdMJl`;f;k6W@*hTxVkU95@#^ZFj% z^x$`x!3ddy98+m=Gf8&gYjKsqt$P5Uhp0V+=5*l0bh zN!7LnQ;4zthj^5I0}QVaKC%&N+6D8YCck+X%g;804aj2rCSvuC%kBhU<<0DmIE48M z49KB01w6S9Q8GG0S@Hzb^*HeCTF`HNd8$MXfDDArIPuEb{uB4&) zb4&!BlUcsMXkhzDfISJfd26Z!Wn~6u^GJ{Q$KTyDH zcxMLOQGl&RvYG=~xc?dy2ewL3$>aX-VSzUA|Be?Nz;gdfCW84jamW9X;{Zw*%rZiA zAg>A?)&He3ut-shsAB(LLYKyl(}Xy0I?koAaZ*NKC!ULDqjltTbiyZVN09#i=yZL& zVGw!~j>OVqBH{-b`cWS=;*Kh;Pb_EY@sQ#*#YDVHGQ^U+kPV+!pfNN+#pu6X{*)lq z=_5gM+|DvVsQ7~47^Ewk|8~`o{f`X!RfQcuH~J7{M4+`@c|)c`v}QW@pyv<)o(@?L zGJ;8OULH&R4{ts!y$$GXleizy2rQ0!f&NhcBl;81O!vSCf_m(0_v!fM8tz9XgWBo> zcqxYp!F>~@La@12wD8qKePC^81=*Rrf@R`|zTk;3+yV_Re5Ro2@FqfqV$zC79biaT zxxovC$-EiM<}KvHS`h@P7hH}oRu7UQa%kZ#0dG2)Wte=4j!M89f~?Cc^#1{}M&+IZ zvH_}RC2FIn1nhqprorVGY4Mv1m4Hu6Eq(ykB!3sk*t8W=rGli2)1U4G(j2%v0xzd_zI#ANd12tLR)zdv7L@acu%am>#EV5mUnbs(*;Yi zB%Ul=Y>wKB)DqFm5*=kpZVhn)YZMe3b6yU>!<~hn#SV8v{G4#O zN8_h~(_KYkedo)*zy0oI?02uYSjFoRgtd6F)7^{nrmhBe2vPv$_yp zYY3)x6I4=5)>TATa5s;50#(qvxJL!|79Qx2?mhsWe;~HSgK)Aw7|Vv?EET7{88{uQ zixb!SICW`&y#R_n)F%$duXRv~v4F=d8(D->+ zGR()G?OCja7h-r}`c zGcKchTxKG7LX2M&^oBCO)?FPAvHwr6SM=)cJ|UCL=;1!0u+$$8bpPg&acgX=)9V7# zykJ-QjZCxXH!$Ecr?`K%%Xo3~-9<_AZSNNMF1Hh$^2T*C#vR;-bsM=Xaq57(g1ETS zVJ_Z%+`Zkv8;B7n-F+;LKw)yQ+0GTGPP!Lan&Pt=KAY2LIq~c%_b5v%P}SBr!fdnj zGxt-`ggMnwb5A$Ss@*Kq=lr7Iv&LxgRJ~YxUoeJUtR?g+(k8rRB;vJ!&?DW6#Ya2( zXm15Nd~{%Wk&tiiM32to2PU_ffyr%V-RSYSNh!-5z31vC}W=eeF&c`y#zjV&OrzC&C7D5i5m;k%$fJ#=au2y&l4XqVT>x z9w$uw0+WZ$o*> zCt)4)Rf#cF2iJqPLd#5h%^+7OuIK`jzl z5`vKsTrz^{z>6u1;ILEzDiVh%WX6l6UQ+=jVdDx6()1`$Wqi@(qu@xBNl~cuP^c;h zm4UBVrh5ovIPb5D1zIM4IA-3zkc%upb&$B84}sRV{Mh|G&&N|F6ni>)&NIvD%>`2ZCj;gPMN~G+AlqWB8?}0_VkycDA&GCK{_^ zs98!LkvSJ!+r`Z(@Yz-~OBJjTqG4efi+MY^DCgkqP&TaLX0L%NsEHP;g*K=SIo`Td 
z1pz7eoCM-R&29igt`ce}8_h`9e#qDZ{gH0AhA0RjfDTw>jv8C(rW7g{U(~Q3roYeZ zCejYuJ32X}skV&Rcg7lDp))C}g+L+nRhxbicAZnfszE9EdrxD5nmQGJ1wK)@O~2% zfO=ggm(Hb!EI}K(5DV_Hv2D5wdqH9cSfm;8Virky(rjT+13J8K)@ z4jT_9nFUSq~sbmCrDEc!_thJJZKYGUv|hG#?6 zb`x~g=IE|1;6vFbGG)e-RM0ZStR?9f$1?@QrXu=RP0(6W8mmVrGgv2dhoS}cpxJbd zg6x{eD-w685i=1yNOK;CKXuB6=+50r_eo5y@7(n)DvKPP#pU)&fgzCRxP1iXM9acXT?l3uboF0-ATp%)X=F@5%ErX?(?_Gy8%3 zegwOYdfFB8hCs4|{X&ncw9B%TgE z{-KzGyY#$A;QPiLl#%|dV3ItaSoArClCrVM5>6q3q~P>YSL|57wL z9fjalsL6%35Ik)3h{BB|Zj`A~%xFSnr=%{>r^DFj&aS}j!;a&X3E-rM3%P-?GTuCq zWU_SIXT|y!IIKM0#1pLOIKM~IPKCfpVChPkWAZ6aJ~;4q@YG0N zQI?%ovhvF8hMA`k@Ar<;#M4c@%96Doc#3l>d&=thB+<8u=Pmw|aA$aa<{9FChNnf$ zb{z;~@Hd z+^d2FHP|27WXofzjo0T5OuV5DtdSg?c-x>mtg#J(4oz*?9s=B)V!ceUNWz)7vhmhr z*JLHZAm(i2xUH>-U#HgeRMz)~@Mfo)p4n|X@H`vLOheSU6MGJ#&J<}YMe0ItI|!>h z^6tvJ+4$qUhlymFn^Ce)a9ASgjZnM~J^I2N45Xob>HQ#T!yJ+vj_N;VXl8qUx?P^!*1761VC`B#I7k(N1B?_N=em8KZ$z zhmu7Snw-!w@>kI)kOF!@It5^RM0}C$Nom#(KzRiNq{IL!Lzmnfc<&E?Wj=J0z$v~4 zX?zCbi%<=bsOr`8RN?)^uzH@vT#Bje+2hohkTsIo0m;N0Wk^>F8-)sD8wyBYaipH7 zT>pLutYoYXXm}-%yUHMrkI9VDhMAyB#^{YMZ*Yk;3?rlpYy@B=Kt`$vDAS{5GHvU7 zl9A~#^*yn@DbvK0rUTa?R!1n)4mHyRrIhrgQljumN(37Ln359dHs~#7d;5y->U-)T zrLqk?cfDWze@N_ShRio-AkE5S(HrGj-FMiH(>%KR^A;At4>hbT%lO~c$h z9W%x>tO*0MnrLS606GC;$*?es0#3vxm=u}06;pzr#2na(BtbtnN{`%&me2F zO0^~J>#38GlptldbX^ckNs?f6*;_&Xj6^?6qQB74lZzToZsaKkEvBGi0^hZOlx%49EQ5uSvvEpkcy9E!nk)U7bd`q8(< zx1?0<8AIhXvhHIszLe@y5+k63`uRt*Dsvj~Xii`Lzs%`kC8uI%jwiNBIAcryhm6s< z|Ihit6hK85k-46>?Zb#xltNT5RKj{BqoF}WFe)Im5W0EY9BKwk3lissTu)YO!l_(1 zT!;SkDwbv&OO-KE#T{-9@(E(pGw@8IkXpr;*x1vw)Sx4&+BaAd2B{crqgw-|SkT;4H&7l24TE;E z3UkNqV##WgAc#VnzeP7W54v~(YyV3!IXW9Ghv|gshQvcE%esjmq-qxEO{p8w(?G2i zo_3U&^AJ-BKbS;;L(LVX^cdn{td~ZckY`T(3oz4ILB;50xH7PZrS@=Dgx27F742fB z`VD=+mG+@T9gy*kqcn<^erc7>{tvIe3}N(Bhf($iOtTP3&m0ek>a z1t={At0sd(-yzqt=G*|8Z==jDDV!yl@6AY>>|rLU`KHUb!F;z5<-0v(?kV3oG@84?YbUc#bh>O{|9h(;y#3|f38S&tPL@aC1eXJACYP#}x)}1HdzEfG; zaY_UuJBhu{liB;cyeuspY0{PTZ8FEX=xVRw&4i^vt}Hk45_`&WN1_ov38)3vLDWQ{ 
zBe?mh7Q8)XKm|=FtPm75U9sMH9ruPTXv?QSFrUCE54NU~#5|N3h)dNo-MXxX8 z#}SmCYRE)d3h;l$y{fDRstF2YZAvR+%}zB13=h#MiB9C98`o?}nvroEMZD9-lg?|4 zOKm*fK+W*rl(%VkW%PqIXej{`jaR|3QwFU1RE4_HtWd8di)1@dA4bV$qy9@$5jA9e z5Y5h0t0Mw3I&UHz6rNz4{}$>@8M3sxAO;-~*Z_@jia@Qe1f$y$G%Iqq0zD3Ot)p6K|Qsg}$O%$xGrgDg2T?3W| zM5y8)McJqnO7p)^k`JNONohS=TuIg~t$u2&=@J{H1W;wxq#9Mi&%>J%O36!rg3UuP zs?k_xxX*Al4&^;7U~d~QFpN%JMp;49#gaXt+Mx9o)vyEATfYQC>TExm8~RtRbSzaX zj(^okfV^Z)!d5B}@g$@8&ey}?FgepD>BZexTH0J#0wE~f}(#w_qZLJXX50(NP$CxskVp9iC z?La*`*@oS*Mk5Edtn4?UlMHOV>EWr0l`&!FgvX?E7iS3W2(FqBg_P6? znPHkZE2y)G5kp3qMW_l|hI&eKl_(jY{X(I}gFuwz+@VBOW=@JgEk!*=ZeT!d3HU%9w`T5EQgQP_P7M)JeXwHp$|6NyJ7CD9U#9 z#K)*`zb)Ic>EZLh-LeGMD>`)aln=BnVQq9+_&{(5cWK)`J}VkR17~TopiIuB_lx0j zs|kvv&5$99;u9fEI0>fhCu0y!ffT{h=nK4sAjAZZ9U1ymNbthOS(eE>hAr5EJL*P#=C4Y^uc2>Bq8 z_z~VfO%pL|9Kx#!3lS25oQqi|9Vm}LG|~zJS_i%dF})Ejq|QUDDk8)*-%w)I1C%7@ zfdD8t=7n$^2M+2V?U>d45E-_BI&+$$sjJL41xUFAmYw%mUGj{Lvi-%34^dbUozmf? z4hX1$pmU*eFnEt*eH!dwSYjw56q#t*b|sjzq4HezndPi3Udj+5qFxg>SFDD5`4Zx= zLo$RK8JgMq;YT5@0_??QxRSgAgJvZrv(<9I5UC_B^1%&uCK`jPl@vdm!~#s=#VBkC z3Ptsyz)k^1Wrre)dnCcLq8y@q2GJ)ykseTRBg>`b+X!`J?s-E<2`)K-{q+lJNHYpe@@9k=}7V8i%! 
zSs+@OwZhTY7ij7T2u?U5luC=}hG1))#`3~|-lS|bsNf?)1s?%P#DORQ&0A;irW9PR zO3Cur7w8i~bVMH%bkl%NK|=LHZ(pH?f=C9dHRL4K`$=f>d(=U&TWbIb0vduGD@SFn z`5N!%6n+pObg_{TSEK4GVZ4**-__#@v~Hi4k&bnCw48Zxfj?@~!*9cbSrLzc;ZhDo zgM4?0fG1%5QORlZlp2~jtA$El4JlK}X$VkvP{L1>9fLwCMXSbopOa;yHjfOp`9J6; zN(sgxyz)lEy)^Hi$EHoO5!r<(B-D|EaSfRZ4n}xS-o5nAL%LMgv72YALmoAJI>fU| z5n~uV(X-kuPsE}ZczPKX(T2sI0p;YKz=TGPN{&V`MlM@8ySu$&@nVlnoVn|ZD~^BL zGdj|phkQ^wN2kR>KPNaZhIp}hAI>y7h;I8l6~(vvJiF_5Mx%GZ7Do*bAoL91Yrf4&76C_$l}a{p7wgaK+O2av!FQfkY~CMDnoAV zvJDkuyK~3VKDsE8bkJidPCM#ZogmA9{6|l1i#&+o#yOQEs)4OhO~Jb3Cy%qZ?oCfj zH5ssDa!h@Td|R9rQ(JjES|etma&xgq-I!HzvTFb85|hd0TUxi6H*E6l+<=%_R+;s+ z_tp%UZDee=QRM&W_KUpkF?LadUVXoMj5*HgqQ{ju8GgbY&MafNER{YB?5cl zxW$F&&4mU<>}3BkAnIRIj)kU1+&}>Gxj$(&k=&aiFPpi-In_ zrMHHHu75l4bj6zKrLNw-N=BRX^Zu@riDeJ;+7snl+nL@BrP8L&^R85Gxt4j!+e&GG zAuoHEDA0Q=y>6S#z}s&Ze7hDC!&*)m5LrvTE^%?W*HqkLvv((#K^C^?{KQP`Yi63I z8v3PwdCel}HE(Qj?pE)MO0G`7=^dpA95s67#G9kjA#ju~MR|gOql%{#dtXgzO=OPN zxvel*T)1UnMU1r3k~|wp4NHr{hJLtyuZRA*L1(<_&Q{BXd7R1Mblrd-1nb97z&F2a<|&!Csz+RmlRj604EzSb>yaHL?e{2KMq7fmtQqc>VQaG6Ovr~wq1RpU-COnO8Tmldk#0bGk&jrMHCPn`EIRl3d} z-+9L?^l9Xu-tkHsUb^FL7bV-eT10GnWf1kR*ynf>8Yl@3l&JTh&W&RK2bP%P_w2D* zN(0@9jZM(YzHE<+J)@AiUy@^co26gtwAkMjZIZe5VogftYtt}xxT)B`F1GDH{W^@e zVf}rNMXiNs)JWsQS;Z({v3R||uU6tJc;PGbM>C7^=U~@!J%SJLxgR5us-QRh(#);= zQJ~uB5n~2V$Q(Z;Gp`NTdGxr2jpEQaZ#%#j$9suk1AGZ7gk$6Nl&g;C@fXJbxSbiB}K=b;_H!W3#=ZBlD9b^f&I?KimO9?ZrL<^ z7yug%s(`Yh>oA{30!9%a>qrQ&9{`!v6>T0gEJVwY(^i%7(@^O2;cDo&Sj@=L2uV&G z`n%6Vp_d}GO!W2PNObF26;NJ${I}006P@ z&bHb^6B#3?txbMYhCX?_jO;HG_kk4OTpW}$a#R<<^HF@i zI9VO4XzPwr3p3q&;pw{Ar;sZ>cGnkv<`0E;iy39POKb^7YE)e9)hs8<1BWgZ`deQ( zGBp&*C5)rh{t-7C>0hZMK}*(jiWS2Ek$tT=V$??up}B~hwy)i-uLK`^$|sH!A}Yv5 zLe#FKl15TD?pErVs$^uF_BZQ7NQ#`c;>LGXFy$Qgh(mv?35W8Gm!!}Kk*T{Q1Rv*1 zsqzSoYJBXu`t=({LU9Bf>3|IWtuL%QQpOOD$*3e6MzXiH8H>DrI#B_wS;@VkXsp`E z=l*#oc?HmrYZSzT-}Qy#7KXyx#d9xntBA^1BL&XCUT+I3oLn7Eq+!ctq@`KYZt-sqQtY9OZ_ z-jkaukuEOQd1cEb6S*vywdf-DqO>0cPc;0uzVK?RQ20vX#|b`{44?Jc%ne_m&ygcZ 
zh~VG)!tNbI;giJn6}osiC&)<4{u#LcmvqL+)epwN@A|@x&x9hCDv7L#Yfqu1zibb| zVTkCU&L;baMEbVJnzs?tD!E(@(s}PZ8fnZ*UTV&f=nqT;`i1*-!f{krM*Ka|XOmbj z?Rus*I6%Y&0%s-sx4!U0C2S4x+*F?}h3fk8yxljpA{;qwUHAP!2buAYb(O^SNxoQ_ z-qA_EI781vL4U_&pGzWn@@7WE>sT6)(@y^l&M1KsMB{Y`Os(Y+Z8ld!uO+8Vapa|7 zV9543#H3<1M&PmI58egkBv&O|98~@F%ZRb|OI@nycO8@Qw0)S`k%p)?1r>AsXQg76 z{*HDWdmHd)r>O811lQjR!K1{5DJmH_zYqNy3T>QIedXmy+tO!gpN^;g}RzW4$H3#928NNWS(4!4As}x{L|Y{ADPS;xeQI5RjI)^Uh&Bm zwT3EBLkYI82|^qqe~aED=1%oFWu^R`nQ?kyWyZ*9C$zvi83)r=gj1ZK7Nnq`7o=6f za3-g1^zI*1_)22NmwJJ)|VCY$-#bYyb&7vxb#l%1i* z?fiA^TK6#>lSB4Gf9nf7ouy{=iS3sl;{NmuwWteb0L#*I3MgqDH-_8sYXZG;ou$*&fIM3Jkc36*^x z`j85IwPdiAc^4Wk#s;5UnkE>pY!GV;Aow5XW{4J}3e}tz7GlJl&6Kbhku$&;Azsfm zSVYNXm8g!%pEqw2raE#JG#HOmQ)BHP<&6|4>KTk;da+v6w3$G=yh#{sq!>0+O>#QH zDz{W&J7)SMD@2?m@0zWGQND1w#FZ;*l(c668_>=mPIWio}Eg?$ZC;_xgrjksAr_W0u}?BXDUTih6cGJP^j zCBHp(I(};cY~-{Pf7xDYtVTn@lG-;!jV1H=4bbv;#ieJ^KOM7uD z+OB+kl!|J^Y+sT@uy8i=^>CuR3c8!^X17=wQay=!)i1XIqh1Y@B@)E-#1il*WT?i^Fy(HKDKL1 z50h_8VPBvo^Vyo=)jdscF8~gjz(RlP3*Wg$$z+N1jtHX|HUhMtCfrJ!8y$*86_L{t zk2u^1de)aF7H=~e#pf?!p-16UeyaUyS%f904L`e9sqhDeso|^4d*bPQ5GHck@PB59 z!`G-$bs(AAcy!S`?UOC$*{BL~O+#fNB2xkdY@ys47lJ5J?(#QpxcU$|Bd1+t7eTjg zFg))fb{_G?$QTDstUh`Kp~kuY!;s~v9pgPOZnV$L9^T`Zj1RVq@W7ku%uINshfmz%nS83d;tXh#k!;rFix z;d-zXO93i=my+_iM`F;RaN5O8Qj$3JA~qf`&hwR(K}%&Bhrz)=+Qc=kO+o#lDA8mV zdBPVdNm?I4)1wNy83$03(PR|sgwHEU#U6rhucJf_!jpu585ju2n#fuP_?z_~g)dsB z_N%|zeE8D{Y#PX6FA(}$U-(NCZ6uqxPLXKwb#p}g2PYAfoOU((YMWA#a+dr2@=EQb z^^*>Z{>wo--ftU%Tg58)$Vy3QX6RLd^*=f7#;;ahD7;TuJW_qXlbv?FIwo0i+J#)b zUZL;~(R~HTHC4js6{v(=qZCkHUsU4r*?1IhQ!D8h!JXqCg*#TN<+YXE-P{>zk<;eS z5C}zz78l!@3}Q4zqD&5be}^s=1tO;zahr2;M9VRFlcmhcjS>#ml914*PfyhUx8-y9XLokyzV zv;%w9?NFpd@y0$R`ZM->viH7s@6gKk%wSW(X-DhnKTAbAvqr6+>uZpSYY$bB5BaNI zq=MQM0d#0INWzPyVmE4REPkAs=lhaNM@8jlqk*~R&Rr`JWVp~ymEk;Jg zGzCkM-)s>L?_iAAKc=Ee-TB%5W3?G0r{%r0%nC&^iHmF1TIv2yrSUyLzR79ZV=RvD zia?G|deBp8J1UnyqDAAkJEE78(~`R~qfMe?7jwEOTIY+AjESwwPLC^(+9s#nuD>@) z4SS&(%>B^rS|absRri7g!ULDZCjPmlR_mp#caEIFf{2{9Q;hE*BP1;yWwMKAo7GbG 
z+>9*0(gg=MI#Z%ZFECj|5nwX6w}#~ZI2L%wX&L=Dx~dWK?xGV|FivF?mE8hl#XVJ6 zzp~~iF^phTZ_^3(=TNg4x7AlsJYUxmA;-6beM7M9rzNbWC`bmCYq(Xd0o~*`9#+7F zORlb_pNyL>BQ%#u@7@X&*I$%ANupqnDMH*2qS(~Edz%v&B;=}TP~2alM#$UdD=V`y zZX58#tq6NtMaed`2%i${^13k0B4S>{c>)zL{WZWwZ&G15p9CQ!@+Irh=6B8f4e658 zw)%i~)Cdp12hoi2`ju9%KJ-cNtg6s=4Tt}szx9P1-bIA{mU3eMEVR1pSJX|Smk+7< zF1iRgEi3=2A7ljd)I%17c=mO*C`GR$yHjpUSd9=bWa%5Ab7jop)EmAe@y2OOqzF;R z-UMoBAyA4~^#+!>v_g0-XJbYzm=EN%y=`?g1#PTQ;)BsMwqnUhu1csdL>ti12$d{y zzBHIbukC6#%HNK(?s`KY1B{bLSVYnf7NgjDPaV1^DNvPoB~U@Fh$s=aLp|H6wgaFa z6H9?|oL0Mdc87|;h~PIWgy4RW^gB@B-=U(m?F8J8#TaERf2YAF?gXQbBv58o1+@5r z#Uys@R2KrJ3I;ip;2$md%Jro1Nn(Gb$*kdUQPW*5v-|o?Oi|=|1!LiNec@0w+KvI1 zI^uj=t5GDqjT};M%X}Nz7%)&l3wWAn^@7^6+X(b>ekeu1_;moHc6dRZ#m9gAT!$?n zKjgG?Y0_jRQoj+_B+)WB>mPZ&V(|y)R^)1F@>k*MP}`Hor*{^_UV_8=@dE&h7* zl<-*_tz|_0ch&rMd>8rc^l~V_X0d1i8Y@(u-30HpL4oJLV)Y3-xTSLG#dmk!(HhB+ z%hqJ=^*2l5+rQNs9me-uG4OTRr@&7hv8IT`x2#@aEm4s+c{gpV6@-FZHx1I?KPnX| z?JKK8Mw0EZVi%A;dAt;S`d3zG7@k-H`2CZm;C(-^g3nf>Hp-qZ-}vTZbdbYlB=onw zaPjF-q-@dmH?{q%>;}H)zYT$s#my6FaPk@YTVGfs&MNr(I0B+Ull49#x)P<2PYiiq z?SRCEG5plQ%EYB+k+Jqi_yl4PqkD$lH2f^}bog zmMAVFiky3=zVzLf<58*P8f%hh78xnaE%FRDk7#(%S6)U)pTs_ziooQw5q!|APZdQG zu(pP9g^cj(*_(#Z2ux19RKD#CNAQROo6RXQKTz}a^O9o++W2sC1*hHX?M+l;-MkNd zG&4R@+iL9xC}g+C!Z2(v{M;tiq`^SElmVd{x<{b~WviizZbf?Kxt^@1Cq4vhdwmJ> zE$P?XBQ}1hBHu%x>NyHXxT-EN(t#Q!*yI?x67#H;=?1AXiCK z!W&fSW^3%V55% zMiG76**kVlhp`00b7)K_OQ<@ zOEzYH*NSJ{SWClcS5>A3;Rr5q9jVI*eLBDEfF2UkMe@if&R89efDUv-jWGC&_rHlv z!1@UeyS~sL#9ub55k{}F`NHKpdIaSw_n`t?deoOByWqTQ!`2MMBAuMJWQm{4p!H=3 zC^(A9C%%dlr;homin7PhalSdE&KQ>n_W0K-EaMoyh;p={mpR|6-k*G8GmACHd`?-~ z7jxnwV3U^Y(mt{>fAFJ>&|IP@a~vo>z7;7hy&dHiZI1hFl36rr;;(;*zQ@7ex)+3H zUB<|zowY>&Znqonu5S)X38yX8vwy0{5Fu9F`BAORogaK1KNEwHoOZi1t$|HRJ+FFH zIgwc{3c{Wrs|62yOa*VM2H5ejuc=IZ{}Znz9!5tYr`>kUYaT|HELwe{_Pjoy0MEwm zD(qzED6iNMgzYAnZnO$}Vn9@sSkn)gc<_l@)(>-59eEd>hMbm$?3d5Rp-`c%$mu0p8W(6sW!bTvY~K}hEOY$Rhlsg^bFB=QkhECYDN#T8K|(dDF? 
z|KS8%xlY2sojmEQC|mCsdEa?WdZ&u$Q!1u}Q}8~sLwc7L7k43*zCqX+g0=fq!XWfx zk1iwjoKm+*rwI1ijW8@xcz#jaZ^zdOe@(KZKf!7F9~*9mBa{()|A}(SrIW0QMaS;9 z5t#MK)zH+$6bKii&&7zWJ5hcy_^O)B`_rFKYU{_c0t@(Un)xq}MJ>lKYI^mw#UZ2v-Sk-Ze4=>RprjV~~r5e1F0fA~&stlG%6 z3gwwP%2VUju+J4lS0iKQMx0Em$QFJHWOV~-fPy*Enc{j7c8_4Wi&R+A^60AK+^1?T z^G^dd;bj%(cq_V!_~f)&?|JE4Uit?+0&>+fIWevZC*&7ZzEY9peg$M5&P9oxU-=S5 z(Ra~i@%2mUfLUAh%;^`fP$H*Yn_M|BBV@{T?(X`#W_^ye61hs6G)p|9@N=JMX}sMl z($1*Gu6qW{=FcEOsX*+)QO<4@fivpXVBFigj{F)A@k6*&4YnJ}!8p<4^I-7V&+?+d zpvdH^Xo4p<-cOoX;f*ZkPI%=ZmrDz=PZ}Dg?KS86VrDr2L!=^zDv1*osl0 z=O>g3Gv^e*T8vgu2Q?H&D#>i=xU_HN4!sm5!7{v27N2FJV-MT#B+b(6W5rWgw(j8_|?Hnrpl^lmn{DJV2BwEj-m$WFSfE;}t zei4Tg6A2nYpoNuG&=ZbU;i2&lfeNcBprWphSg{5Zm&|qJ#1IcY=yhH#WWnesUSDYf?SN~p=_mTG!QgS?_9XPJoUhd+KKU+n?e?;`F$t%gV0691 zaSjyN-D2~%YC3y&^sn$NvP7<}CLPDLa2)$IbdCI{F;4zoq(;cj_(!U4wuyfpu+Tc68M=bU_ zjbh(rHQS#Os<=A}JnsjeMP!t5nuX&B6`uG5CG}4T9w&)@T+a?8?;;s;+N8enIYj^8 zoL=GR?sSTGaB?OiOzP{tb=d(qAGj8prjAcgA}sBO@OxW1%OVwaY6#Y>EG#5HnQwtR>oN-kT|N4^BF7r{kA zLsy0Hb%Zk4f$0_)yadYfyP`y|>uP!O33PF|0)pMS7-6?#ZXu?_7ndWh9z#=-)6$N# zNlN&W#weelSy1k2D*p;p&88`!o7-H`V&tzX848=MdvuMEC6w@Q{Oa?|8aYN->VHhW zF{05umreA^GtC9&y9w zHj0uPY9e6`(~m9(7hVcQvWwMDM2fztc9w*j$n>GV6gc!M$BMx>)e4?UpxyUFAhUqV zWmHWt>Pf)$qjfH^vy!`vFyrt@Rzt!qcyDyL#0xoYZR&EJ|bzLTVvK>KF=8rj;518lO`n zDQY{l^np&#uc;w2fOQaCp=S}k7Q1O1O& zx(0dm)h^Nh8nVCt7wF7;{zufsy+75OkGzAr_+p(5kRiI=QAyP(@=jYXy|cyDL28L_ zm#a|OjQNF}b~)eh4VSoe$JbC+Ojxb!qbEc!@02KgAtkR$#f&(7INs-R|jrsyj5dFJ(vSq)PXdaF}bMp!8Y{^y7WHTdP zhQ+DnKWdUER(`o|0D2O+WKGzQ)7|3ITb_7vg2K`^U|Ig{Igv)(S%%Y&iP4SJumvli z_p<#0k5Mk+Wp2id*i|tF)k{vBo8!%7tRyk}u9|}89SE2C9G0EcBE1pQXWstBEg0#V-PG3;tt;vju6npQhwff%kvl=u6nIfk>thE2^7Ps#Ea%I)@ zE1aAEL>w9lPFpo=ewBEW#P|p5()IUyC#}DrO30;Ys-_{18SY}+nspOdzd0l(LL5gV zNdhlE0CCQbECs(fGzP5Uhbq#`4O;r3xR{g|KQgkCeWfDh4U4gfMZrknd(4kU+MH4< zQr1{R!p&x_q&@YcPyRZ)?H`Oea#{*Dt#TN_%Q=2Uxbf`e&$|r6+MArVmfmYBgTMUZ z|7q*&LZS+y0PK3#UA0~Jrrx_dnd_!xYJ{7T4BvS`&*iYyi(&``3@IkR4OrH6s#d~?nW_s-6onK_3mICQPK 
zLnMTk=EJSuxU^z&bOayB^3pn=I=!&T&D)Loj`=Orb`zR%6dEW3>P^@P9a6Uvx{Se> z)XH0vaIz#G`1!95M`_jpDOI~QIgRrcd|W9tJ=f63!4f(*l*6rQmwt9s zvqOU-r0>z9prPtxY^b>eQy6?m*o*OP`$^w@<`TLpw&IqC`Z#>uu7WQdEs>^v7VbQS~K$Ah3BJGLWfN8J;RCvE@7)| z<$(pQ-0PD_osNisnK`6d5hZ?fi$qrNMx(SORZ82AVsQRd`~a)8MQC=lMC;Dr z8>Tsx(Pu(ivnAsEc+VmskBrL3;tSv4$sU!^g?<=z&SIv8r$u*h1Rq;EwFoUuZVa+S znf8F0_7+~)xzYqx`nD<1$wxRAPh8WZAs3@qnD4Y9zh4lUxUHf}ixRjhNS@_>SeaX2@g8`Z!UC$g0n_#-Y}wDPIK@ekWcyY9hmYdq&FApi>iGr8859Jil`d wK^0c&Vu|9;M1& diff --git a/settings/repository/net.sf/picard-1.90.1442.xml b/settings/repository/net.sf/picard-1.90.1442.xml new file mode 100644 index 000000000..4ec267817 --- /dev/null +++ b/settings/repository/net.sf/picard-1.90.1442.xml @@ -0,0 +1,3 @@ + + + diff --git a/settings/repository/net.sf/sam-1.84.1337.xml b/settings/repository/net.sf/sam-1.84.1337.xml deleted file mode 100644 index 4d31fe250..000000000 --- a/settings/repository/net.sf/sam-1.84.1337.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/sam-1.84.1337.jar b/settings/repository/net.sf/sam-1.90.1442.jar similarity index 86% rename from settings/repository/net.sf/sam-1.84.1337.jar rename to settings/repository/net.sf/sam-1.90.1442.jar index 3d28e1928aff76b4386b832384f49e75f8412cd9..f6e003657d94954a8a3443cf744e175f3db08351 100644 GIT binary patch delta 44213 zcmb?^2Ygh;7WbLCvwQDuZaV3mP(nfrozRldd+!|*dXuVH7F1NQVw5YQC@P?U70H6A zh`o!wD|Vj>;=`=7a+&60=D?|Z)w&D^QyoH=vm%*>gxKR?&-ulE{859x0xKaoZB z>qYL5>&fZx5V82g4+)G6KSI=3P?@L)fv9)Nke*IvpW6Of#Hf>rBT>njbS9Lg}sOq)EYN_)S&&?L`Oatxy@U|@;BL3@8_28zwst1_I;NiR(iE>W}3KHBMheGh?#yJx4MaFksgDJl$+nh z{Te5T;)HW(y_xI`i5HghzoRdghW1@{hZr2Kk-T@m*yh)9s~-`4G`oT?KOqu5@r?L*MbGC?B@yvop%ed=NDKO& z6`9GpK*2}iJxkL!aoc;3~IXk+FBb^)ib<~q2X&N&u%8xW z&6>JoR`-P!l}_<=(ZD%2%^U4pQzD~{p47`3vO=b&mQlG$uh{e&9Wv>4o8F)|ZF-B| zcH-~#IxBZ&<>H08^C}n2 zT#AA{=^L9aq6=*LmgYM%I>`>kzu+lLFd8~jzLsgumz`v4!gut&O+U~THeJq6Pdehn zbe0b$oX-U>WGBGRQTp0>xwCve??;+%(=m2_VrL{f|7PcBc76eZ_=h~HDK@R*N?%6f 
zP5RZQ-{^On{-8gdys087_-7aShH!pw`BQQOHOieJHK_8vmqK& zddQ3s+<@GiIiP1UXf(j|el&%q>Kcs+VU?)zG+p`W$e%>&B-z5GiWAh$q{`#uJ&x>x zW`q#D)s!?m_R$XsU z8|R0uOd4UoJWAzmA32|08HCJh^CBi)a2_#N7eQRH?E5nDt$-6dMm zR?(HViP5w}%%q*-a@r+sq z?dY&EfWFY3aR&JTbQaB{`A}^~QH=%AA3ka+kLZqP(scOEm=q;U@|rySoYvP?Qxpnk2PLy}HV zN0ZJ=ZIn`!pV}ogI`sslBB>?f{F%q8#c}e;*sRpz=szh5{)8mz9?<=ICA2%n?2yi# z0Wx^NX<7IX6>p_R__W$eIry~RN~!p?*-G*FwA~8D{t&g>MyA?M%3uxc4utLy^G$@x zYN`3)sunL9g+{3W&I>6}gS1}=X+0M{18)0wfPaVpT<=W4F)m=tTtbzJgbN+PaG?Ui z1P9Kb_}D)J#(X9OxI9E~G$6K*fOrPLUm^fTM^Ne^83hgq3Sg!a0(c7m8en0E!lG^D zQM-x`Qiq$#W9%w=f;tRjb^0vT<4;rkRa3{CDRw*gIq)p!Qnl<)#~4raCz*5{bCwer z%uZqGJ5B9`(DdZGcvuWuAtO-I#dHbCNTPhY6jL0j1?m-)M!tl|#&ZI{8BS6&*caqkkq@;CJ znh#F`yj?*$#aWo#C)1auX|>B}UYJcBB(GI^9rWWcd5P;=Hd?;xF3|?hlloJbqG?UdlSR-!D^w89QZM zl!n~AS}xLy_~4e$#uk?^J9-)2-PB{f4Ns~j`j5%RF}J?oPd6@1 z(Q$Lnhs8j1C_Upub(0oEO@_RqLanOWoh$dX)qk`r}KwKZ0UiSzyI>n68RnbzGvSLei=9-k8=1&27Ame zkI0|=^561jQ~qMgUrqU&pW4XBE%`ft|6u1&OJXtIM*iie*75|GJ;}}~c22`F$ZrTk zvd>_jiZVQgH_9*#iyfb7`2Ez&i1Hb>5$y}Wjp1-C7l7z^jCghuIGAV}_53v0NMa|} zZzLNj>`OI`G|Nc$(+DHOZ)6%->}0c(!=-XPr$hfZlm|W-^_h@-4mR)`4UI-VIE^`2 z04T?dCaAYj$i5=eXlfbF{4~?ZaE5$rB%ytbR-DqBoi^;WWf1L5qrGW#FpZ9uQDPaL z(AGw0u*K-&r)40EmKt3(9jnkmU@syHfGEr8#(7Fjqbx+^DmRgnEu-8|7aQG8qeo2* zR(W`Q<9`7^tu=a@MlZ|g?Wb#v4Abai8GTKopJntn#T3gJz_blyXAnDsO=E~<3^k2m zmNDEkMp(v3KkYF_u`}8-##qK!&Nj|6##_b&A7Ur^jY-C2zcIy_Y8lhex?GLGF=M)A zR9HqO=bm91Gc98lGhj9YnqwMs`3q$--!vBZX|FMl!wWg{BGWj>PmjvSO=B^`U*f07 zjiubU%Q(++ZkQFE@mzM!V`n8h=d*KxWn9Q#=krg{2<*FrolDtS!_H-v7{VP4V%Ube zM;qgE%ecZc)>_6o%ec}quCk1)(M^nNEaO_sxXv=xbN#O8G9UVlD&q#97;N0=hx~0| zp>izaCXU|7qH;51*o2N?++rHHLP$3ow^;@ZJwF&*OydsjJ0EhEJ5dpaCOBAN+=VV^ z!0@vNK3Zbj1vnEjUVDCc;L1CLwA0scN=83!%nNsfPt z!%thrGc4@S`idk?&c?`&{A!c0S~ok1XS3CO#fY%=m<3KDCU)ocx((Ku;YpzTn6&E#oW8 z_}VhQv5apm<6qp7prQ5{&`@oRA2|P!hL&;EGJdp-W6b5BEaTsn@v~+8;x~RZelv~V zP2&&K_)}Z6yq_79l=#b;`khE~rp)rRcLrZ#c!T@DGWtl*w}MiaIG=uFqy_W7HIjt* z#W{Qv0z>{~qzUr`G?QtZbQXPQ%oV4ckN8tB*ynqr)GK}uuKme)T8g>BLBALS4DnlV 
z)9=QO!aNOW5LgNf%@_GqKTeKyX8&nCYAQ?;Iu`VE%KkDsI+e$bcf~o*mPch1wqn%_ zuKde*MMx!_z9)^rflX4`;(RgA7L&ykTU@}-G|D%o zmxxPku|`~Gi?tlLj^nOm-&O2fEv~V}wc*k+5z#N)P#QL(0qvsJuGu*GxYd0QolyKGfYC4oz}N)~rHC$2Q|ofl6T z!=h4Ds;yE~nxOj5y-})uPh0LqZm~(F+u{%k%b9YPElqLI7QYEJ`)ONcFsj$#s7yJ{ zG)~znOJ$qJ30vhLo640BYdd72%2V~xy0*#}@7SsVI}N$$n_RRJJB?L=EiaYRoV?E7 zL^Nhhf~}f}ZKf);Rgr3Ht7c-GQ`XCBdSiQ)H+Yk{+f>bM)k3wjRk3PitJbQGEgMS! zxk_GbtG240t=cn+6XLk3I&jvGwknY$m|MebiCLR1PO46}l;R*;CL7%Nl4{&k8juL- z1@Y3yj?GRCJ8|qZkPU5_A(L#?S#_y8cw@5DMyL_dT~#+`U8$|gRJn7bP&r15>h1)E zYLOPBdI*Y{Gj;jY7W1YqnAxKIyrq@gGCiHo-ZyicY^hd7m&!6*4ik6Va;Ui5*(udf ztC#9+%W)u08LF8vP!4k17^+d?J+hUl`q-+kIB2SV;G*j9%r{i7F;b3kt~1nfc*Z(D zrJ7m;)IeJeQoWtxD?E)MGnTJx0XQZjR8*|{<#4&T>`M^|Ntk)RPJcM#A)jVOR+-0lzYJn}Daq=1yNYf6=m~l@+#0j@bt$56ic;4%*)1Qo)f$MM*uh=&Q25v*TV*l5^!GPrQ@s@x}%_M63FCQoX)|MwJcd+e6#-$MhZ4 zdvbZ-LBTO;N{PTg5@Ha{^l4(a7y@+boGG71@7;kjK^vDFnS404arF2ml5uSnY%!u$Tgl9*D=O8>cB0LY_ z`4Qm-2roo9P*dYYh&U%A!(xP&M1+?jyeuNT9N`ra;d2o_FCx4WT*SDFib)<-^%rBR z{$fbgUyQ0e!M>PBRvm%CRev$Q>Mw>^{lzE?)?cpuR{$UeTMYDuSBTBdADL=;!`qPNZo;e( zw~H+Z5v>q+pbVsvHKZ=FDoZ6d-i##Ie6>e#F?WB)6DlUeR-DQy|DyNU877A;Td0xnVBFi~D{4>-a^K@01meX}v*Zg%EptCZHY)kp}H z@KChaiAE8Etv?pQ+81rYYAVFNA=JTa)f(l$&sI4dxjfr`QeoJ?52Yl+^~1R9ySOzZ zMKUoOP%yV)DylwRmsB8XS@T&$ZEIPBnGR=mR^_OaKy7WmLT$hP+uDM56oSI!qM)ZG zhw=5oDwOfK-t>t;d?W&K4c7z@R-w|vLuRE^CplMTbzmag?Yv*Gq46t>VZTQJNOh~O zD{PR61B^W(uFcI=&3bcbcT4XtSanZev+xvUfV$dd$P6vt$-LGvdr3t3^l#J+QV~+AsmP2P@g0GG;5)Yk~32(S$fC)ortlN>}3N!YQuh7_o z?FUg!U240uRxe^7y7YeZpT{UxxC4nhF2seKD-Lpk$qlH&dP~>x_5v+q?-!^HJ-oT1 z-#9`#z)D{#^1c-+k=8`Ba&Z%tKZ%8-4fJvmrHHr1JJ?ZbNLk`t@g7Ep1Zpkb7awSW z2}v7rmWvOW0xfq*yf>U6O!3i4Dlo;zrugI!Xhqz%ToNJbD|K7lFiQOBGq^p)50eK@~J?4gN;E0BSNQeWosy4TK*NT zY$AEQC&-7&o}dP%_|EqyjP^o&ACk61MJhuNc3#j&GE2*Z+kF0T^ZCQghh~$<7EFLB@T?=xd8(OeR6B2w-6=-)piJ2-f|$IBPKIKcfbnZc{pU4TMG&@u%~ejs zq%%D%2os3R~QxU*74*14XGdYpE$|(_eyPt`-E6QVe%1}Ug z?J1#Z>Q+sY>$c;p2tc(H)IY^>lpnm@9l&(_KVl02da}T*Wf`r(J2E`s}bSWk{Y`)UvE-F^}S}zpFwD3Z|AXv#b;*7lWFLrtS4%d`eIXz{A<4pt 
zZLQKGh<8_9?l>75SwpiZ83WD>=?@VaGDEDFnva^ zY%QTMNk$TiMvB8sGd3>wqK&mD6UeBYbjo)k;GC5Pm|HYR9N((j@C)j4@M946Nd&-J z!eAz4D9*YzXK>*2h|Fij9vdq6aT}Fh#Io*k;9o%WT}0VhL@>o_MD^RM`Vp)?8j(5v z%+3+3n+R4qU2gfA<$nFAatWbwFP>TM#6Oiw43*1nr|Q?rvtj&Gxq325HwRHSj4Z5H z&B6>7q(yaxDZ@`*BT7SaRh=2?geV}BWeWFUnd$;_4zyFLwep1nMP4J}ABv`-sMO;_ z#EhYu&Z$dAI!=&eM8JxeFt}vMbku=JSZq7jO)#mw%F@(m{bb~UntX~f+?k78nVN+d zCU4=&WQI9Y8sdyqm=R~4g*Am_8buK_)r?xObJRc^C9}>7F77PgiUF?Ge+MVCLvz8AS)8=%9ZnT0c~D9g0Rs)d&Pkue!(%BE1fF^#f}3JrseS47$2K9U_`1ma{P%qHD+0Tw>-n4Z zrz=?8No5s>=K{w05sZoKE6sV1gJLZttYJ{cflg{8W<_b8Rf|D2OBkXn!fd!gx+T1H zu`HH+v}zEQ$kwtA#t>HivE(^Hwkg|2H9J8mDj%TRNemMJeP0)q;XK$G`w^R+;x1>v zUmJ$MHVprgF#Jyc9exKKu?)e#8KC=i{r?BLM0V1k7j*;apLJDj$1z9s^w~HFE%YSL zP(DS8#?zE)JR2c4-NNtDD>dLHlCGh_VR9D%p z22_UAwp6tRyt$?SJG@d^rr|wZs)k2Y{bN-96IA_ERQ)hz81A6$(pw&`dU?3&ozGVF z7*xGH1pk~eRevb6)>ZId1O7KSl=^K1OG9Hc^2CSXM&TqnOv{?3dWs&a4brklE!cGD zRGG3OU>^bOqk#S6{|LLM?4?-}-CgB7E6X*DA1GHj5lvQO$-qJ! zHo?v4{=b9jBm35nmf?KTU3CGx%pU(cync1yS!l!Z@X%N}J|K|}TUPEb5oNl2Uk(V)KQtp*L@cGaVy8cs=S1hrJ7sIwX! z(VktxRqGPs7Eu?0QwAY!p^_Ly%CbT}5@A!C@VjioR1`)vJ=Vb|$6>1Mj&+G-{Y5L%;^Ug0AU_xydz>wY za(rl2J+y!1-ju4sLF*edGu`pOX7E;W0?aQUQd1SN9g|pR_*Xcgtu{qSS`7qjZpcXx zS=dAPq(%sqo`^W#^j9&pdlza;1Oc^HiK%iLTJTmUbAajtCRGklIpb@)Ui(mkd8n0~ zuF*tOAvEuq)>6dCijW-GBu`>J8|!Hz`iNL-o{q z8l)>@HAXd)66G9iNwi@*<1KSnOJE1?B-O)-cOh|7D3brc#5+4_btZ!9Be%){rVbYQ znu*{7cSE_ZB&5}?^OvaeSASFI&{i`nXAguZX$mZiME-YxpOTZ0hbnhKl)Gbc;Zw}aG|kfc`ztHFJ1BWFjl_3*~uNM zl4^COr#_jUe2T%cqMpayU36t8!t@wMo+SB{gDW%8m8dSCdtx-fHA9t;71XuERQrL< zG2S~M`XgM0KWdZ_;@m1&IK|DUBV1(^LngM3VWLdQNXgD_4A3}8rMZ?kTwz1q={#Iz zl+?5ZVKc^UGx+3cFoFdnMP3}%LJko{3z94)87YFPp6K0 zOUJ}*sbQLGFD|N9$ZO=aXtZ*gD6a!uEP-3XOx_~b%Qj%Lq}#OWG{RKt7lP^;or{ZMc<1c;P0lz}0bX zq8WufAQTypp-0pmA0Iog$F!mwQDp4R4 zJsqUKs_De&qG!opR4v+IOcoCmv0hcISJ53u1wWpmx{JU(wa&XhZSXEsySz(P(7R0S z^RCdHWfAu0lWDoUg(ccJ0!4Q#`hd@P7h9CP8;WDzG)Hd6JOjt_af|RaK=l}_Xt%r_ zKCdwm7Y4VWF`*#T{!oKRwGRXE9jNj9;&a})2mo|5ag9de3LEs2ix$F$X@)LvCkZ!K zSOCwZHhd3D-c>K5UP2Q8myEl8d5}d?4grSx(U? 
z%)t?12E-##sVAXk^loMmy4rE_ppV6|m?($_P;fr+C0+_iYMkT`o!OFGQHO1EJ4+$B z8evWmZR-qQ-c=F0r)CgU@@{z#3J=gK%w#k@Kk?;wJ8`5~n(~ z9{6awYU^)Tm?+vOsi5fKWK^gE9q#Cc2Ue(JQQ`zDRF=OJvUPT&w^gWSfp&%asSRYQ zZKCK>P3^O)sY9aZ3LoD+{$7W$jiy7z_*PSiE*v3kY5FF#d?h?+ujay^+eiH&IXTMjGzDnHG6(r3<~AX=A{9J8kxEpKPD*P5ADGh$=Eg6L}we9vmxZ zq=nEMR&E><=#Jeauhz+#Ftit6$-TflAFud;Gkvk4&i9pRjmn9!UC3v9A(xlf+-eg88BoShV zbEVwa$S{X{2)>zxK~eSu9UFj2T;j77Q^enDQND)~Be3dKllKv_y!+7{k5anV-8FK> zDo}IK$)b944^*$KH1&!#9UVbd1CavyApUTOK2-BBF~%_o3%S;Dr9shsKoN@ssy!Ot z85=1J3aiIP5S1u;a_n}Bg0~lY@$Id@$P#F{SM&)*_T?{1cXf;P=Mawl_{ z*$#&w$omYW>{&?IbEwwykg^x3srO}S<2^*3y{}Ur?;AA4`xcG%zD?7-@6jCZhY>w$ zA&XK40Q#v??hDWJqDY;fraaGUhN-iBBwXj(-o951sN37WI7@H0+`{K=y%hVK0Kd4i6{V%is!pP5mMb1wGld5luP` zhwo{71y zAu&rN`RJfXHbsghQaMW+m%vrRTp>6m!zVIDmL;IfKJv)Wy8Jxe}Qy^6@P~~jA$MbGG zo6q4-U0gYE>Y{U&2=dIww>b3033yJ%vHqH5IUi|qek*6e4$ocby+t2e^cH<>(NFZZ z=}Ixc6a#HBNDOwG?)2n|A^A2TC@4$QC=L5kegzSEiU7V zT`tBrpDmP$&Z=HM%bAF~0^$mNZy<=2KZ3AiR<$ok7dES_fwdd7XKPq&8(+I|j=ZR9_ZHP|*l4De?Vy!b!UhKWj3W3NMb%;>H=! 
z{(_LiA?T;@^@GTRqB}iVSG%@<&hv@V_q#89YxA!A@dVlZN$&M@Jhqe)9P+Z~W5cR> zYbn_7kmrMVt$-3g_l$8h-h%Hv8Ln2n>L<_hZrtdz2HcG<}{xl#a9vV5` zZ$nT8_UhMPcxk_?djQ89I8gyQ3Z(IHNnKQPpx22A{s;~D4SNonn%)3xybB>Jh zw$^|jK6>-K&UgT!v)3g&W~|q8a>wB^$@sC}0dD3Ulf5Mx(l?X6y z?@g}n^2@!Qqb+`vt!(Z&3Bi|c@K*V=I#M2$;JjpK9AqrRdG)S1d({n(dz9jNgdWcM zw|OrSy`9syc{_{#{7G_#-tO(}8-gYJP@KFU=4`m#yGV?3%q`ww9;_i~d~nhhZ;=d4 zrZ}1cXw$G-olZUQ(ok=jiJ9drthHvN>|BhW3uzUdgSK6aw^Ei;HDV6X3VN2#qnBwV zeS}9mKEnw6H6GUZ7i~cwxC1Y1-G#Tawqk;HF&^`|1P?2$!Ghy*yrQy2!KWt9{WIkNr{P?cP&K7fwliY87mxKh7j5^B zDfaQ5rC!VC&4YUJ6pSx#G78`63zsce3U$*Lf7767$CnE-aPtYl1y$!PNOZcr;9Yks%4na>1G{Lkma&6S0LY@UO$SsAVe z?HbDqO|)LT{wClna9KdlA7QlQXEl=R!~jj5m^9rdBTJ>@ov<_EWKtHOg8SKc%S!6E zBVcI7e>n70^mv!dTX=yi`C+94M%S#&J1D=fC@V9oqMB4Ym85jC2g%+8SxIV2*X^f2 z;nf#Ehegy}gKLivPCHOX)TtBd(HWF?fxLHxQt5^~Wz?F=1Jn-hK`{&1q?d=KPcB@= zRJhF0zWOi=f#1l}0I3qd8lltVA*K=1<9W6ji|Z=+ z_dLpbcUrcx6)G!RdE7@Vr*z5+PPxb1*VP!i?(@#nb0yXjdCswGrPt~5fY;SPeIM}l zbSF~lAM_4z;|@RKy~mwCUG31=)lr3P#fycE++ZjuXbSZ-Pk^{SdQBCQbIE(Wh|a)F>#N=}}Zcl~GVRGufGC z(rh2$p)Zs096=HdXJ@`ec%UGO@HPS7EHY`aAM!aRm!U5)X{kkcl^~mza~Q7@ETVJq z>X1n*tG0aEtLo#)2~}~6JkF|3UZ3;B^=2#Q`W@aBC;wJ&d_qCN0DZM~LFLjGBZm#> z)MS9uZ3Euh98_gC^!t0BJH2wz(%B0aI9F`)_II+Y%qZugYgD|`{cTUCld;nqlVsBQ zpcf>dTKfMBOuE21US;-h;_ecuUXv~awNBX$W+k3;y2z=%!OU&hkQ%{`+H=IznKreh zb~f~Q0DwX-58^c(n=WOiB^7gqHBR)+-W=mHYUG@Al{dBF<4r>o8NJt4zAu z#*E=w;IauXV%l^)RXIa%G?TnH*m%=wL)FmS%z6$~Ckda8mHo{oZL;YWy4Cq?mYM9a zsVOycZn@DM7u}F<)sO0N>Lu=7YGz&T2J;f<;SFZn!o($&OM6t#n7VA<(veFlyHA}y zs}fU9RWOc~o{hwCY}L@`i-HM`xd9qK5AD(by^7~00v)jt_cCYwXCgUJfJnW6fIi&> z{u;$ri0c8oHwn^0U9uRRl9irWm>~C(p%;UAC9$S>Ttsmb+5)e`K-|DW6yqK}0bgD{ zr(&_I1{B3d?X4zHQNXAsZxI|*uinw^kwwpUpb9*YI2)mON`afI5x6V()DQv>6h|wl zB|Ia5a;!jX(2O>9X%>tevjJ>z4lLYD*1)E)QTPTmeU{>z9wh%p@&}qeLH=rrYWBb& zyqgw{H|!!3Luq)rti*GcXwLy)l;B$)u=2MnFy(Z4$LXl4wQG%{3i_I1Qe1S1;_*DR3G z*@W4H=OpEr)Y+sir;!Sw?+Py>NSM^^B&C^Ds?)IRhBU~1O=8(8*1{%0_@Lo?gg7`6 z1n?MhgBZXZ8~_H!?1S>AeRv#~1~%P?6&OUi7@h<{Pf+5Zhp66uO2SB!436SyKR8A3 
z`S7i#v}#J@p3=chp%@%I7{It}wC{$7B&KY!f@(3TU(qy(=_aJdPDWFVLl7=s~h8kmbM1QhIy z$yFB=>x^ku7pg)VawG6a*90xg3^yQ&?Xha^1yIw0zP&y>5B`K2Mxyh(63NYhgOMjG zSvN7Fd*%3YABzUBv!0DS53I$=?Wo-~QEo?#4xwda zI7J>+DgbW)s^)gO(V&pwHPnrgJvJc+&m#gfTEUGOQxlbkR|B;+isuQbWEqv+nb#}mgsAVe(defc8~#_d7vrqMo_#M%x~!6J2XlR6T_ph5Fg177UE+p zF*XXuqujv0)$F^Gvv1(i4m&ro6WGWuUI)5RY+~mYpSV?Q z_QSc2bKh=>E&RR15_kHLe3zfETQjijemHmYHL4x#?8Mwd>|#dX1?UWMpC#`1Q;&23WAO>9YvcajVOxAAJ~zb|Sa~?Bi#&-wfN5c4gZA^Pp>vYZJ8ayp``o$X7qdx^ zuf=ONHeKO-i(7^ya!v6sTYM+*DD)nxw&_)R4Z5f=KVvZuZ&sM%2U{HBoF4rM1=DrZ zeM6=BuV(95yi9G1A3-v|P%Q@PCtaKyel`2&9<#+yc(jB>5Bv@1H>rO_8`$D!@r#YS zSKo)9oXO^gZ9vcO?EC?Tw$XN5{K@r77dS3D%S?4f{bm-X91_QEY-FFXg(ToS$&gNg zkK!m^K|S!BS?qnr#+Kdp`t_T>n73fh>^XjFrqIUA-f)!k*w_pA+QMch22eoPX&cYN zC)y%givdiH4aN}QgGLw<|p33g%}!f*Nf6R&}_oWE0i3neY z@Z}NVD-d2A5nhMzl@Z~q5WYGhd=0|aMue|Jcs++ZhY((mh^mMTHz0gtM0f+HNMH|$ zYYZaM1K@6Q9=uB?7u|wLefkU|tipKBJ47`z*UiMc?LN94Mi4G=nXqy?ashWxYld;M zIg(rGLhf`j2DXwrk;8-dy9oQ8_(FDc@lS-+sXrIjdrBDhOw2A5G4AJJHo$mY5QTI% zNaufrbdRpY4)`sY0i&X_%=vm6K0&J6Oi>*Ox=T+NqQh?1LzUj&%iJ<<2WyU4@x=CLrcUg zdK4{W(7QsxoO&l{8-O=|L)2k&^Ksb1@DIKx;Ddb=0j`aU0dH=E;6BOAfuXvq7pv^;G^g11kRuObTpxnW)y_pF>Z90lb;dYCn&G^Z6XY z)cPGr4gZ^oUol|$Vc&aNx+GQN7Y%j;qz6;+58zX%)^Pd|J`ZHG18keS(QYiXdI&M6eyS0(@32N}fJ;n_ z2MAx%#CV~G_Gw~x<@O+qth>>#rvm=x@cCedoq#u?cY}jWAolK0l5<{^)i(c2UVuf3 z+J88JLtjFcPzb}FCi?X#s|AP_ww0)fc9mE*eFK1QcgC?a69vA7Q5UsDWM{C4CC4f( zGK{^{k@pEC>;*Ln<fE3n5KZF$9w{#=~sTUyeq7slYjlJFf6V$1&$TEPs zpg8U*=;S2~24!8JcWmob)Msn7)nC`=QY>}*;6LuEQRwXxu}Q-0>k3Bw1fLH}ss7l! 
zVD=Tl`t&n)DonI~K=*v;K#UN-B2^*h)%Y=pt!S8=U`Iq9?Ty zy)c#UO+7?k8YKGB4AGxv!@m%(VlBqjdQCs*ja3~tfgfwrF1iV@?uVb7z`|O;NH>8G zgF!Fd1YV4AlS9gvXlWqK<)!ok7{tAIX{hZbmdxBj1^~^|Eu?fuC?gLH3K8s%Vzx1Z z>u@`dnGHi-WTNIZ4g*M6>iYp-9kK6QN9@Iz|1S0brN!Ra6=$V&3ilRK@sAA>5^(^~ zBEBKs`c=1wKK6rM5ko8krOROxITwek&ZF+)LaGoK(R|qI&lRiiNY}-*R$M|?>s^yI zc%*(Ey(F%p_YnUHPV;;wuGchhSGgFvgxkr+yYc7fcJk4==oZ{gcuy8>!d)dQ0E_KP z-B1?B$SU1XtYyCpx6_y5-tc9pojBF)338EyA5@m zkJ7sx2CqxuqyRchr%)2&!=0u@LvBaBNbhE;%~9Kyu0S_Ww4T!p?8X(bq)0t_0m}>@w^vl2i1|ObUc)_&lFw?5T!d)E}?uRw2Sh(*C z33m+Lty$&KVvducr$Dim=?GS7-4Ygd);M8CH36S+M)?eZ=L|J`!m>hl`~yi=vC}Kb zN~&OK)w53#1he+yEtP$Ere(io08esqFd~}5LKn2Dv91@74{e1{`0JgfzVhqM69jv`KL0iL$#pS;1*Px&^UBn zsx>)+sM9zX!3XsPj*d$jAU&EWzQj{R*_tS}-nZ98@jyK&Ow^!oFB=ph%B0?!6n9Rk zNlA(zg-n-{1mQ{4qB>G-(M&m`Uiyw+lt0foNf)#)op}Gl4&?6l8GxOS(J<;AUQIJ8X(+E z=22T&A5XP3(C~R8oeVve1Vzr3?iN}#d8h=tFx&-A%w}0v@tmohNP*9-w4a|d!hHjg zABRt>OT>l@tD~kNo9EE7320~v8k*rKN^?rXA&_i@8{;5I%_Ke%)6E*(1tLwa*gTXU z#+?_&ofCl@7tmbXSwIa#NvM-f1!)Gx97@T+Os4E+vNJN%_tJzjo~fvvcj8Vlli><_ zipaz?hX19Y-Py(fpU8qwYORThUA$hSigb6Gjr2~y#(K*%(lho__qtsY7k?SP+a*m6 z6+tpJRICHo>Jp-S9pG?xYmOtTDH(CSLH$OmpB`7HJ!xuJMqGF8jY*Fip*=VaWH3TQ z*GDgtWjU@|bf-YF>_Mrr4=z~srKWN~L?xStiD(`o0w{IG-AZDbo|?#n{bv=^Gca?X zP)BRYA!n&!d1MuHMV?zl?|5{ydZ4!w*X{M^C zSz}A$u~U;4Urn<&Q(y)?&pEXyhhxG5b&DaA>#Nzr#udYUK6d#P8X#bZNn?q2FM z;!+D^dGvlN9-A;PNNo;W8jJA3G}BuH`=Jno9T^)bpW(}rgtP?3fYK5kR~36{e#kea zoi`~~=T;R{D%-^}%G;grE8hofRSYB7A+#R^ih z18Ff2(a5dTG7`#?ni5N$53{X;tYtOfn=%mP$2_d z>GaOAx&-E@MLm$_(Fb^0vP2>FdPvT}ozS@yE9XIhETk545tPU|xU|0*ifbvYl*?$1 zTtVyPxpcK$NgL$(v{7D2x5$g=4jG`Q_ZmM|o0y zZy54-!z=$VV&tDjJ$c;7k$)NWIay zq8LjpMLv89;s_q#Z9rQC@b(rxMMGGO;zdV1@ZJb&DNcCBLD85C!$cJaqvy30C*Ouu zPy+;WJBR|LMuWx0q6r!;22AgU1ELltSog^>nuI8$9yEPZ=vbH;jE16_yU>QOIdEX% zZ={Pr3m}$g;on3{05s6f$MmdBp>;Q5R>sTOt8x3M7^z;g^tGauXbok*QC=n5AP&Em zDc6d&@cGd8mtgt06+<2m*q>0&3BqCRr%zHLCV%Zu5Y|^PSLL3j0==@u0Qz)*Q+UR1 ziVijFF2Z=;isCT+QUo<#PEpaR0A4_O}D3>eo>u|Ez=W#Bo zZ)FAk4w0VlW}t0LT+=RU#p;d!`4Z0GOBd{>>AWQ=A_kdBn3nQhr-T)G6}|~zuNNoY 
z0$@<87)^VuKW2~3h6r~cvqjlk>SQ%kMO#M92VrPhM@=P~lU0O@-PCWV)Q`W0tp znYCQJ?p#-HHFS(}tCi<%oF{n4DJi#xdfrFw4>)Oz#@@K%@$tc9<<FCw~y6NjCShxweTLbv#PIEl{5)OCj-$`oRpe|$@p|E_$!=(e%1wImb0~=HAF+a zdEMmY!IS;0a(}~%05O2fs{nB|b;c^Q4_2Q;u=<=WE~f=Jw|9)KS(A zS?hrU*H&OqO~+}z9O2-6%uTQ@Y@)8XaP|VSh&!A~qfzT^kcREJv357*VR6<#>~KCF zZC#_!o<~I=zUC-KUHZZk%#XCsE&3t~%lEJRuo`~DG2fc>FFyPAoex^)2ZnnjipJ7Wc7DXqe$Y?s{M)3T zP5Q+T1$oG#U-|nRpZNdX;^Y6#`1pTQI?j;Y3jj^&q(!GJKKcK>pdl6x{$pt?O+NeI zMJQ8v{M3!_0DLdJrZ73tiei0ciYWeK(L2Es(SDrAk7YdXbB;LZ6H_FZ*o=C-(#kVT zQO{{U*J>z|oFN0OG=%X3wR5e`A|-h5Ts%- zXx(PW%5$C9(>?W^Ayq2H(@78}E(hCq>p2%JvO2_zC|o7rT8j++pH2jyTx8v5mf)xK zaO91q+OVoDwqd(kVDq6jn`ZM5<>IiLjgz$3*)UH|V$%y!7qYX67PyDsA3WDuV5|_C zPW|(&R&nELg8qqJ%sp+|C9>Vk$}7&Z1{pWfb|uKMHYEBhOT@YdV8C5 zc&C-o1einvQ#7ClQx~77cUN<5(VX{bTMAq$=+rzncLWk}7EEUyF>uJ_ z9({-L3;$Ey+)K_|g7WU~{?1#tbcy~YfJIZMS2njX{NTvj3eno8b)pTnf%WM;=e(=D zaSc|8wt^CtEL_ZBr!Sm0uW~w{&1;V1XU&(+UxX@(cFtQXt@hr9be*8~PH_jTvGeJ8 zU!27t+Y4&vMz!p3#bW!kgXrjd*2`+r4XHRVC`xSDl{(`Dm&Q4JVT)eaIJQL>oI`Mb zfTQ{Bg^Z|THcmfuvqh;Wv$5%RkN#y~-rn>&2c8nK!K@3cY{Pl;A=%Nnc(~EV>H3+s znUlXsrUr)xtO+7ejxLLnOBmi0aAF{)fAuHkV41{Om zU^eH4_~p{ULgKFXT)l2 zgIMW&zuFoeI3H=ewB|?dFTiNW|JGm=Hvm}dV{&=vv_=zZXdjcvYtS_qCnsQeIfSnm4|iM z2xgkUyx1B6nmb-%H6FnOADv*iy4d;SKZ^|VnPV6MIsA&&6z&gBGLG1U4hbD`Gc=_h`SylxQ0n?> z>mCgGwJjd2$$^=pR(d#!TWLG~`1~B7CWWF2;n;xMGL8;_AG48Hw;5v-It3}~02YUx z3)LR7k%HeB&f`N1__gCMWAOrO*9Z1fH};hBYS3lAHtS#`%cMU77{YOt8*d<_V@s(K zHk4WbM=g~d{)1^8QzPQAG{k3xy2}m_x3fmL(7sX=o8Zy~;$oOM7(Lw6|11M+@P#J9 zSp|Ws5LlJO3p)aJ!hfLV7)HGo2NUiMapf}9f~7cSKb4OyENYsSrh?SnUHOK)8jK@c zB{q}sE;cJL{ue`20$Sm+msPFQk@soA;yBh|N;i;eUgDyc)D(qXpPPeA&Opg&@Vf%H ziyjCOcLx?yj54>DxHPp7yJk2`&~qR4k`#D?dY^^f<`MMfhUv`>jSm1ACN~%TgZlx; zjmIvrORlSMxui=jmvqVHl1#3f|Dga1xRmo!Zx_y3xB`6}lnhgzzn^-JP3W_i`Ua_A ziK?lG0{vr+G&TGT+O=L1Juubn89{%3nEw1Q{rNTY2XF@2tw0Q&O7?!?j`^R=4TM?W)L-zQv8;AXE+O+iL@! zsn-Hs{m(SALIMy%#WhoIrHgt)<0+Fqa~a<`(6A(%s-Lw1WulC<^vs! 
zf6n|QuH$j$Csuiq1OEj1Y3vXoGx*d3UzfWNlVI-32-S8bcQtn)m9A(#iz6$X`*|Gw zA#9MDu*#R?_>YBSKqckaktdwGD92eKK3!f;qwwv9?`V9x;5!E2vG|Uwv&IgtxeBAQ zKDhh#)z(w)HJ42rtnb~UzH4u>Zgdax=G|#EtEck~4O(;Eo6!gMTi>~tV0Jxa`SWz@ zzAvp(Q?JdMM?d(&MVR+oQaPOf!LLCOSkdKkN85iEon;zLEAVio2ZG0T>8G{NGJ1}kro6>Q0;j~dL&}PfN{btO((xc9aG?~ zYlZu}nBDe8h;QpyGQj6Aa$w zPhem5&Ce@E5n6GGH5f`6gDhj|^d4mvb&oQ65^Sqif+t*hjbz4}+S^uiP~Uk+5yzx( zWE@`>^kyw4E=h|?VPl{T)ah_R!-hc%fm0D_#4G(tz}Pj9p-vc=E3- z{K6Js&~s>xxd<$IJhRp-VyIl#%ge|>A!q`<+B`b&s89s^>ZXCdY5J|s$A*|+NS=Nx|EuI*e$RxM%&ci-HG7M;&?T_ z>ol)nEQ`R{i>S2%aZ?0lVu)K?DOU3hJLlJmcD`}Ynj+RYLCJzwAl;p za)RW>Vte}YfPHfURKueR<+(`60MI3a(cRR$J&Yf5n;B|Q? zT%sd%B1s*bj?LxA!~rL~gI_u^n?UB1e8cfjPK@i%AG?&K9yUd&t+&8}wqn_Yp8v{G zNq0u7Zpt2HDeu|r^c`E8s^X(a0s0OPhz`BCbIh6^wbqUxVepVSTo1v)c8ncFvk)JB?Aj0I%!_fpZWz2xvUSs>is}h$sKu0zt=P zagLu$F>PCo^cFY>{I>;D9fA4cAlwTW1x7ukxaRYV=1dxrCDlrAwzOTcTT zh{;h`L5mHToG25nVg<;m{f)4!|DR~eRTPcY519#HVlly#yr45J$f^{D&xA1S$S1%W zo2M0M&0c_E1YbF$ez;!>%X-ze?M_6mss-12sRnJyAVIH`4PT?CLk%Vj>)OS%)Ca4NKc}-JJiD(RGG1G^kNSDvbHa}L5V{VLeaX{guu&$#b5$Ht3-5nF2)9xgn z&7JJC7rZ1~heb;M2^?l2`o!RA=uBw^?zE#K-KQksn0%itNqOEH5Gp-qK!1->n3Q0CFX2B9!0ItV07QEnu$g%K($A1$70f$|=A(0J~Lyh7N)8;4KKtIB; zDB$=wFcQaL>O2Up&j`E@qcKMNFhS+x8Kz>|GXqo9x#-mku@1NxBhO7@6;^syqerg6 z)M+g&=FKpRUF)zg(SeV3>+$*S7MQ17;qKpt9o@HMBl2b}u=uf|>`s$aY%!%{gJ3pR zT!-NphRIlbVbj9<Xntl>V_$zc-L%&?&d`$u!%2eaZ zo*O^})#w)!VYyjwRQbB4#n+aK<2>kcrkY~9fjn7ix&g^@e~Egt)OwlqGH0D&=d6W2RSqY8lBK%6EdKAe+lvol=MY1U zA^Y)ISyIpIaf#HIt-57tyBGd?6GaQbU?hKqzQ|S^B~i|q9E5=+lET4HH%f$C=8jdn6>_wt_BRY}zNwn?Qkxq3NAYo9GV1O5!zhH|o3 zg4|0bL)AE`CGqroJ9opAWhgUH)~y01j_(@>LAa6j4Mk~VZ$@Ha&$H`X^xGRC1|yR5 zfEaNr5JRasPfe1EcW`3oy-3WYQ+eu8S=hS^f0p|cx)Q@MLt*p?D7`%~kEv9cuk)?R z2aBKV0wSF@B2lua`)<;e3)pwV!2LUT1^h7|fikJb`>d5-V-~ETbIhXh8;NND#!21K zs${dZX>~>yIw(V0kO!Ute`YN0E_d#_#5V5lj)6Ns4mmrB^Dp~WvUPi=>~e0OhsyuWSUg$DpE&F9ll$Hn!I*U%dZxzavg)b z#YkO$QcGVqLTUUR?X~6`Z!jV0^Ft z_-l>6yBVJXLt0HPL8+;K4WVq*^1dm8{xMnC`WnW6{WlHYa4#d+8kxhr{YoiE4X^Nnqn=?G>e{#8x!cpOtZb{TGPr_ 
zuXXalsDm(=491#&U2GQQA7@UW&xWhp(oX!<&$jdU&ujK zqtFa1H`0Pps!y_LeXVB2a14D6-e7(_1OCIC!J_Lfb25#egz{KB4X+P*JQp4>LxzDk z<8CxT|Hu^bmg+q?=lw+GQqK@Kl(8!_+)ItVqo7Rcm@1PA4>W$?p1)&j=LPr&3`Suia8d!PpGJNEMgeT{WXFZ##Qsu?omK|%os9RFOMH*(phC|;FLF3}g)wS{mUCx?#ZB8Par63g zGvYhk_Tqe)90s1(2CsmB)A(yLb%I{<+vWqmA(J#z%bjL5Jhq=@)bhz1s<+4LqAhq2 zq;Y%q`Bd{hbRh<##b|HQ=v|9pU2m(l(^Hrzpkq`g&{*o`?i!clg zN{eT}|IzkJb&M3!SBdP&U7S6ER#mF0GWA|g-Eu%nEjeJpJ_$6S)b9t3`sZ`Bd#pN$ zc0C8wr#bqt`y74$tDu66uhgn{m_C)#-?1oER;|-desn`tIb>rfGRS9pm+33zD>Lbt z9n){XmsJMim~_v3OqD{ODqZ8ls$hLHX;+nsUU2|MYArBQtLASnS7S+smvyk>5WX;E ztfbvlYK^3N`IVVv)g~bra>7JV*pus+DusGmY>I3vsiyLA&_&UD4fTQ5;^5s_TI1i( zsP(sMs5Q4+0Tz!=G3Z2VLVttQaSfk}D zAnE`0n{VcM@V+t_?QMxlGbgBJQjPabfNb4SoRGae+UCOlC8U>EnEKL);jtJ}0K5bh zyaN7`I7v>u`JlfLYqQhTiF&Kdp9prxvlyR6k4)4R(95a)^EfqW`#hV2E&wBioLl*R zYpD|vMF3tF3tj=gYeA4)q3T*VmaO?>zDvW>F`zRTP2xk#BspO_rk!@8D@ytaQR^d4 zyM$mc(qFoh-mF!JORCM~uX?t_!ZR3s=1VwiYXXdES)H0FNe#!~28G2FQ@5Q=n$W|i z;{LcU*oo>;aocYthAzHj^UxUCt&d7 zmg~Rz?U{-!gkk`O7!zkpbB z>IWzD$pz$`s^&w&Jya6uG@2Bb_zt5@9 z$kaSaZ^16Z{j>B&{!`@qj%Ap&Fc_VVD>pHheA>JMs(qzFFW|{(V0EH1go>uC)AUYe z`Z(#QpW)SFFgj*U_b`2}+}z5CkkY0GHPPHdoSL_NkP;#J%TXwNl;*$zWY#YwdCx(` zPLIsf#Thpp;;h&o#^5N$DCq>tet-FnabG54kYO;^?#siRCxZ?iw8zp{({*;-ww8s@ z*lG76dm@dSq3dt@3?z4J$@#Q>h8icEYab{7`G};hlAT>PTH;@R{c;Oz0E2P3%{?t? 
z^XSG#U1#Tiy!pwXn`Rsd-+rCC8`V*g;}d)SIB|WU5DZ4D>Oa`2W;Q$o@0)fPIcMqu zr_6*}zBSPoO^ixbV&ut>w0fpqZ1%bBJ38PQGSmjUB%T5P+Q?9*d^#JXx7_o9E)B;VJl;V;fo zOC+nK-@N+Gc$guEJVW{aye`DbMrkfZr4MF7h?TQ-O?1u%zdKqqqV{DrH}%cdl`!MV zo`~Q=02CX#0{*SbCH(_OVkx;v?;6ETV03PU#;9+J;tCIbFPBejNQNdEU~xiM!2ead zMt>BUWknKKs&_Ak$H@RsJ9Gv7vu-HcmZ_{j^Q0hj!kF zPiwrYB-w@!)Ep>*bGs&iJhd|SI-OPJbzqhGxQ1%KPL&6K(!HEIrdLa??!_sUSN16m z>553P&M|(=(;<9pc&xb3w(0FpP#cEqP_4K_)t}Jl`^MTLDCQ{ICbVi&bJKTQ>!O8V z7-h(`?xdEf@r05@W!LLce>NB8M`9t7d-cT`mgF$1;d-E~f6$A16Fn-Vn@?Yj+;Rn; zFN5)QT=$-&?>Xy*=ksU9Nq-NPbcNYHcP5B-hcUhLfU$h26?FBSl1EYV_5Kt-v2CEw zcmDrE^iFCE((fqWlfN4_g2Ct`9?YhU8_>wh5sH2kiGYKCML zj%ye=9V}Tz=N78+3~Wlf5z0B!&3V)4?0$!fR@|uTZ4;x49?(#(ryO2t2u%?f^{yL2;|r%#ggL_Z%R7YbxY_&Z~4FKZ0aS}_PQeI?TGWOD zr-o5jifq$Y4r8BLzV$`;QVd4danE-lR#Eh)Hob{H2P@0|SXCQ$y^GW|S*4mqNFM3Y uljTN&4U1Hd923J^DF(hhU#DD~0Xq*O>HS5jBl;x1uWzgt;?n|*BH}-dctRim delta 42854 zcmb@v2YggT*FSz{?(E*Xo10F0LJ|TbB!Q67Lnrhmz4zXWAfSRvv49nfas^b11q&)t zvLGs8@4Y?t-V1g_%Kv-jZZ=Egd3}HH=Z|LY)H7$!oH=vmOxeHRY4GdQ4WbA1F_fRk zBHFRM_^*!Bzu1?uVQe!|wqe70X6xn!rOleP+VFL&1~(%o zGKBvr_~uPrTv7w_mgOBQyNY@Se`ks!110u4qFlsq(LZzczhB=Vs&6=}?(-&B22@hz z$K|nm(~RbZ&N;w)ew5Dn(nfE|9v{&jU*#tmMU}54COcc3_`H?bhHr09!d{=w{Z_`W zG1~J&;lx7iX*%>HpDzE?#J{s1kwlNkjRF-N8y~Z`^Y|AfqqR4F(q^yDegBMA(qg=- zx;1_E(`6@pU$(7e`NUYpEyGj=rJm)u$_Vhmg7o|NNmv*b8 zJy%?n9;ZFGZ~4_t+Pp2$MaN`*R+ywK?sfe0RE`NYjTZ-m_I;NiF4D*ZXQYYyH1J?* zwwUhMt$0~3toSjF=*Ki?VP9{u(^5Di_( z-fm)GlCERaC~+`W`;upg&oZ^|Jdi1+><|*YR;=e}D z&rgbE=Rb!;oKuo0VuMdUBs?z4FCGzz9*xO~DcznqokVmx$ywRcNOoRn?Mn~VeM(r# z8gire#XFYHIQGtW9?5G+l-H0^b|$P7F~MeEh-M9SzL`w9cd}{Um9f zk6O}IQFJw3GiwEq(yH zm4jQ^cLO`yd*cl)U>t+VJJ{@%-lcd_q2KV{MV z7Cm6mZie$9JA2sKYtcT7_FGhCQP83T4C){|53zHIorl?Zgq=s(Ic#Rq5pMBOb{_N7 z3Mp=q!(>^iC(tp6?)Zq&^BD!J8_=~v#(7=^~pxqS=rT zo(8r_AKUaPeI}?`PiKBRndIExR%Tlj^tnx6Ak!;t<*381RH6$eBO69Us37}X4Ei9}135C|>s-w+zAi_NT%SuMC2k4&O+ ziUP^Fi7w|X zF(w4QV^=v-ba(FVDl>*KLfkRg)SSiw4FfdnP2*_1MtVf3HVG6b==M!S{v=YT$QC9| zI!PT(ntX!1Cy-sxlu!k!-Q?T=W0OoCxTBFc1~rTYNv8ns=^8t?*4E)#`-f{C6RvgC 
zf2p-1vev0?t*EF>QxUU%;XW!=2Pm;NUFJp5rMkm?JT8G@H>X8+Spm9S*j;7>s_BwR z@gWR-lpn^hO!FMmylDu-8t&t?ER+@Eyk~fLDY1SS}ahCLu zg>6$u?xD1V^rD;~WgH}*q~SZ|LQc=TgX$fktdWPP?#LizC*)L7Zkkm^_3*8tydc$g z-s>Sdv`x8_ygQ|rYc#4WL}jSp4sk3^Jx%r@l7L{8t2A|p-1-{m`kWy>Ws7>qx9MN< z-P==UG;m8cjKoy*;TGj@qB~Vf4|@xo?|RB60mMY5`4QYi6ZzX{6L6$qp{Sl|R+|4# z>il0St!^*~8*Xqyk=yEs$}jbaoo5#Bq-|U;D!3`hyP0cQpX7CJ>LqgmF22PPMRw9i zw`0_l2ukS&WQGzZoSm?l^8X8#35_EFU+NV*Pt{HGL}o9nMe-&Q`P0;HaGBC0|0ybU zCiVvFUPrBHJ(zF0(Zk|zdPJP2!!nwV$P79v`_Mr- zgr1UP=xI46KrhP0^padnFUu|Ts(g`Nm+#RV@@KkQ{z?ClC+S_oPwyEymm z&H$3L;d_j(qdAZ?K58nD>XBzsEBO%3MXE&!@(V3zy*yy51t@&;Xg&rN$1R}Y3o){6 zBZbbVP9ROJQHK`MVhEz@A?l`GfHWSW9Vp9pnk<;qd)BH{{)lQ5umQF^5)c!O=IyKcTcBP1yj6$Q6K%Ex{!jDYb|wX0#5=8v_q39&Vs(4N}qH}t69PF0_dBM4*om{I@TcNOuS8E zor2|sR&j%|=gCP*FJ=0VnA52Oc`A_SpqcD!`&dK=cMO(8_3|%xVz_+UU3I-NUAFY_ z0x%doSI+fn--<=@xKI0*t&m*}?K`|u&aw1Lr^z}w*&BlrT^fEc`LEZ|Bd>X#_-7T2!tw^6A8h!k+!UpkW+_YW-@no%;=k6;syk(R<*`+!v+GA$C+PW6 zt~Yf9Z~9#>cNe=Uf6L;wI<9+5qnow@1g~vlTqZQi$!(4EP3^m?i?Q8BtfYrg;ig{D z)9|

        J<}=V{X|6Q;ZcUy2P=I*{Z-~&kq>WOLQF@?l)$;opR<8W2w71z3>fVquZlB zZ&_Zy4HVKg(k08bca78_h}cVN5?`_IP+|XTi27gSEd0%r;vDe`tK#f6r&{F~y(HFX z$0XKh$0XKh$0XKh$0XKh$K+v49^vm%b{=EraX8Mg_l)v@STA4Um{-|(%})jLbw7M> z_~o1OEe^iTzIXicKk{8pc#oa;{qh6(A;*2h!H?PZi6uYfz-OlXoWEcAX_5SreP6Nf zYrp(Pe#^n{80z;N{J}2+Kgyp>dCZhQoAMVwHJ1-r@>l-;#?J4S{DZ%L`l*>b&V~PC z=L9={vvblfPs!8lI|H9VQHC&NlwlZ(9gk^v{nXtseTHTD*zt2P3i%D2vqnb+;Kpz` z)->Y$be<8vKVdrbOD!LY?M3k9~yN6Z8^4`X|xZawNfklkUW7imeIjatBsDP z(Wx5emAn$*e*r(OGdi0_7t5&d({)CQX>_%WZl=-QGJ2R|jAis>jC!%to1H$U(bqEi znMQxhIL|Z&SjIp*9lSjJ4#nB}Jf#taV6=FD?UW3HbbmJgZ6JO)4C zPmdZM7I3F7fg#wMS!+1LWf>!{4~8P#ZrC*v)SdD}AHVHp3hjCZ-p_c-!>%lN=DKD3OFEaPL# z_=JZDq|tukGwwl_N5&VH@ug*aWf@;HNx!j-Z!P0H%lO`J{9ycO8b6uFG1K^2RK7Ml zF?h=-#yBN@b&j?aY0j4KM5|!mFOB|Ed>-8LwNWMnWapKhwBT3Y7-_=%4N}H5es|38 zjM?H3XAytmf**Znbn=Ryf;qn!Pe?I6_{DEVUnA}pLDs}6Q@YQbI%kTYK?RlZqtl$1 zjvG-g|QQ)r8eIPqdJ zz!uBdS;5XF92XEPZLvzMw#8)}cR9yh!M?TZtP@w-;wo{qEv{kbS~$iDalI|BW9LS( z(Uz;kj^4JoSL`y4zis2Bamtn(PMY8mU5C6{%ud=E?fDTr1bvs4?7k+es*kja%8S8Q)IlYnyVI0#~Wc_e_p5| z(JfUgCShw^wNd3xu~gYctZM7@m#QSqtJ(>QnKg0A#F9A^=S?r^a^d2s+*j?HOu0@_ zs!O8V$@aGFFK)MGKXJR$#ZUvS4yvOqhXJ`QhH4DOr?>OGp&BOMA)A=0ldU?7M@-cP zbW|0NQYyz7D2F)tN-cqBsIyI}Vymm_W~=V1qjU6nk?%pL${`y)YP!`!^|a+^IfgCD z$Mh6Est?F6wt^ev`ErphPBJwQGjseXc3`wLwZ$klC5EHM zI7ipWtV9{$Uc3Xic)mAPg)s@9G7B-2;aO;@ z_OZ`%HB;1RL5=zi>fg_~1pJmLcgnkLHC0Ws#iPs(3cd4(rJ5SkQLK?qbu(sgd7sL% zqLHpDb%nDg zS~c=sV#_KOVB5QMq@K^~ycn&Th?P$K4Mr>Hf(JZF!JaYT$>8>MB}JeY3DFyN^fYmv z7y$o3%$`yVLU?dQcnHEn#jxshtaj7H@QCyg2#>6lJ}M%e%R`z2cA6L)nJ<6{2zQ+@ zJ~9FE6Cwb_Qo5KJ5uSwbaTDB4PF0p#D>geF>#E5Owjj;jX;><57PlZov{c-RGT=NGKDxyCgGN$7 zY(|>9xv^hxId>by6DlXfcCmwVBPG(pE~_Y=O;|Fb{G$=&t#J9<#OfKF}$FPKWT!U~?_l8mTiaS6zMjc{MqrN^{<@MpB?ncqGDEb`0JP)J$ zixD6aLLfX8Eq0zf+sRkr?rs3kwz+Ba+ zHP`I!;{68tYO7g9p;VUuy4ePaIFx%ZMASXGsxesd<6ML0ma0_!uRAEBHq63syMU86XRu6{nya?Q)!sLm<84B)AKh#jq zL)=hjS)M8maDV6#i<1yFXk~~)T3BFHR6HC;%Oqu6MklDa0P;wPSS^7QvxE%;DH};) zXe_G57otf7!R-)AlILfZz)bN-AV~lccMc#e&H+TQwi8061ZXfK9!6p}JQ3nVw2LTQ 
z)8$8mN6QmuXMHF$o1GK&RjW2k1s<3M6a^{_&?N61reo)`*(^03wL{<01fA7DjR2I# z8>qa748@gxJ>0UoV?AV~CC4K(G=E*hZm4FV$kK)?UvmXlTtd0xN%54fcwo5Vff|`= z{xC8Jn&RnPz<_SYkWgJD0pe;CVCgvt@N8I~?k6WJU-bhnv-AHKM0hUD6p&7u2++oK zi{8uEL}*^{??k}jMHg97po#Ex!T*^ESn8aE2)^2UxaXWic*#Wr6dXX#wT)C}dma>e zG7twb*biaQJVLSH*O~zm7w(rhFa%U5RF9RCZq8SYRBO;Nr%+|+$pY%VpREb+Q{1DugS&BUAHEzKDrwnEM>;%&x2 zGguPu6DJ5$ymN{QOz|I6yn7tej}Y&Lswyv1)YV)s=cM zB+TlT_an zU;6&S2DlJkh5NGuz_7&hAEF^66NUz9*kObs5GGhko&%Gp+L*LGCni6IG3iyKhX9kP+L&|%CY}BZO{xbN z*0PL=bGnHt1K*UEsKoj;ayB<&fI)Z2MAn}}y2FeTRn(2?uN5-c8+G*|zwAq~vOm?4 z11L=njHo+3%m?YNv!h7RxBfRfjLc8T7Gzzl`|s9 zRHN7aF8ng8IuD4>k3cSRO8*lp1gruD3@Ey|C{+zRb8&rKMlM0| zr6|4(?Ys!>yf^|(Qn-yt!UZG5U*d!YGow^BsZogKfJ^fVeUOL{oY7|$w z1BYmCqj-5o1c1=uKnp)GAEtOIT!fsQW-7giJ4LTdU3b;OmMT4>tA30q6o2-3h}B&MdznW5 zk(R1nM7iJptz1l~Tzspu%ANeTa}Fd0o_GGBAR8VoP_ZwLL?vwF@&$;8%#{WBVGwTACWUKr#>e&91)UHoJO7 z21PF1?9?#SP8zBMqj{X}1T7GZLNbk_2n4IAG8ja{Fw$zlKw4o&T--TGR|;rl{{k&t z#^bz>rk#tD3l66g|Ix3l0j|xzfXg^1a-Gfrts|hhnn}Q=*SXr783xxm)C<1CjCfjk z4tU)O+tU%)RS!^z-!SR2&It~q@Eq{^0bc)qfmhds=e%03(rc__#t;-8`Y%PZLq&7i zo>g=-ijMgoi%Q;!(E?+BTkP{}cDA-vDdE*MQQa_0*A2BBak3t?PInD~MN&M}dEPP7 zP=0Ewk|V^xbX~bSvxJoh*+4c7F-gq;gMl!$W*oTuI}ZaaQmcwk!DT+|qdfGnhgXZ? 
zZ0FK;D!w#4Hye=}Rpc-#WOWE2ie$0I{i$|p6DTmDy(;O?8Y{1Kh}MQFuvWSyytG<2 zmQ5g@`%}3rkxiko@OmniJSWLEWocC7la!+J0J=;z)1W_SuQmhpvJPi~UKa+vE)052 z7Cq)jk9OyxM-1rE zAq1+ZLZy$~RLrIzSMhb{fG>gaS2)i4bp$Cw6Fzdrhv7%bj%=6K^wwH^tTuDYPBq}` z4P;8YV1EGE9|88K|0CGWvWq6bk#69M`CXOe9Ieo-@n?m~j$n;y!v_>Lt&*rOxLf z1txG=Q445$koLZ=|95y@Ww&Z1^PI-rREu#;0KKeKCV7=b*xIK=g&R#OigJ`|D0bQH z{|WK#H5IW=2;{|8<>k*dc30E1_M z^E8Y*d#WbGYADx+e5x%asCJa0+J`BZKv8aUfM4~}7(R)8djMf64x!~cN%7o9!RJCW z?808E|9~3UgDFW3p{8mWwZj&AjX-Q4ZfpAx`H0#JoHPiL53OzrsDndLpXsHhfo)29 ztL(5{nP_Mj$j~s5p%FkZp#kGclh_pes5g$80KdQcs8(S!2!p7GfJ5VIi-zDbXf*^>DMx|RVXNUo9U)jDgJLti zW06?PV6V1EU{a&sjFsci;hUVv{ZvoTZ&$yw`nOexRvs#o<8^(}R0t`4wxJd=azco0 zY?7z2eJT6GRGaVaG0TZCkxp{0`5;!L*)zE|l1G5#)as3nYAlHlB%O<1$<-vUrq?r| z*R$uQ7a!2#&XQ9@Hlzpot0J`cz5XgAf}^WVbtWasY1&|E!&=3A|E^TQF5oGug9Gz| zj}x6ucun`T+D&@@oOGY%Zj6H-&?}t_&QtLzbl-VK8NGDH)(X2kq(%ROj`o7AX&6 zjLl=|3mC75DzoT$IbSXSnOI!KFmF6)%7tK$6Oj zm=dloMXo^Gx$9HpC1Jshwwopk2Aqe|u-_S?GE2B;wOZwYvDyP;wWn`H4_Dhi1s})b z9uDRWRr%c3KEu>_Jv;{CWWtlsi&kMMm=NO+V@vT3FiUfKQ}3#vt3+}LwRqtP*?@ED zce$Fy1@>Wnm1|JephonAyc9ka-s^F5T^gc#DMi~_7r*QbD?+AhcDX4pf-)lNYGq?QHi5~+yH0HaJ2{c$Bqb12t9%IWNM@Lf=pby9i*}L z;^ylVd8NDx(^3~2E3XD#%yiqsMBXm1k(db4f~{Kc8DXq-jJ!6~w|S(_04&hGTme^8 zURRB6vkYt{ouKsNVLaCZPaFi*O!e^ytmJ^`MWfWx7Gc)J5!2Ao7_nAX>RwI~i{uSB zF5|Lh0p;=)AI$xiI}JyxToVW1G`#+!Rda^7W{m2iJ5J9WA}<6l1&2&GqLJ>1+(>nj zI*Bv5J|qDMVg!B&677e*dD#CLoOO2UgtLR3+W&tIt5!{*kK-~T@CYtfam2X863uSb9s|f-b-cTdfF@ zQL2kE>D>xE|08bKn-$V%EC%R0jDThlj8?Z=un6|dbb+lT+-zaCSwdxeQA=*ClTari ziT_K+&Awc4kS8rE?F1zyMkS>sC8Q=HB^7@sc^9^-VJORK8V0=?5%@nU^%Qn^dG9hn zMHnC_$b*p)$DOOtp{PRV`xx(kB>6+fzvOnbUUShQc1)|#5V0qm`=)cx^KRpw4{1CKC1YN^NW{HdyizpOA(G)q!J(cHOzsw&Xo z7H;_CsjAd!$#;&Q#?{gzQ&qCR6>_(BQ`<~ajRR$chp0L31h+^OZE(XbtBP7DigNh) zp7M7()HW0<#TOUZbYYxAdLl@FSJBDgY2GkvLEI&P5icdvyO#XkbrkEpk`ldFQLguD z>g>Iax_Q^rVDAQ6;JtxXcyFXl0k1>1dT*j#-c9s^cMHAgy_r7q-Xa=%Z__kjX}XDS z(2@sNwM5hKaBr$l%0Q?ij!(+45O1ng5lnLM-3>lfWQs=e9{4;sO3+YqqBks;IHpZU zxfPqgJnLqX{}c{@p#QVuGwb$%U^+>4;ASVEpk5)tc`t~M)1usy&MnhbTA&?hf(yw^ 
z7u*-rgDhgJ%ezxI!kv#ImnQcEVV;k&G@8{Klg#J@)&r;E$yw^R?EBXnWXrDaPTav<-+;HXEyO^>2ghrv-t zsMs5LoSJ)|rgq+EsHgW?8sL4NhI?P2N#2)fme<{Sa!1kp5a0M|s@xw|w4z9zq+(XI z8be=|RpB<*4DbV@XYB$0t|qBos$j5&09!KXsu7?Rte0@{b^z|Z>?K3H9vk_UjPl3&cdvy zZx6mWTYcr;S`02ytK7$R+Fqc#%6Pt@d13G7cA`bZe`U_Z%`&;N6UN@PbB)B{;iW3i zy?XgYKy`L+Y&N=FwJ6YVC*H4?xYs@3tx^xf>bR?)Qm?o-8pnU6j=MJ+M}4L)cA>oU zooeA;8jbs1J?GaYJ~*Ssx^eA&p0P^D-4gA&#l3LaKGAboypDUazNdK$?K?TrQ`c=- z)@aXaTVFE`mjB*3JA)`YgRgj|IXR;|{@^RqJtY}Bxgz=k|KT{I!*Oo%D^VU5EL`Jx zGfpS3Z1a5TMm*H8G0OR9E3V@-yuovClukK%#(U>O1w;=Oa7yFA8eIFMV}9J@{h1wH zH0(yWW!Q~=!+m3u{_sObI%W~>e3^u6h9=>fp-H%A=p|e;JV~cG?`d|pC`sk>LGex{55;zO)7@DGv zpK|DLPD%1Xl}xcjs$Zmubic?DnWo6{L*G7Tin@L(7TH`XhkdzRb3G2`aj?E$G!PB_ zB3~4+uMs=gM$=LVwK5{)_3gq;$0nzB>MP8mDR{Gz#NVTzW1>MU@@u(fEzPPr-C znxb8>&vuXH6&?6SR6%h0PR|-4I#%87DUb>xuC7yjujg&2?Jm!b^v#}<8^j=0s3)mij2MJHSI7l^~*c~f+>#XvF0 z#>x31eDNXQ!jXAf45yc!RZrm#$^$=(c<1f=JZY&T#7M4d6s|bY%ebd#<52uKLD^1R zrID3To@0vfwwS;eOti%$G1I1?&0yb5c4l$**|wM?2AN{6 zE#}b|HcqfFuyJf1&iMkjLl)!6I`Ivv3q*HYEa9l7>@4FP7qWAaSYV2a`C`@NTK9u+ z)o6vd#1sKrtmH8 ztbN{GXWL+(C*o~5zBoOtulIXb^b8&9ov*8Lo*wOOri-5%?Y-2!q>k2y{yDj`@MPMN zsMuP7j!yFSZ~;Zl_8xF$-BTBNZ*qMTuk^Ny*7D-rfeqtk#bb7kcMflaC_NID;@mpS z7w1Gz_WFW9JKi4ttkyVe-Udeq+v03rCu&OVaq_+co)+qe$8;*3p*y^5@KDWvc6i%? z#glLI4)mReb>{$_N+0MfxXrsj40S%a%{$055~|CnVC&nxMKUlBNAt!5$V4o`CgHTr zR6HRt4La9!EFfmkKqw0r&^%g63uqlJ#DgJ=s0z7Ww^`a+j zz-)Cr7DwB}P-GZ|4C9euI^7@^(MEAGIU<0xOOdt?Y1blcBW;F0vkepMc5w&d?xEX* zO?P?+3$e?&^e*pE-)?9g55jD?$NBUwLz{fgSkyS>FW zu1p)0Ve;b*Sq|r5R<; z!_&NeXKu0Oci#WYN{&&v7#(aGfwdI}X)gIq76j_R8#b@Oe$4ACfkTeA{zkKX6WyiN zC79d=ItGUU&WcIU4{1f_se`UOJ5Lhq>1W5JfB~QC^TXb|MZ~a(bonY&QW_@>Pk6&Uw;=Xv0eUW*O9@@bd)|5hpCW3#nX6;Ch&Kn9|H={#!RtjDpxU$U%bO@@+`s? 
z@<}uk2k>W^cy(>!LUSym<8#J1W?5zYsdT6DTyL85iDTAtuDI8VOPqsh00-FU|L2-C z*NM5w?CmVR$;@>2-(+?%Oq%B;Z8E3gO+cUEjNb276j$Hov| zXwn=TFW_M6P~C=1oNiM~TI{^I$;>t`pd4qx0B>@>jiHUqOKF)&7up!x7n`)4pOL!+ zb=Y`jWu;B4XfqqX!sn!KF)wmXZS^*B9^7K4I-fl+8wKm$Y;F`;K(=61qhXMJ zS?DhQ$7WJA32v^Ql>+rbkvzQe;Ma;YhDLn|OWvwM#he+Vpa>&?jZC9OUn~`5-=g@a z165=cC8#R$^n-aL9uLM~;poS6G2SXNi+A@cu7b+k|DQ>Xcb6+SHV|1NO&x~Zky!dS z0`Ks*E9z?man0YZ&;t2uKr!d55?F;`pi2yFlj>lwC*i*gG%XV)lA&~^P!kN_rig2c zxUTpP#FyL1E5Je+=?bV(008(wwL%;;=OI3liZMe76eWT=^AzQqRAN%oGl-KWl_Ert z%Vno1)ud+sOlW=@_yD06AYDtm8|9L&9U$`yIb%dWCSAW`mvqHNRamn0F9zv~UD6f1 zq-(spe`6+Hx%y`so~SDKoFm%f(yk6##k8vfb(i38s=wVZH*q+UcJ+{}K>)Zm1o84w zpb0pmgo%Rx3PFV;6mA0Im7r)7JQG!-iO1ICMj#~TV=+A@L_7Ai#vIEw$5EP!Qd6-D zQ?X$U6*1sIMLYnd2^jz-)2b;6xqnt-Br(f@h)c{FK!EvhOD#UEBL6|y%;+H5c%ZQO zAnvEZSWya3jG!YF+kX$m9in(xR}w(n#DqFDMfBd{cuWpH6Pbv;|Db#55c2!+!sUeSQx=SPk-0oiXn$}Uj(RJ&r>attquBu zL+wEK_9)a9>P0sc>jYJzGYWTw%Fqo(yW%ynZnPN>9x*fXc0*GPP^P_sao!5$+XLDx zpul~^{Jka2-H_%TZ%UFw=Wbd{zfu$AK=y>^mEm= zEX*|pczm%GL%s~0+mZ&O>@cc8Twi=g;>)zlp@4x5S`uhNO%4-kVg#YEr*V>%o03kr zMMv-YiOc~BA z^>MF$z!cN#Y#r}Q2(H*|#=FYEhP~!vu7WV^pqcNg6e}MxZ;a7ONb0lZAMP3<{!Md^ zyOdb@p4r1K``m|Sg`o`{cm2F0Hv=5Ve~p~n(N>zX{%&uk!fM!XUR@#kJJ(&wYT|2N zuhZflZ?=;&)tiCk)XyutNzVFvye(=gjOm!*uD{pYKB`(rgr(?zrz7G-5YCUeG>U_c z7!O*W9m96Xvt#ODN;x|w8Pf{pLai4{ig*A_lO^s`^D9G>_}LX@r+;Wq^h0|mHq)2k zBK1y^D9kGTsMQ1iAw6R1N{#LNl)e_w-(Ve|LpjL^V zoVk}NpmW8GKK#}VzWt)G`aUm83=jj^!NPZlC7^DlieaW0Zix}>9chVCcpPUm7Z}6N zSa!x)V!R(~?1_xdBz7iSVv3)a))t|9}rwU$_Ci7Wlomutn{Zc|+2r-9;HA6_t7 z&%PDx+rZiJgwJwO$qpV|UM>PRvWr)ImJ7!xZW5dPa5i(sEjX{i-&-tks}IRr{bHNg z&Tw}4;oRoOZpj_&-08!6|hjD&iP^Odn8Pw75KQ-fntG_l&3^m*oj|TUDXHF8L zQ!w=hvyT*=gHwJo?-0?qo8mBt32Lzp*ZG*4WmoiS)C(eMbcIQ$gOh(YvC)5xv+Ni1 zs$kx)<|ttvF~w0+Jm!?1XQVr~{$|$6x`{TK;&EF%A)d7HZY9hbPl=~Z@r*5=70)>@ z|7Kp3^SpS$7SH4L6q3a`P)!@z;w6C>oNp4Z*nBz8*{}|xWBmaY)4=9ybC7B_Z%mou zZCkvnGwl zTYSpSXShR1BF7Y;+qmZTrA_;2zfCXFOM-m8=H*PAIcMr3q<+PZc_61JXX3mmIgHsi z?mH#d{$ZBJeokMQ;#(k1UjSG?_u-UZ{xJLGd}oXA#Se(=iod}&eMLWlruJ9ud9*;k6OrbqHS>5xxrHt0ThK 
zAbf2^_&S8wb2t#Ha04Q)kH}Do@C^~+8zG-T3LuvUB*E-(+1B3-U!HwExZHz>SQ&TT zyLPt#axr}_$3w2w6RU@Ap<5C1(N@~#0&u>QR+c`y?as%IF=I4Ah7w)O-P?$P5oZUA zc@U3h-BHlTe8T{o&W6=iY;jf?R!TFj$SqMwJApg@E2O)0EAEEh!fTjOQC24|e-GWO z%Rg;cjhZpWyo@8L6*btV(k=~zi{gFL1Vb0RFH|s3Sv?{Oc8@67lJ1Wz*wiif0PWT_ z-hzU__zPua58%pJ`;Wm=ipv=A7r{)l2V~?08Ml^6+MMsbF-dAQ3=zYkDkVRz}(xzEeppYof|DH zy92k5mz})mdX}Dpiw$j5_&iq#-(mP%iup+SP0`nch0w<`PlNi!vKm%!$Gd#D4N2Rv zV%rhHcS&JrNfFSnoQCO)0llmNjrLgsfYUCYm7)8SR}a{L&^Rf;Zxx2pDg=e&@M^!t zxD_7oconVSo;6AK1d@#b91r$u;33Hpf_cGb{mSzn0y{*|s zLX~|7J_Be>qW{3B(5%6D>wgboN}#mA2cK707|VCMv3G-gVEm=>xOksF;EK`5zoH;E zljzlSyqek)%47E^3uoM&WZUYYalvs4P7dSpY8aPSL%4A2hZ-05eWY^026--pI_sQ4f9jvyl-1A@;>8bLzJlb$M%HJ zz(pv8xk?kTd9+oc4|=)X{4*N$3uyRT#Bi+(YcW%ky8s8hOqBi#KV!iBwk6t1(`RQ~ z>F`$sb6Bu1e~S_52bXT0~)lkHrQE6RrbRXUE|4Vg1qvKbgRU zD})*S7Z}3XtP{|q9RdxnZvTcPBl9#^2-}1h%V&TKelOw#Lg*Pc&ZNJ&m#|37ID=};TFHcmjpZ3M7bo4_ za@-9#Ys=HLOHsj(VKcuiHaZkKcf?ssbib5g$wn~ROVI*{fm=}{(VEIc8|cF2)K#>j z{-Qli6CG$K{PRU;I$w0rlxN=UfdwrS&yV$Ldrdr3Z(=aD; zFkz9)oZJcRKopjHR$-9=HtSW3&9K+bQqr9h|H)c24TKOhbN2g7CqGMeY|egww? zsLQay1nYO*N&3)?n1*9eGl2d~T>%DDNQtLmW`~L|mq^V9s9+p}?JI;|1=8y7PS40-B~d-wz%iuRAX)KvQ5f zX4bbb%~k5&V+s0sxbr>_x8UsDy{ed!Zo`i(`%wgT76g?SWjyL;`ArDd_B1QdT@Cq ztZz5s4UCO=pJOvjM_Xu)xET*;-9nd$t+@KN4ck!L>0bExg`<7qPEE9nK&n+RYq{cL z4upd%E|$VT=8B6;FmPRQF&VUS#YMjWCKlEh*w$JVCK^sog*d({%<=m)shH#M4{>}9 z-K9z8(Y%i%k*9%iZBn_SqIv`{8N!5W1Ul6V35&=K)pUYNuO;IP$yWC%JeBLyLD+he z;$A#FvI}pT+^^ftDpocoI*(wnXH&jz5)YgfI48#&VG?-`ogr{N5h6b))k@0`E2rWi4c)a-RgY%I&CNXF z?97jzGc*4_4@2ID%rz66cuHsf?{b2FrU%79s9F=-$l4O;H4W*1F|i4|dSdI4W=+w! 
zBx6{R_yjWIQ)-MKT&TcnEd#_?8X_w`jVTB3V|l@gDX`2kS0X=Do9cOLaCj&W4iDwQ zp`q-f-Xca1WuCC~&%@x1Bt3DGk4VOzh$SDUAk=KT6OPj~-ExmRyGTK2fI?Ixe#2pq z-=P#A*GTabH$}K^%1C(tw3SARC(r(2r22=4O8*d2CiT#$xO1mQB`E?GGF?pKgon3W zH#>8d=Pa&o%zID(Szie?HjH83V!9%5>5mM4?RD+>H*YV;>Pqt%bTBW)p8X|=xo+Fd7xbqN4$ugd@WdhYlxUsB5Eo2gP zkSV$nR(F#jkrE+YxaA!~ixv-+i#mElnb0{|>aaG`5<9;k*A{$8L;2K6QUQ?-uo4nbSkDI$ZvvK8#PywqeFAynu#pI#9aOJt- z%CjRX$Biks@+{O1(?f{8ajF{FVQOuik$HeR>{cuBh@wZz@h>NO1N!*PV; zC6 zqiV_otTz5pI35IzY@T>nyhswgH%jSufUJ$)y&ON*(ZWt;GDwu{IPDol&n zLl-*V=UVLpbJC)Ar+N4crNO2TQHTR8B!}S6-cY=bGYkV}B$dcf7&N1C^Lh+M>o~ed zj;G7yM7mN=qHE+7+9;>eCOMsMmNRIFoJmi}*|;z>hhCHO>0P;qK9q~;Yk2|vg74pW zZO4!+gjcQ<337En)Wda(2DnI3B(D%{w{$wlYs4VAUW}9*#AI11X5l^D6*%s` z5{KH?Aa0$!MO-cM#HQRKcFNnu0eOdbRNgI~k@t$1Thqd=3A-Dc_fG%OB)B@&tYf z;S^qnH{>UVSAJ^5$j^*A@^d3weqq#;TDFTgSU)a;VpCVk_9>JblI@>B&a>c%6=NX# z*_S9rQd5x)UxN4nPr2sM?E!duh;AYmOZRxu2KMWE7)Wu#D~^ghE-Zh*p7y1%e#OZ* zVOr7vLEP4&K2oDW;!>RU;|<#O9^za{|7yu3Q^D(VdDD>_YC>6Y}x(4?F8Y9(ssHJt-Jt~}Y z?eQV_sY(91N$Tk4m^;8qaOPg1VuL5zSf5#q_Ce0>LD%E(xxf#Si32!LdJy`^LzucB zg(mSZ^%0NI5OLV?R#=x~AK;n_YaXN|UQPtJK8v;Rb3pZZ$L@+3nqPKWbh8>bcXhSO zJg;L5{SD`%uGT=$+sOS6CymhCTa&gf$=!n)Z;$C5+waYARt^D5O?y~z!FRe@jwiY= zQ0fho`a$0q=q&GJ^~MRhr~6oA#R#WCU#l`{4B(AL?c*s^On@#r5sTnS&WnAm>LUcCrEIdd#vrECmXu?}j94gT8C`q|fDIjUL#D3_qB0JRe< z@jBlsXkx3ejJ}NKi7RNKSnK#kSbN1)&LbnN8?vrL-Rn_zC8gsAM>dux4Y52a6Pu`m zxY?OE(z*e!&l{saBk(rP$0cD=l8afOzPQU-J<7V)hqA90W{V)-}jyr4kUDG!# z!mmGMaE7=1NPF9&ccNfKdl$b}M(=aX2PS>!hxPL#AAL-pFx*e0ux0-lJD>Yu6#J5$ zuei|HCVgYlw|)r4?=1QrCMQ42{Akfn4Ct82XTw|2FD9Q1Z-q1Bt&qy+!#VYjDAubX zZTO3I><}w_FuXmz#5qoz`fRv~r=MS)Yvtiha2)CyxY9~?&}V+Hi47S_))DY4{7R>@ zVWov%Jf3GY5T4-TdDitt6waaxq6~Q-O>>Vz z@4DEUXUrFVCu_M?7B`rN*fgFdm?Fx?u}<6F@LIFn>Tg^_Yn?ZiTWu1fZRqk>LZ0f~ zBpYj_El$n~E30_ENH9gB4K;LvEs{jCEmB0P4MlSe@U}%d(13OaTvm%rC%%)JS{d*a zqV6oG-6d9CV>6C-j=jWcWlTp+alNc^3#f~1K`ou_-K=~J*?1#I5pA72a$GXJ{_hY6Pg7W)`oOc3xfqK5V{ioAlk42Gm3 zn6xj1pDhO*vlt2r|6>>zhhgxqhPp7u0mgFpGsrkYy(k~1yA>v%QpBKLi8-?)mOH)> 
zd04k!jU7J5*}TFU0-VQOVdW2DoZTgOMHuG_KG2B#j7v?Liwmb+jx??ptAqHsm{0|{ z{R%p+g}cys`wFWcDrm6QYSxA;aMy(GLKQIBnivMlz`Ug5&3He`ufwJyL)oy_N^Wz$ zPGb5U!pZCEf}VeX;ur>1eOP~?^YL0MWiS_aw*?;ux=&~jZj-#>=6ZGay3KX*;xq;( zVWZY90^DoRDJ)rUaC)z^CJs-5#$U7(zd6cB`%Dht)MKHsAFna&!(qyURLs9LO3{4} zQInBHK`J@?AbKkWzmz%?KxWcx{Uy+H9n?$((=C2*4nMwLa-~%ico$ORJ!E(nZ1x^H z|08t&$H@8xGJgi@d=8`Hm%1&y7HtX1do9?=fH7bpekKz4dr{GR{Lba|Xu*{jDV5k& zWfI=-53+Ggbq^G=S+S)_H-ddOiW8KFQ@{>>rWI=AO@Fbe9IJM47(1J=l4YsG>;$-d zfCa;kO=y}>iuR|}A>RQibC)MIJ@`XJ561oj6^>z4ZpQQhf(VdBu#n(7oWNC9YM@z= zvJPX})Vxsb*AH|X2Qi^T$HOnMwit=M;Fh}&Q7iVe=Bizy>J>d~gScP>z{f!&T!yB< zsUb}LC9o&fFjeb+pu{oN)KP@j?y^G6j+?Z!EyV0aS6i8ZHmpw?x))u0AZ# zgyD1LBKug-u(*hUfrL4PeJp5{9^5&h7#1)5k4y58C%Ywa6+hJ6+rb3evHR^Jw*#vO zMlPoYxj>pa2%Qa^j_nV^NJB@c<2mpx4&#f$VRUoC=;nmc%?YEM1L1#X2whh$a!D6m zF6p8h;G&E#10GDcG#bMEE{*x!A{hp7F?gif%Z1gT58s&0n@`Xh}rHTW#t zwfMjePK>CCz&|gHe;%+1@I1t)6sqZ+7is_>XI_R88A)$0>NbFjx((oxHT1qGt}{5F z&kFG9>WnW#_ki^)YViOpfy@J#PIZodFjq({;^yNnx{u*G@hd#ZQ)@5Lzs&ivpzcvJV^GVjN30B3&nB^=ZL zx6DBtG~mqmDUam9zh!2-E;83@uwZKQwp#T`jdM^7Lui!FbqI$SIQJvi9_61kH(|Ce z$9|uMskywI?K6;*t`Fs*feMz217< zy-0ywFM~EXVtwmg9(k}^>_^Y#;FH`wzT>x~?JIVJlWtCO2M`}L!xpJFt9D*4W5veW4s%XF?hW+mm|R}oFZ zu}@q+U?-PviPU3Xo{2}xseyZGXx;-j4XO)`{TUa63V*R0IhPNx;+?_2Sk030y!zn9 z6DQB^H*vv`iIe6`#Rf4>uRH60v9i-GJb+FOdAFp%CY)clu~><-!RCdPKTl0`Vcz#E z*4H2YW))`$G71`v66BpSb_flsz;A*fgc}c! 
zwl_LtW$vKdqO3Ch)oxsW*hE&EIzT$$8GfmO-@g9XmFZZpW5Qsqt}Qgbc9eG`#LKuz^>Pd!ZEYFY&pY-L{8+V2@m3^_+brOYSA)_F66E4i=rR^mqVAZ=n`I8;^3&=oK{-2DvFg?*l?W<1*#HoY$#4Q+Bk^@$Dx}{y2ggm zvDt?H(^RjbobPjd|24e@%r^JuOwKdJ~f=IIU_bKOpTq|z4UHcpY5q>)ig|yX% zghuJz>_FVwIH|QS*sq>%k8mD4ZIwH-S9@(|eIiEp@cO=Jna3AFn4K&Pq+)yfY>5mZ1nMK+bq9RZC-z=cKQ0*uk9UltYYNnWCfOAHR@B^foE zHon9KqN9!y#l&dLXbes!v&3;hGs%29G5Va0Ixf8b)V#NpY9f4| z!RVq%=!eOOo(^Fd_+K`#ndsJ85SWUc`DvJmrbBQBL}x>ICY<#wNKMDYFau{avaur| z6FzsAEH(aaI0R*fvmA}Gn6kmhMv{}&K!g=w(+k)Lds+))3)3JEeQ=jwftln;)JrZx zacBM`rN3FpGb{OgUk)x|KS>#l|nD=ECUD1fG*6NC8&{Ojrz~1(OpOylSUbf zcY=*kg`tOzMP3a!ccB<+p)9^Bxr}d9Jn?bIt2hxh)LXThK5r=zZHfioedKW*KODaa z02e260uL#@+x_}v86*9IvP|)xDVBXo@Swv=2>XViMX1n-c)zfNIApqZpa^S>aL9Wv z;WNF=NWIJi2#LbyvUKb^eoINlsUSH6h{KP*E(%Um9mZOml4VK^HU^sxDt5u?=AB*V z1I=$L@tUr|M#yCi0sw22Di^Ven7PHc^2|+_u^vj`uu*@9#yViEUfV>~UE_B5G6oIy z0NmjL_@EW_fd`O^5kDP`0*f6}P~TI*o(6}Y4F8tHaj(EWlIdt%m2m5-&?2gpRcOk5 zMcsBZyB#pdA7B+}KPHN8K%Kyx@(ZONyWo~!L)}tLcb2)hVTVQ}Hp2IZsY^sx9g8n} z+^uEl|A#_9GWbL@DBtl4@4U%}RwlWrqWt(<74c2Toj45-hTp1R!M`|0 zx)x)}-2XR{h4FakR)R3OOsBgEB9eVA3QiQE_yR-c2a7nGnJE0DSi`$f2mXrY$4w9x z!}-{(rwkI>bS2A&P9+LIDdYkR$v0`7bYDsyEc_x_(79$)(ms4oXUcQRQEb*jR6SUf zO3=2MCmRSJ0FxecnmPcID#cz3`DGYJ4<;T&?2pZQY!D|LOYXxpb(SYVoxW2!G=wfE z7=7sOXyGTN{8;@U{3z;`DaI}WZ=2PvvMIVUmV%RoXA~EC&90*T(_jmk2I!EkH9-1P zS+eLSbKj7R80?Xev^{c>XnJ z5T%NES^j^S`QLH7&=8q;M7BRRYcqAGiX162Dh)B$zJr*NG$TzDx0nOHuOTp)cBTpS zB8TG~n0=1}n@S#-=%F3$I|PirzSKDc^)hydCSxXtUi9P8fpq#PR+?MDl8P17gxz@x z1;C_Ntb(o#(G-guiuk=EIes+F9I93Hmxjd!H{sVZ#o9B3x6Qf~e>IsE=ty#LPaQnF zzIw@$o`r9|Zutr4xA7shXDfNHK#lxps0fk`>ZzsfpPJzCG9~NMxmTz{OVUM*gf@Qg z+pY%eWn+rcL5nI>=$>>DAT$1=GUvei@VS{Hbx?P@k}d|z8u1;5jNg2jI@56=Y}_!> zN2c;Wn~q%b!Z$9aI9=4|w{t4~)c&rEg+`@n1@(4E?6GHHx0rYyV1I1ZGc79q@54lZ zOt6a+TzW(ewH$LX(Zx2cx@6K{-rM`(1+*ciaJwA5ZPxbV-6E~c&}x2L2IPHxQe_!W z;Rwu`$-+}@z|0E$tIArr%*BU(H%W^%F%ztReXFw0MVY+l)l7}G6)YIHwoEZh7GvoI zk9}5D4AVfJLbJ&?OJvBTin2tgTYNu*LK_oY0%=v2*7&N&ws^I`{xRuu^Dp`^OC`Z_y=gW$(tO>zoqv4@Oczsx 
zT`Rn8*5&mQPObYfmu|7+V?%aqu^Z}m9sb5MjL+ox8-)*t9^b8i9?=bkSZY}bXOBlsG4c0v`(v~AZ8p%hy{-Yxo6uHUM~Ikc zuDCx;`Y5ywiZbbklaJqY;C(4%Bubido-|T~x_-J{V?_%=wakJjYGhXD4g+--ia=U) z0ww8IT*Z0Bja+pTwa`;jf!w2AxCt!Nr1xXU0wXf!r3YZ+G z1Ref<9^3A7cVS?yi2pY@wd!z?WOYxIG=C*Zhi7?_#sRKbb8NP&T;Xv__m9;+Rj+b z;876!s|IETlJ^W_FPb_^E6iGEb?s4E?X9j}^uC>Sj9I@pN-3j7AF`Y_cu^NvGIK}i z(FuFO={w=~Uok#OqQ-7s5t9iOnFj0BT=i#0g*D%G^_Cq)!e3d_HmGT@h4!mY}S?K z4ty_)EEZ7`@4Wc1r|C4zb>IczDnt{BbZvg(ybiHvq#COlX;HDOpyU@3iQOuhlJk

        Nsl*BHSv6A+7|8a{F@L zEZ5VE_c*IYD$T{1`N;^aGp{&QI6M%|f=O=|O;;uUjpaTR^*1E3vQ%r5YfGU*^$q6w zQ}SKR_|BASRd|J2jki^nWfD3#bfUyb82H9NFjC%QV;VI~(&Vf|u$%W|qe7jYrT{uV zNrcJXd6~mU3zG~Fqmaq)%edF3Oh)*d-uCeN$y%aT4i89E!viQP-PA|*Z8_36U>b_X z(fTJ~k-eYLk`8}@^^~EZ^m>uW%cuLSgySt{#KOrOZ@5hT90yz}qm(Hk|M>2mwrdn&f66_=8TX_{gY)1X-3``uXQT1{pWweWt} zvOZis@(#Wa;h<5^oNnE#6;|?oRO*FkT8VL`x{6j4MSpGXLy={|Lv3ee2+l$9w*BVb z)NvS%YFU}^m%P`Q_tJ+>USHVIyW*P&3`G5V(xm%WR=`u%rQSef163Lr8;ZsIGGU*OT!D5*2Xu zo0;wy$e8qv>tPzBVySWjF8kY>VK$Oaou<*-(~)Xfk;-p*0;%4dE_|iDgZIwd%ZCYJ z8mLQk%Vfrf(S|Zy^;XDBBiy~pXFyck3}yvUOBHHS+FG_?C69OXvabiAxEznonl^{d z&wz)?0=#EJz`Cb6AdD{8Ld}R;w7Kb(ny%H%-(JuBAZlN1cBd^fwGpFtomev;E+!NF zZO3D?K3>cUp+)t_t%j?TjDb#nFmx8=9b2aIOJ-@NY9WW#tyM!?o6LT4CLry)^YXo( z%NoDksN!+?oez216%MV0sNGvsRw>`9GZmNm$h_7v>-cV!mAnu2vdhjo&MdE1m6e;1 zHrEA~s(`WKtM8FHrv8phvQ7H^0ZOS7X%dxJ`{0$)SPEnsqC-7As7f92x!IdMJ~n$$ z`w^6|qcwb9g`_u~bbz9qtb}T?)}2>bRn;OwX2Z&%5C5cwUfJm4?=0Pm%o=?`W!<`j zdxChPD;3n;LZnl0?Ijie;7M(0|EMN?-D*@SQ=txe^NI?^O+KC!b#Bjiz53DO*=SuW2Dp3B$k|%4>vo0fl(?0~_*|JQ-Og|w z>c@0CKU++cDCbUX{GlWr>QX+-8Ys7I%Qh!BAM=)L+Biq_HryC5`EtKCuSZI8*!H~s zCd!7XKqs!cOcOU5n_d%(HN0*Yv<}}oqZ=N(Y%S7Tmx=HO=n3#3nR0ZIzMG>)>X_#q zNW+tKF9rze^vd=c?`aHCP6~FGr%H7tz zC|c5%dR_I$h6<2=EhXSxHPWq(?%vM)wU+)fI041R6r)SO@DSsJ=u#W-Vu4n4*A3qF zHZyuO2rC+n$7bEvCh^TDP#iNBXl7*b0;s$7geocbdz0X;AnKqIU(kO1zDZG-^fq@l z*8UcVELlFz4I(|g;2&==DwubRcv4lZW`m!uMS^}ma)Ou*-*b2SncsA;b5AUe5-Lo3 zQ#{TY&5QQl#|Z^dr{yD}4r1TE;}Gjbm0(E?8#WvomWMnr#W~7@ZQd?H8|#F;51l?| t0Vx&^z|FQn_;7Ej*NW7+Kcc6ApZryPMZNIH4!t(z54rd@-VfuQ@_)IzV7dSR diff --git a/settings/repository/net.sf/sam-1.90.1442.xml b/settings/repository/net.sf/sam-1.90.1442.xml new file mode 100644 index 000000000..918ea6ff0 --- /dev/null +++ b/settings/repository/net.sf/sam-1.90.1442.xml @@ -0,0 +1,3 @@ + + + diff --git a/settings/repository/org.broad/tribble-1.84.1337.jar b/settings/repository/org.broad/tribble-1.90.1442.jar similarity index 74% rename from settings/repository/org.broad/tribble-1.84.1337.jar rename to 
settings/repository/org.broad/tribble-1.90.1442.jar index a4c33610184b50fa34f6aa6da9595645945da7bf..75b4c2fc5b4e143c6eaf8ae07f8a9373ab16db93 100644 GIT binary patch delta 30369 zcma)l2VfLc*Y-IxyV;UWLN<*wNT?wRB(y*Rp?3&`PUrywgpxoAO@swRQHlbKUaEwm zSPesOzwi6eow;-Cx#ymC&pk6a`hCc=8>$86_S7v- zB0JHGh3i)bCD~snT;E~thwhlw+5NfM>ecP_qsYHh)42UvS8m>#+)(cO`|xtBB({GP zVBGg#3A`L+lpKw|WG1}zo9jl-+2rjk3GejOn?}a;hJR>AVp96OmQa**cxvIf_Lnpw z`PKDqtskML_2eM6;G!Pj-P6snzj5<-%ofg;VMbs36EgdS1WVI{%If_rl<2ciwe?@E zy0@&A+i16cPq!aJjm}q=_FZ{QCz4-H@9ME}`(GWJt8I_?wvNWo&n7Ikz36% zvhzEdiF?NWZO*_m;U6<+{-iZ}kc|52iMdmop}%G-WC}-nN9TunN6&NbA5zrGY_Vl} z8#8gGR5ue}&7Np8#_8+H3@{Qa?;RUuB!1nr!)+usJo{#tB)mm0wzSJIKXfmAVE^tH zKesySPyp4XK&t26@al@tMifSARFl%F7B!+IYV2+C+6P9%bb;jY#L7ssF3E+ zWSWl}i)jYkL-S|}t)iu*=ziKp573jej1JHWdY)F(3$%t_Aupz)=rXONuV^D>de?n& zh0DBiul&&Yag4r`qR=LRo&vB>18=)9`ZLspo~1#w7o+Z@Nwgp1AE0~bAo}v5kHfT; zo}(Qody0oG_rRIuH;KkZRFX z09*ug6?@10I4^WI>dzrJ%|*#P;CsIJ>W@RL_hJ?9^FDet$~)$23+sb;dI(QltqQf* zdB3|_5>ZA$m`nsvSBth&3MQ3Kk9(*8G?bt9zW!5dKJ0b;+>cLrhyUD_U-RzxdARjF z)dpQyuZ`fhy<@LsTQ8%p_q>l?YiH2O6@3or|-t?RGd4!Rgdb1Ue_da>ED;MK=ap)GR z&1Gn@6)m<=ci!P$@^2A8VbrzXO0w?47|(c{-!c(#MlD&ifBvl%R!2RsNo_1ml5$&l zD5t2YR_<&~1KQG+&&socnv#W@fsZmMhFYks0G=L^g{5kRHmxa|vO%3~s2;UdCjUqGiXQcHS?+R@82 zgih0FdR2YciJKWX2?1G zY6Fb70>)cYdt5sM=YJ08lDeHSoi13CuEf-Uh%`#R9ixNSEgj~2a0GX?axL}F7~WF1 z0eVhGlT?s@Lo93~z@LUO(kX`;Q-4f)0u;rDu~`2s^xg`{YE9vkt>VXVhtN)dsWY;= zV5;39AG@n*UvMrdd%gmetq z97DFDrWm>zhluLFcvLt26uBBsR{GBOf!awY_#?-U&unP?XJ0#@k?Lh0uFV;PU)>^`WTi%ImyvJSvv4n&Q?pw=OfqIr;_LqM;?peDxAF{qtW zP%c-nB%eb4d`X3B%?f^*o0`KEujKUdv~MLR8i4I=e!%IwvXU2A8+60Wx&!AuK+!z` zNiQsS4(PKtpy>kw$OX3hg2ei%rCxr72dTafbA)QTmXmmx8on0u8LKv|<<>mGEcszA z*XPM*y1~Op7fZS|&oHyLOV(^NeMQpq%yj%CNPEom&`0=q!zF;}Uchu8)qsL*1O?d| z3bH#CS8W#!_s?LLG;|ISD~?4(je(h??)QaDUJYj-pbIhCZlI zbGQS|hNxL`dNa|6tvjgUc`u#lV4<)hSrwy3}(+?StFBaU!a-lb|jhxL5! 
z2++UBO#dnA1CoXYHM6Eahpgw#^lnL?G}GTd$4SPNW;zoNz-K89TQ1S0tgnEg{s_A>~O&%p*?f#|*lj=q89{T2)R zz3<}l?Bq7|I~x80T>J^N{{>O>H*j$s-2V?K_68{Q77by7l4Tml8WgQg#Zbm~v6bet zO~oGP$D>1`-`o_&VHCsR(9sc82kB(+;rqvV5yz8*1y2thYo^<9!w2*2&{%Y8pcC?55LH@xp_X> zc?|N$Vz>fo%j40_MC!?fpq@#zl&6rFr_vTKrtLh9KH?I(!lk|+U*Z(D&Y=LFtKv^{ zFYA0jw?Gx024&x;UO3GII8{ZR;bw+dZuoGo$@fFY$gj4sTRHi;9_m|shDYl`^*~Vd zL0?HA{08c$*Ekp0==(aTX5d*)<)*&HXL*gWhw(+d!ShVfw{=D1Y_;J$_YHDDeC)H) z0VlfIqy9S2tz#|pf)fhxr9e7CFWc#~6rHitE4ZsZ7kD&Ve0whNn;hyIUyxrqv!tM7 z@yw!9Chnm;Z*h0u2XAo?husC1%eS#ZhUEvb2$!@o_v%nc|NkLWnXfw_|_P`8Rpj<^J@V)+!7Ke6XcnV8Mjfde#rwI9YEq8 zLC~F4y{|ZjyQtEyIFq}o-CqIAx_bXB&dLfnMDD{Bc7(!pinh8Vk`EBFBCBLYnOW7V zWJM!OSG~XHm{z@@@Nytea-n7W0+0a^$pdKw#N9N=nuQSeMVJW)5Z8OaCDV*~-22&~OQvErMF71Rvb+>snOl((SlbI=X)lGDvjNQZI4xA4 ze8ZaqbZ0)$qZWV5#T=l1`Ih^{-;a&XatOKy(RMYqEoo^J%b3Rqu*_5KE6Jg0v7 zfxT7>+KyGre&jeqi$3`yUoljyZ^bo8Y(q%5+5hHyzSLjY?us4-@jV*$c0L7zJHo&n z35-1$7pR~yfu)!Bl z3-&yGPJ5h}sF|$gIPM2eKY$^Ysk5y0lZ|(%KQwK5 z{L>W3&pyk(ccM%(?X@UVYlzM>e`=?)<#&W-oBsHv>5ixhxNGt{N@1!i$JT3Az7Ay^OmB| zWz-3j)&s;c0Ms@DYIGdz>nYg&mO_cmhC-VQMRqqf%nLw?%dtIP3B9L4jq6cwD^gFQ z-Yz`t#(ggo-gvOcG|aOT?3u~v$)X-;tbNZ+g$^CM}y9ZhR#V~8;H;j zg<*q+uu}#Hs%g!&4UQ-fQg!V3qJ3c*Af~=G&?Eu7xkL)*TF^y}79=OQlXYAY6s>xcEZ*G=$))JbAwMoYcH-_QR z1O`r1HL$IAUqlA7TOhk7vNMsL1v%A9{oGdD5dqBycDKRkj(~9y16h+nUDT89v~k>1 z1-I8q>gJ+GU#R_l5TN}*X#+q|gRs3A45MTSy3IqE!_;H#wIw`CC3n#J!<1amLF>!Y z)P)XOckJ(CI%+iy0boCIPjz4Wj+kI*Er67awN0f+KpP84Z}!qUI9g(`Oo*&3U$-2s zC1jLLcne`okI-KtIqxb)>g; zFmyOrZv^;fBpMG#qtVLKN1Goy5u`f_(rz;FF$KgmRR!g0eR#SWkqhRTsW#+-dF!b6 zarc?vU!bq+ps#5bd%<4b5b2GkdibP zHPM2ojTS<=S~VJ|h0CHi=|gJ_}jF2dQYoCpMkfcMroJg z4+GL8fa6oZ@EL^g=U|O5s3Cs|w)jdpM*C&i?+q>;@nw(J-f|h_mxM)4humolHP%EW zOx1FOGN7GXKs&cYR;HSdEbgGrP1S}%&N;Ds!BCo!ps_^kQZkFQS=goQDbgN}9*VAp zp=1Q))ku)VXw=EaI*wH%i?y=QDQG+ujf+9H(_ke}S0U51^%3*YY5}O(13F!VwO_DBzP-iFV zJmEu~x7qUy*kCur-X4(dUUa<=d*uCKf&&nD2O((>K{@*{_+bdbBM@pw;of>4Jsg8* zJAu-Zu$)f8ynd11gahG2I1s*q1L0>n1M~P5h@e+F1TrFqUgw(h2G^nUkQ*1c6}<_A 
z^dff;piA7J-ci0%Z63d`n$Fb98h!$%xdKZ66!`rNI;ak_wB^nJLc71w?mF82gLXI3?q9UKMXewcyTBIj3xM(<3r9j0j)gO9 zA_pt`EGqppY0y!dTF894Mp)^e3bYq+$ompexs(IY(O<)EpW(1BQAFG|yDq=V;3utzqA1!|-mSGUjSMV682hs||4UM8#f^ z!a3^Wx!SX#{qWcyj{{*m41%RTOg(y+_JE@R+2fHtL8Z^riX*1saXKDL0N@PR5T#(F zS?a(%ZCB!hklw4nVW0zE3m$v~Hu^dU{tck+P0;;Y)S|n!*^bB1!{g{-r{ekA0>^GV zK8wda^4LJ_ov&qZUG>d;EzJ;c!|nCA`f4uFp0Myr^~z#x1g}!{?tv^_sg~nr2$R!} z&b2S#d_aEfD}~AEZ}sY`_Z#f+?6fA*!zYjFK6iV)X2<``9N3R5*JPf1p5io^< z`dx?AdMMcquijx#V` zUQ>%6);1eBxsdkO+j9bmbnRTOh96s-;_C8+C)#)E#j4z)qkilt&I|y0^NqMSIZE7ohh8ECZCcOuHC81Qaj2 zYbcb;FtiyC1u_B^Mk>!%ZK?ql|MT+pr7+m#mtpR!x@a|g>D#pFI^V6f?$jcb?=daI zVR|~kXsH_UI21$B<62k8GKi+-K*b8l%e6iPs8^|frTEpywREc&&uitmfR!4z6MAQ( zbZ9uJ)N4DnEQ6qSa_|p6*Av>kAwlbbfd?U&)tbp#_GP~DGcKQ})NoE-tqjzJF}2hL7_ zbWUQ8UjX(_L+YM^=i(KJvsWQ7Ujy;H4#D__Dtb;kZM}r1@2Je9Aj$XC!lPOT>qh|b zV|C`J_Du87K{H>V-!EY`d<8N3HIV-;6#4fc%O9W-euVvC$Ruq0eg=jAN;By<_2l!~ zt;|2s^e;618-n~gn*R$M;1;BchRwJR(PW`MY^8w=M=``xF~m~|^5(G9_vtas&Ot%o z(_rZ05bPAHsXvZu%Yx!yvc-e{Yk-oI9dHoT zgrus)z0p@+_$c~8A`YZ%h^clw#P{wCS`BVK9L+~Sua88%(UgjPP;=~q`olvo9P(>C zPoM?x5ZuF)XgN=%9b7~^VNWH$q&?v%1(-82%q;c6OIX~MD)nV7LwiFfVzFi39sBL(T8g!PhM5 zDH}cQk*r-#km)ml5G82)tmN#G4))6E@HFgqqVfT0aZpl+?Bui2VJ9f`IjMHkM$g;n zm?V!oL8>Pub;=21eNj>`Iid1jc7krtNEffbrEI5HouJ#Zs1r}GOU*anQMc1MsdZj1 z7i{#VjowlxI%#pfm^0cYR_`|kkz9O5o8jDVql-3r+eVjsxvzp+RN)dmLLGihTV~q{ zM|DzzUTX9vEnKzT&N05(XSK#`dk6CZ^3=1hYa(KTuKi@NfL79aR4O_itLZ1lUDzJ|j>T-f&f>B4^OFBdrL zx|$xUr3Lk%u`bxy?InMv8h=hJ41C80d*Fr(2Eo5-?`K+qx_(ZpIh<}WxQbnj-I9wf zY_)NKJlI@dzo{;E&~h6)B`46uIIm#R8>gq+ILO7p93uJETpY?)^wK!JNwf6ybjfqO zIE!Fk}r^Z+@sn1rC8zo$(*Ar~)r*d5Lg2Kvy+@sn!%SrzMJe zgBudy+-v|rHi;&ydoOFL`V=7az-6so^c17qg)RtYVISD3gme+K@UNG(wvDD$DNgVg z4>zi6p*c1frTKW!Xgb=I;B;#Ads^-2nN`|Aiut<~)NguEyU()_F2BVPTKB;9umm`~ z7lQ6S_(zvQ(A`f3yv%6c4bXIjqMwC{2cQ!NwnF`pmkE_S2eYyQOD(Yx8VCgufg$EX zZL!lPS%FNaU&JwRrIGe)X@rfYmL=z zqu%^eyV$-bhV6~4Tr}zj(E7vSIS}CG!PFTF8*3O${^2lwMj>Y${wsjxHy+5J05v`d z`=2T5%fGb895d0yEc7}@dH>c%I_AS%Spajz+}N)vy6_UZNH@!47;BCd3v* 
zM3mtiVJoEE1CV3eeJlRa?qSE{*i-Fu5NU{93B^(R#c$J-)gs`cV%!*iB#MwARJ6>Zy;M`tZ;s&{hMWG8srpf$^8BdIaj? z{oWX;50bM0*f_E?4FfIW2v%t>{bJ{~Xw(i&)gHEQ2bfx&unwJphpy z+p{-t(FYr=e!z8q2$ccA?LZhYL%?i#Dkex*aEI&*((~br3lG+3M@#}>lQG^DP+$=( ztzwKhLzM;V;~cXw&KzXVRY4*8+J@p~TY{{6QR_ZLE!+>(Jb)orVhJ9EUwah*TmzuI zScfsQB9$y1<` zr!kvdfczOy%5KbI59Y8JfE@s19>nm6u)2p~2^>M6PhrK5s$W9&>vfL=xRYtxuq|&mcEg@w?bW<2*75`s zr}*r3^?x;Z%xp<|8za~IV$bWooFqL&=YDELik^+z#uUAuLBzH9-WoLqdkXnARiSnD zWR=@c4^Vr4LJUN+RDFvc)EbnM4WzXJp4+M`X?j6Wdt`M0!*+!E)JgSA*B^kp_``I4 z9yZQH8tdJRIItP-l|J=Z3S3p{nZ|l6&vqxaem5Mr+?3S6PEq1WF*|88YbanHkuR{E z*eVYJQfAW=Al;m7XNMC9c!3V+{2)mM%OylE)odK<1ll#J<(5>KT*9TJ2?ZHyHoW57-s!*OznmrD(~)Rcx8ErAoAFpFzBp-yT`vpP1eD;4U&uH*U+ zT#{_uKt@P*Vu@2ECsi6Zv~eRR3|mbuX?9MRy9WS`;l^@lBE2=0OEVicm;N(kq!w~% zDVI#i%aT+p-E-^KEos!+#@SA61KLP-TL)6@WpI)Zx%78{<%I3?O52byr_U>p{LBn(EOyWI;d+ z{Va#7w_51ioI`Cq77K5qr__R$`u&zDte0|T>QC4Z--G{$sw0{DPWU$Rv-I7bMu@ zh8b&hi7lZ%+@?pTyI}YLmbr4dOD^-|GN0zy_%0XE;jhcR^L-kq{h4l9eSt>gjr zZ7XT;hzmLcbvM!`#NA116+I-~DI2eI@p|6ig7SDsO$)(Tm}j`R-5@*Wx`sEpWVhsk z@cqEWn|QMdYpb^keALBT$m`-V-l{%{(vzero-yGUr0SP0-bOEYw=Ryva@-3e;#zm~ z`*gCN?E9&$e%$IA17$}LV46?_QY4gst0pUE+_7lJ9qVS?v2?~A)MebUipE_GMg-E;^9CzW8h}TFF)U_WB>Exv zr+^jQB?XKjZ?$UhvR=o-GG9C{M^Ojxxm0#0r=-9wj?JjT&r9`9V;{se;$>w&c zkGtxxdSph8U@j2s6f4Y`AlNF^VG_h*H&Y8{UMK8iCSplfU{3q6q?fU%GJG4LmOvLE zRoo@@C{lv#_hIstVR#L7w<1r60kQ{9@i~}=l`aW|W zWpS#*3VH(dEKsgb!m_ElI1YFzU=*3_KF~i{bT#{b9nJmB9iuJ8R2Bh!i>r(l?H|qh zUq^!g@ej5VcUi+!&ru3Pd_#CItbRmZ_C^YetLEH7K`C)-5Rou==dJJJHeu(o2a}K5 zg9f@mhbD2{haJiNu*)6*HkZMoT3&@XQvHC~LF!M!i_2io8d!@t7+CNhMEVV~+sLyQ z`zpBCs!}Y3oe8Es1j`Wuemt0hL$AF;tSLTn`~5F*YJ#NdQMX`bStSqcf~aEWGp9?qn%AevBF%q+~tA5@9i( zx6xT+1UsIRBP>U$K^~YerN8;uqyq3%J;NGdjj(T{VW{tk&TvSxW7IIqsb>U41VlTJ zZ=-t1Yt$mpxCTW#BLa?6!@T6kz{nsUrS%6$J8xa~cYJ_SBOHJ@+9@zEv7%^)zeqZM zKJXx=8y#2FPsac;{wL1?N{(ddK<I1NWzBFYlq#d;psV5hv?FVygk>0Ss!#&~OGq;5g8iC(01mf#@>S z5ZFN&te0`@=D?7!<*{XP0-)!yZJC3{pwx1qh}3hy0O+1~VrH{I(Qe*uhyWc&QmqX6 zByQwv9NURhn#t`9NfiK>Y#;PEhio2$Rhp8^dJ7PNTaMZ2xQ$NO=p>{T#MVF5+ZucW 
z#^+5ik&Ol>|4lLO>s0SfemZ6H6JR$CN$|%}Y64biiiHLjHFGz|Bg0>2o6|ZHngM2 zAK)Pdqrez3$HKvz)le%%0naCZzNZW9)Adak%fKqcu!lT z4pTeJxSeFt`$Su<5tfYPf^-)J8Durh)Aru@^SaC=p-qmm% zRMdt`r#lWS#^A7GGCZx7YObgP{30BTh;pw@Jw3)uG&COr_7FW6E{9x5iJmK@v*0mP z&pk)>TcGFwgLYmg29@j}YG#h_$w!3Mgs#NUMfqrhx}rNr;4 ztcR#eUWR28IU+0?f~uG391l`SD5JWmkht0w3QmD!1%0>KNe(2jPFR_P)O9DnDm4?! zXl!cg|4hAW5If=mYR2fx>-geeVH1apGl+nESY5`>&4|(0Nd@5NEJuky9iYwZ2 zh}v$W9#B!>=w8(_cTh`zT6ONHCwhclM?;6$psValY|sSzQU%nA0CNUU`vtANdzHY=1))U5Bdq2fM2q5U>A&U2cJ@iLOFw{*9Bp zTdd(b3QL)>BLF`TaGUrRv z{L}`3HIhBCO|tC!s2|jWBQ8D~R#pFuplCZxq~niLlem2};4brND}@^g^T{$UZ6^gA zX`@bBi-4UJ;Ajy5#&u;lBb*UI+bCJeQ^BszMuvF_E?Tw)Shs&+Btf9fJr&j^kPMlai_QDi{0H6pml0UC6e^z~Fd+7jVf;$p*o zG)jClxc?GZEVnK!apJl&+M-9e4$}}zhSQ^G1Oi&uHtK+8fzk=?&Wg_JEeT|!z@M9U zkn*ZN4?0MLj82b51VRM4(52a3Q)yrsSEa)|K(~P+{Q3U=Jm}Eu;B_>u0LB~xz*MU? z&tEeS$+a6O5|4pR{CP6^PWu0b@o^JI6VJiAfZ9N&-pJ_gG$lj_M>tKinK%$I3|znj zSKkcQYkG#}CC7qJf_*fsQuiD3i_JLw5FQJwZ5$%HsHZFo?TMVmp8oh%sqwkTvhWtOd1hT9Ya%?8og2$*X&xZ4O4tHixDffg^ zsxQyyVQ@*^i~DkTrPkpMhL`yseubCtW#oQ?ysPjh{RY?3KfGMC@(L{(_jG%(3)?0Il^v2IC)+@GXI#BqbYm zm~%+UMqJ0tPT&m>7@gE#V}I9ntrvZQ(`h@TR%e8nm4OHI5Q`|X&k3n)n#O^M^16vr zojEXps%x$3bEJY$D-Wg*cIdF1J;V_dcMDsOU~s%4*yLRG3m78+3-I)?SZ)EDpz>1K zK9z+cvhFN_Rf}e~sD4EWEN*|R^0LZS9=fe4;1(eD_lRb<7v6%q!w)6OZtoRCrb5P5 z0qDOp5Bkrp4aWu=x3${St=6WstT!kGEmN#vVx7aOX397kgqeybAYA+#!*h~T^+u_d zMyb|DhiF8lYg`4bG-8*yrD&<9M8teh;cetj%Y74MXQQ9~B6kNHq9#xmPF`GPhcg9t+2Djh!#Gb< zu-ve_E}xYR=nSoxmEm<^?AZ7aCUt~DVFbG&yk)Z>dmT6K<-mk_{A(l7*huRuL*vZ( z7t>3L%q;(!1k^}?xRqU0)lFLcieAN$`K#Y;831z&ZcFs@I}9#!3Te2@^sRv1*o>Lo z)HXMJWCJ1a6e6|7QCMZ$kvq&5->e-M*JSE#TJ;{bz_SEz{yB(3tEBuF#=3O?3~8H>nO`Q z2A)R^2IUxdB`i1}90L!e<+gKn_4x?BiE*$N+2GyGHV$|r@vEn(kawIL5J2H@V;xCxCE@ z6H7gv*s|wH!`^b~BbQvs!y!&T!ud@<8UUttVkF?WR>sLqa76xdROlYv!Hy0dhKPs6OU9Or}ah_%lN`6-u&X~Df;-*X-su`6%~}G z%^06HBY$dXaq*NHX){X;r=+1Q4IOoyQdm$_+Dcs=r}t1ZPV4bW6}>l-qX`!!&?t;x zzI;VvTsZYbCV~W1>eJlb)1;z$!r1(xgyNzpa}&lCB+Q&qFfL(sVd=yKjFT{}q`0(r 
zZ1I%Tgrefo1Q)zU6C_f}=p5%FHN?W3ByS&2c40x0JykA6a>08~Hk$5|a2O-rh^bc9 z>BQbSpZ=vK1^I|#N}yTZ-4CTUO6r(jB!gl-#}}7OO(?-hOF=?09`j2Ji;HF?6wfLs zNx&qGgCPl-im{hZ$-BB;q#8XzuN~wvPHAM|f&F^n45NlCa6&<8u76tItx3^p2I8e$ zfu+U0if0#;bj+Vop!Vi-ZSUw;I;l}hxq3ljRj}`bdLUM01`dOgdQ}~$Rf8S~eUgFl zODCGCZpAb3UYra*a7IB%_oDH|$RAr=R8%k)`FDG_7KVA#O9EBy&v+4XScab9k$SnH zX#Ih zmN2ydpb=_bJGkzv5@UI-`bqyoZ@1BGgF(PpNx_WbnI&Tj+Rw@_oRU9gO2L1@;!yZb zV!^x&=`WHR^akl6(kg*GiMZ9m<0{As4fhss=`z%pc8GL091YeDlz5Z^@{W=XzBaoX zJ9MX63>hEjBi9Kzgb5OvrDKyLLfndL7`h9`H3GX@d8xNLTwnp5|jxVpyalAMjAnAqwdwUE<)xyoK^% za_2z7hYd~zQo;ux#rXq#Ws#=QQHhRv5d8JA2uVPB157^IfWS0B()v%Zc@xhc9SItNC_XU?-2{FLdHWV6lzvk^2%e$j*j|U`$aoo9$j3-H#cl&&KLIqS29!9d*f6cN4NOOhYQxqBXV>f=eBh&DJd>VNOECO*2&xT8&rewdSX+As=06& zvcX21T{sLu(=F0haK~aC-xrlO>{U=Sp>(1P=JQqrI?H+d4sTg*gvvd~R+(CSGYL%S z;{u~S4xj0mxupd&vSkp&26ICk()SY$<-!rhlbF81{uEP;dbUuH!NK`pdRnzRsmB;d zQavZ=EpOLNYS9Ee#il2TY>ZTkCh3u}Okpz$N;?&d&!0I(v_{ALu@eiRY%EDb)v;k5 z<$HC4K1Wlzlk}dRdLSeQlS-H(FlP`k(1^SERMH?A7@~o20AijzCWx>U`KR%Iyk_kq zOHQf|nYokfA{A_6)CWQS!KVaXYl4>buoEOK4PMSbDh;JgVDdIKWMs?;aD6QJz9tyG zDXJx-nj4I6%7R!tnIeE7MTC*%UxKVNhRH%Yj5P6IL4vlD07$k`wgmLG!KM~)s~VH_ z$P@v>JYx{sxs@?axg>38Oqmgp*1>47WtA4`kp`yNln;`W2e(@N05A@aIsqHN3OC0b z+ z5$#h>lMNN35>lQ?aaTV?(K)GS$bpibWW!lrOo#FciO=Am!(Um|oC1w604HOPL|p+` zci5smfaab+aW5bu2LKPmH5WUrzA)wn8j})9JOCS>&Is}oEqIrIdK$LmT_J8w{35RO zCLt)PyX88uu`%k=5h1m9?734Z;ae0F2c- zuvHtvR29lF-_`G!T|t}KaAywySQr_29$=713ju*;@C&vtGberB4{z0$pmBu`Y46QoCVc5r(!ijQO87z1ge;bs)fGIJmD4x zY2i1-=g_?h2n_GbP!nyjaJg&19O2cLI|*9tEz-m4N5$>J@l84P#qY|pl&7ODCx9dk z23LIwP~R8nu?AV1r)GDfU;!2c76mOtm8Gb(7*b~m$aZPP1d^am%Gter1v$2@LXO#F zyFsylza5Ct4&$kk2!-`;ipP0ad3Fbb7KJwd6%iyPSrzdTAg+PBl5uyUqgY5P*kHFQ zaNn6?eW=ZN{ry_thIxpuqoGc<$ANcca#OT8b^IElF$-tw<(K{~mHO#&fZTsx5Ka9umslY4y|Ck1i?34a{ODNZ;XYRDsEAVayK zTvCic$m(3(2~Vac&B^K9I1mx6h<&Wi%>vGvC43!$8_@WZGl;TV&$+B}RJGYi3 zVgTF7y{%0?5x11v*}1*kJIJM@fZs_jo#le#66twb@qbkki6ceG2iW0FKWCh-6#4@9UVqAv&A5$%W?c~TAQfo~K!kUd=B8)4&- 
zQgM`>N833cA5}pFqhJdrk(0+sivnpo9)X7PFww?^PV)5UNj9GB*j)?R=M==h^vgJI}ZC0y}%`ywFa+ z+If+k7u)$BJ1?>Gy>`CO&P(llznvej({FZOX6NO0USV&*EA9LsTr+ll$j+qC z`7W&2&$4AR(+`j4cu8#K`OILo=9o)*>cUJWPED;o~>N^K$F5iXU zRxb~G%K+?yXUOidrn!qYf+|Z2#!tcCI1T0*%4Uu&EiRF&1tldpGtg(?M8nl%pbCak zNx_7|8Kng!RS|gMa$Dgo~f#r(FCr z?{M)he5UV{oTxb7&Cl9+kBj&6J{#|M@c}+4y&tk60P||L7L_P%xiVJ1#!Sr(FC3ACuIJE}V?MEM1{|bf_@Ol8f(@UK3kglix*IZFyJhnhQ?82V6M9z+^pm(Tv~28|r2>r=_`JcKZF*b01@K@X4vud5GbZNbPg9GA;;l8=3WW44m|9#~P|?iy z`!AS zH?RRWPbGf-I~TXuMfV1S;BZ+Ai%0A|hmZ#p>M}#YF2_a!YhLlZ66U%YPV100Mx8K} zfyr?7s#E>o_E?Kd)Ba8}e97g;I$ke>!G>5NIwmeN1PQMlilLI!Qo@FcyN$# zCA(q8;jlDHTsg1;ZHLKz1mR%>`zqonIVCSJM+`%FFM`T#EE(Ek`g3w%5%k7>B^Q=R zUu+Nt7_i6&B^6_Cpp5{yC7{7OuYd+HU~jku_x5-)OUv-&Bt16l28GHOBevmLOx5jZ zg%@G{bNCRwiFMgMNFN}+kFM{rz#=(RA!d-lddgj9-Ym@I*w z4J!)Gu+PtN)?x&FT5yg)Fb^eWX*k9k0o!U6P(2!^VLt5FF|fMEVjnZUVpf0v;e42Z zM}Q#-e}#O6K_=m^8k*IWpmzZK49ZP&Tx#vEsP!yrm7xqVqOO}1WurYd+FR342KxZR ze#GjT>n84c!CClGs*c@JbgvZ5CPt!kV^j9X___hUY3)wOBuZfL&%n;G)Bx8UJsd|p zPoSQZ}x=NN_US$OxCc#N{#KXOYE~G2ZS$|)zpf8z!c6{78oMD111d%CmICTuqvcyU7 zZiF=+aj(%9gBhYNh_AwlW26}|`<1MVKl-_n8P=i8^%!R(9JQO^OW2I9$fMZIZ-NJ+ z%z(`!f$A~XrM!-J(u6#+E5OY5gDH9EP%1OcMy%T#zH>dC<-UI3_MiMC9Apd|sl_n8*|yZ_h>$PZ@#>cG4$^ zKEHyXfX@wNhzP?+Rq&ep7EB-Sc2=O}Xa!o1`q3h#mk~=UV%3K}-@qMTCq&P(oKA!z z%hCk^z^ACKtewqRG4R?g0Euwj4{ele#w)?a6O!5#U-m*Y#K->SJB99qD%t?g&DH8X zdP`3ebAw(XuSLN=-VPRT^HMtrb9BJx< zy?PvM@`v{7)tmWOUn1K|b&nLSB|vTo(I{<<-$( zkAXD{UO;^UF%7ch)v>-`8%y3CZv|DZcx?)l5Dwr{R$P6w57CPFY6AKbNhVPyA{jeC zF|>r)07U@Ji#RUJE+kDIAlkuD$2BUl@P@+?%G4lJtop8oIXiECVi6>n^4xfBhA|}9 z26F14mwHe-^{Fl=;fo>-DAU8q*v+QmQy>lTDUdV+6C$e9AjrRikQb5A0i^WOp2d4}e8CSJr=_qW+H+^`8ZynDt*k{W9c7$xHHXsOuZ#@Yv|;4XT0bb&P5X z_@6-j_-cen{s>?)1Fl!?qby(s$_^3Ty;9E5ykvK41YNJE#zvwI{K8ETT+}AjBDV}n zs@1r|7uPB;KEe`V^-(+6VyNBTFw{blL0g_?hp7W{5p&lOblS;3KvGWXX@urV8Ah`0 zqt3l#cp$xtPJ>hS;HmQ-07w9NIs8SYRO$`$g*XclMy=p?XpPq`vk|q`20jD%BuhK2 zYI}UNwF4qaI?@Qd-aD1McxXCzrCHolT)OzcN^fj*a^daoM=RJg0OeZ+a+>xF#F++> 
z9}k8W@t+O_@EcMV7==NIvk=0QFEafB@ibL1?KrsE;k^vuysO?kflV33t6@jHb!7E1h#HHL`|wyJ51eLCtDGI zYo@c4ZPg&j&4)IEb;Pv+=X* zu5QdxPU!-TISM_tsxk}2=VxNl{~FvQ;Qk2I2)MnI-QCMA^d99FdQW5FZ5D~fZymdy z64(Qa*EH925N_F9--R`|i}97iw5!}|McPW*9RQ0o)9y7!n<(qpw&57VG5EFde+~3IQfl>kT0$ z3zUd=a1kzxLCf$IO98QZJGdF$^5uvR5rb^UI0uPm{}2txNyX#9JUs;ugHF=mJ;nwDCail16h0`*BX(== zqm-A|5Zj`mCk*K1z}kQgz!`fZUP>W+Kb&%*TR6a~}7gjz@a!;;&nBtyV z`B3C8u6&s0o?iJ-;-2BDd@6O%tbCZ|o?ZDc$33_5;V$>Q%7?q%^D7?~xU2NG(7mX# zWU>36%7-QHdn+G2_m$TZ85)Vjlka>uFoIQeQ6UdG4ipAocWCwQXUQAZnfk~8tn z4v4QVh}bgXwVV>fg{~2F6-3!~iP;JwymlWXfm`^jZXnFIK61&GOJBM4iH+6N3o~ovd)%&TUA^0+pIDDGi0WG|cI-E~%=9qpOzgrX8_VREk z@ATGD>d{CJb{K;Uw$TtX&?`^QyvM0}1-e^ew&F6v@95kgx)m?MV}o6Ip<+1Ny0GmZ z4hN0+D{=AX10}XFaLTPcrSVUQs7F6|| zi6hMj_$r4!!3Y_X*C2*V?_(vtt0BHxLuvTTC^dC~UYff=Ao%=#xl=6o_G$bN@mF%j z9qVnI82KMTShvStO^C*a+PshHncfS9ff9rj=`nOxG$>hK8`EUEpfzzWVfMJz#MA4L z*CGRkz#~*12b7a)L;|hHduyVQ@cOFlAlY|PMyiAZS#LR9s!1`1NLeN%7lG_!@xKIOl0o;hM<0JJOMP+ zsta&kPa^bg3gW*9`U107klhj7!&q5;J69_s#=9@0@NTF}!N7YXqx@9Z{kU^?Fk zPV)cKne=4_1_)q1plz(UyJTuL6OT}&^*?$`ll3P*I$ag$G>;htFD}Te7Fhjqcx4C> z=zj+{3N{GZRGoXX{{!6cRz`!?x1TD$eOTXT{BbVd14s3n=J$&pIEI50<3l!!UeIr8 z#+O=#u8+O;9l-oft$bO}@IKqnp_x#iN{5`$Bku851O{KoC4>F$QZ=I=S@B1hH?)!RQBU}DjmrDP@ zKP&k|S8C-4{wDI*p49gr_>s}(LqD2Ve1rtv>f7M*j4LTKl=)-5B6XdA9NQD>A?Eqr&@z-rFeu;tT%-qPx57aHRmfocF%sL_cF5OGwiD7*Z%J0!{+Ml zul0PROgGDIgb~*^rwEQ7FIU z+j^U?KK((@G~m4L%Wt<;`s!9WoC)$FICEN_qKU1}LG#P+b~OAMt`_~IC#rT4h*KQ- zgYGiAf9TJt{l)PjzlOJU9}_M0lT^d2dYDmFK0$_P2YEXHDy#C}jzxWAjH>c!9kuGJ ze|GYLu-~yglAlCBmH*qZ=sR`ls@~o}fqXA451`7gad{2gcPvU%i9hM>j7qnCH|(}b z@2Z7A=^czp6<+{@#en}S{_R+_Onv&3UryKf8EbJ6e>CDJ-Lz*@bMIK;+g_!p=tKbe zu1YhTKe%4K`J*0gAV;ENPhk@Bldz=ne>)aUnNT^L1U%MO&;RU4{%w)5NUCz-xYbO1OqOoSj>x{HJ2XLrnZyH~&1{bSY0-i&J-B$ncGnMrNOaom`)ME`P z)BabWEc3isW8rgpq`LoCKejhZ(bYFAio(>1qq<#vix3oJ#9fPiyYe)gLGrs@B3!*# z(I`}%e-6`X_nV(d9cxc*KL)mtU;W#_{OWQ=qY!oJ7k~WBQm$ARZgXFCRsvrVTYG}0XnAt2M`w; A&Hw-a delta 25923 
zcmaKV2VfM%_y2pl*K=t{j^siD2`!{RLJK4idWX;R_JftHe`>UF)GVg@IvAJHK2-HFp;>}U^>MGU<ZZ8Eyb>f6vrJlFre0jPKILnCL#u$S~;3D>tLr;MzI z{;G-45$y5x=^O6rGtG6dYi=W>#fC9;jKtOPx{Vh-1FA=W_xL}P#FrMvc5Iv7Kf)$F{((Vs1lW?6ska} zz8dd+rCPerS5_^1RLpL)rS*9ef9z{>F%8#Q7aQ8YKqp_KldpX5Uu@0a_)1(FZ2yEp z=u?WJk9~_SrKqN$bt^NY$WFs!36l;k)y7Q_k{bT$$x* zK@Mt3PHIgF)CM56rBv!b8PxIMM^~D%H-~iU1w!hLYW*bv4fi4qouB z|25Lr@7EgUWq4YSr>5rBsJ+H_^VhNN%@j;q$c?F#rdKJMwow}G@QwMc8}IkM|66T- z!)L$Up5OBIxZRZB_ier1!~7u#?jxW1PEY>S*Z)qI`AhWmmG9L%^%Rx7{{8D&2XEae zV~xKDxUT{3+kpE|zsrlbJQ&+WT)Ln#sx*va^j!v7Hh) zh^n!Z8gMYRpfJ&g42 zd6;Rgj;C}yH8s~k?Yh2nk3;eb2qLiMw+>?xiXfmitw9xVsLdbQM%%yXvz?o79QiS4OC^J zO&#)p58|jEm8bgRl!d2C3i4-J`I=LvMEMA=syk%nEE9!^-5uD$Ok-|F!)w*;H%*OZD_dz1MfuDUE*>>zU?`HP7k8fLy$3tu?%_O z13&elH^kSCxSE=8vBo^f5b5_DRSP{}CX&CvG}m`yr`Yh9Zt=g@g`MX3Dj2RRC^H?@ zSQ8SdHvVN`sWYiQ)uP6fCGPd$vE>?KPK|(r#>A9JL>eVMQGZedA4&5b@GtGjP0d_J zeAb`5!6t`4&Vmi zEdWzXWVOP4TSFYS5u*oioe()xN~vz2oU;&}vxwi4^(e8g3){r;fn3c0+W@{|&Pc() z712Bu1E-;BWvpZsq^e^0YM`L%7(5+Ftp7#YYFpk-WFFMz%&5d4KdYJ zHPy6drYg=4=NgK)E42Qyo!>EnJ8S0hKx~2-HI^gA^jxlNP67RlJs`zH*Gb+S? zbqUWjCpN{*nqkeGgRmteErIY>Ak%C>(;7m#4KUCatKCk__wg$3B-*WHx2U>;D{wc_ zV+H2ZPpnzNwRwvJ>DX0B&ot8AR`H37 z&jF^nfN36;gm$b1?N}Syu{oHqGc5!oFM`TgjM62vhL(!Pf`>-U2HqPhyf=9k?`H&8 zRr7BD{txMXewk}&W-pm|mZ-mm(^bihZu>O9cMWqiXNcK{xC3X&Tb%go5Lf2<{)EH) z0{83)v~&Vcok5sgC?3ct57bnqo{$2)Aa(jeeDtFi0p4;b_*cLuCm>@^0ZkWxrk{YO zp8@t=8pDw^mOV6q6KEngfGTK2Q-#yd>*8lgO2gc%f`zIMAnYBXL0M6N}5>%bqch@hj~ zj<<=PN4Y8Q5~bb%KHoTs<=r4 z1K0izg8Ks`{3rDOUy!i({FjfhgX=)E+E}AV)}c>L(5GNawonbWQcLLAw$QI#p6T1hGfMH>x8TZ-eUEYe93d0(F3*_8#nK98lwJdT?2NNC$!8p5LhaGTOi|(H8d66<$eK zS@>_g&B<(DOBP-yV$X0Z^EN=YUF4ntW$zVlp5cz1EF#ZxHKkSeG+*L<0jfs+%1HA~ zw4JR5`KO=dL3(fn5L6=QD+z?3EPi{B+j3>m?tM^Ar}Lb`)%|nM^GmEmU*rco%@BPv z4<5K7)?DOv!B#rx0P^=q+wYM0`y$saq0^fVdW+r;qLcKFjn2sOvo?AcchTk&_hH?? 
z?-GB=V#PtgTCg=w(x#nev+B$ zj(pMLN3Q0~Cq2hcro&`TvWta3a_QI%OsO)aR0TXz4WhR?giJapG7}bEEphHg?r5(I zs;&nbt}n`8<7{py#$V$MZY1_x!@^p`m)AHm)0{`HBUJ1tMd;))yWB~K2wBCS%8E3y zqMpj~Aj=|JU+3sr+0b!P>bHXyYY)}h2@<$7gh>|&iEcCtq9qqIsRl?0ygv%L^6xpC zf|+f(&IMTm!BK-@Zw&=U4FmkcAtZ7ktVe;HMniN@KzEa%EG9!;Ore?BZp;U7twCxV z%@+T;!LL{>j=qNh&zzrl6r08GKXIGbrP$0Y$KWeq1PKV9H85q?!ftsLn}%(o*G;a$ z1N^IQawRV3#{wP&m*-=xjzedhz+fja%qhs`)7XQ&MPJYv@zc-zDEJ~^y#yQmGGP2z z{Qe93%sL&QzGC4mj#1RJ^A=xKTGhY!4urIl%N-wo*WI7;2iu&U#*o-eU{yDz5ZJxN zVE4ws?k&qLMRoxTcLxwS>^`uj2dr6wyHE^w755AHNNg_@^+r)2Sjv52W%q*&=?_bL z0A$F(l^3*aJX}m*E!#c{Y&;r6j1lKqYsnKtq^4!_G%--q8bMp^(zK@Ld1U41#a&HX z6uXduz=mO9!+2hb{$8S*V8slEVaPtQUe`K7nf$41osDh!<7#g|G!e;PUCCtr(I%~w z3Cy(2rlsL_+NRav%B$?!NP875lr+66i-~s4*tOPMXfpYC*|l2^boz1xT4acG5n64~ zq{Xc*hA9yQ@^M0eSAYUfq-?RptqtM!!dYDFWupkuaiSshgNxTpPhX z#jnNDO<$25sZ9zV2-|BA)Z0K<`h&&ZNbUXLOw_Lh6=-N~@-6d04!z;H8R&&`Hq zItM^M2h}+bbpJd!cL@~bQmE0FK)B0L+oy8?p{c?A_8mUdFw-fbtquw6e z_k&;ufj>rLnvGz=3`b8Uv7oZ{GjA0Bs#+{xydkz$)Ap2v_R^raJkVTm&|GDq zxsq{D!#$nuiqX}zHFgc8q+@4i@)t`7IkmHs23-{lT@?Xc6%Ab#2R)U*E@+ctRF5O5 z1-q#;7pHD85aw_c#JY!$b2Odj82XIk=vyvLKX4hk&gJN5F7Mx4Lo3UiBtEXGy=G4Z zX{XURt}0&0&?ZQhT9v8QR z8(S_LHCjX0w}A+43tDRjlIj2z(h-(OCv?>rhHY2zYJF`k_Yg@9v<^H#%xs{wwpNof zYC#lcVM=u%;OkRU7(K0FKlev!1Z>?&up(!G>eho*wt!V$2Y}xJz+b4nINDk}9G(N_ z>xMD9gN$-ev8V91(Po4X1oaMvyc>dL8VaNj6Txk@HZW{^wgvl46l>aoeane2+G=Ih znrBxGn(#48xYS;=`x~~?KC}nF4nElf*|ZljWS@xduJsB(2$6CKM#Nz% z$$3(-)eFzSGPivbXk0?0P`D0dSlJ z4DUh=pF_9rQAIva>HNO1_X$X|;D&t?{aJmqkDTFUfz)^)wHzc*c_^|35jRq68=MUN zoC1@hBC;xp8OVZ&yf9Mh7M=lAXQCtvR8R*^>xztAZ6Z|kzFcjkrzyH>hLRSb=a!(d zY@oI^v{_rxYm~MT$aMg@A;^e^#3@U?MCfR3wYxuB4FEL{1f32B?uS794im>mYkNHt zKtJv)2bU zSOL;qg((Si{W3sY4dPmZLDxcMuS3tTkQX$$7=Aqg*|r(~wu3%*L+$24UB88=3y^mo z!cX@#Jbl+tavQXJkM_VO-V2*|Kjg##7_|qvEFI=bl*hHm4~==0n_DQKJJ2!VAFoa0 zQ=;kwZDaL!z%*w->1VON?}D7)gWfm~(B6lvzW{c*h;{oAAbyOU%_m@<&#)$63fn~O zor>R}>3`7lDw=+arr)FK4`})$nqEiK8)$kHO>d#;ud60%4Mgop+PXUQ814SSDiwe> z2r`gC!5V181l?(cd25I1>j14nO+h{mfqWbd`8WY`aVo6M;*+&m!6iVoF|f;H(Y%z{ 
zGFe*)&v7Y8#5!D7q)*XWz^t7;MeAr!Lq(W)*!q4mMcW&W{WsUdV@3c=^pZRsCizTk z3}%6@=Rohz6SJpllkAJo!(#NXMDPr4rhNq-SK@J%d`uMkXK3kMQrwuKr7F>O=2ZFn z{!%lw9VVVF-kqcMRGK`aw?h7gLsLO|UONwM?sL9tQJkxG*o*8gj0``Et$0jVHhsm<*Pi z3PPO*Q9m6<=nOQS4f}Bcauy=B2>%v?Y?pvMm%^ZX5l)BYFz;Rhy)1@3x5~fqCC$#} zH6VwzVvkQ-8xQWMjS&5t!C_mV>9>MOUxn=51_iJkgtSWxTcO>oxCb1!5A3oZM05aL zcL>aQ7#boEI@AwweiU5$2JDr5uL7r`Bs(X6424wVN3!JR7u1P>#(uKu*^b^EbdOGQ5*vfKYlT%RpOHpa$a6 z25qUmF~DyEc$x{{M(wi4@QGvqsK!ug&7m{efXdrLXLvVhBNb>J>N39(ire%fKqj z#Xqw6+%_%EyaLZF=4eQ z7;(2~epI_%{V@Sezh zLpx*s2u&}GjC_#gr(#yV)&M?t2Yn^Z=4-E4zY4nf7X5w)bK!f)&mXY(*Py{~fHZ%C z$#4^f>(7uVzd%Uef-B`V?8-Z0=P|7y<4-jG3r+7rg5N{)htL|208#L~9l{119e7`uCbQN5?QzYuXK!jHxq4?b z?*fJ174^DN3U`OV>_Hu{sp)}D%^+-QW@1D091oyHa6fE?`(ZnLa7k}#JM6gta}56Kb{GKsuXC#sQr75btbWV#BSKiS=y~XVmPs<#z{Fghx z@;;eb5rosIRY5@P%kr|?N^4}TwGQB5odaTGy{xrCUN*|dO+k_8Z zej6RJ(oqM9GGA6ZW~JjcIw6xM9S~QiW$I1kbdafc98mFR93ankWf$jcRL>^QuFli@ zvd#zca=}IsO>A^g*19AwA6n@nE6Dd`BQ3@seO6n?qS$2+#V7A-;~a;q^s$vbvC^mh z_UAw(V#fv!5!M*6{ODqW-N zR=U9y=Zg=G67@b{o68B_xoM?eoY<`V<^ez=oeY+?bCw>NZd(gtT1PWnXB|d#A#2 zHyy%g2D}L~u{d4`uvzeO&V~S+Lj!rPYTgW1XHzKgNzkkobYh1(?|{4vsM9H!jTzIg zi7i(r4B^HQQ=v|E3^X_}piE7J&SR)p!jw2rt9vO_>w&Cr(jQY9EA_Hcp9fUq0Y;EO zP8Wy1)Dk=ko|=QLXbv-Irug$q?N*Kt`;`@to-1J;t)f~i;Gun)-sUy*9 zJ&MPncU2-Tw(lrO?d&=9jphHZ>_iOjFH5>1}N@c=6*aq9Sp zq5xt6On$4CdRl27o~-mdjB*&3eF%zR4(UZD1bWsWabBPWV(r&jIrAp?eKs3Zun;hC zm`$2J;`-NG8FO_2xI~0r(VEn(2^EtC0;r4DOR+o}AghbJ3n?4qNFA)ttI$!m>Uo|y zfbcK5qM1YFhFl(GmhoMUwMQ3)pvZ%<+w<~l8L;oL!b?q|@#% z{)_gCBXCxA+S{?Q}%NPB3&jW3!Zlf4#6H=?w$04=~afntTAfD}%%re`~MW zN27}|=yjZ!e_!impA3s-3M`hX{$KBFi9B=$42GGYATOxzIdIrqU~V4P=Xp@xe2nn| z^!EbvyAT|<7#%GEJuXF!7m@Q43=khI#1+`ttb{jb6=Db@S2I-d@w?U(}pw@OUf7`>->IfWk!WuwXV=c>puTmgFQn9Lav8wH0T=#$l zEIri2V8x6Ck4~YXA|zNBJW6Z}*8B2!VGGeGxg|5q#&~l;rgLFv&4X35K&%MS2iTWj zoTbQqQE;ffqM{GTU4yK(sI?AJ3F|S74H$9@mVYbAbsM}hJ7Bc#1faXbo1yx;@V)5# z01U5vz}S9ll@E$hVR}a#_PiaYf6@F6F~heov$rva zGnm6!P|ADY-17kNePHqeFn1B4oWc(FLvbx!e^BmYtmh{H_fvS7J_F%>4*0$V!F~mk 
z?rR92D}e1AVJ@a$DEBQ^^E<%yJt*r3Yzuw_PhAH>Zh*Fb!gOw86@M0g7t@c~e?u>~ zf&4q-c!b_aA=u>B<4h`*;L)qGB8cr*uGDUdn3SdtvCj^x=2S^N#{WvRKHOTaHjJVy z_&@4kM_8AlxE_ve>Qfn<##Mk_R*@S~Rc`FBSXy7p&2d1=Eszg?0_?MXoDI3s8qT>k z^g8S`KewmT+=(v1M*9Rd+V`A8eU9+@ zW+t2q@;@r8|D$mm@i0NJqjJ}NTKcZPeg!>L=PsgMvYv(8i^+O>RrrDV+8h3#lJ#%& zU@36wVzKIBed~*lQ}uztFe%}-3df>)sEKHjrZ3>;;!>JE4R&X%%6c;u-8FaVn(FtG ziSCI_mGxTQO%Aa7Lpv^yWa_a4nEgjS7FZeXDQ3+9eWW9YO{l?6X;ZrNdRD|;gypj|m2N6CvvUZUlt zguKMaORQ{&(MobDScV)24-m(r8JD(l8CjvMgDP-2urHUla)KP8f&-RvqRdH>jgzgM z;s7E2g9}%*aV2>NJbl@lDz9m>x61NT#mZG>|JCG3)#W8!UTVm^nlhE4dkYE*WTQ+g z*K$xpt}U~(>`2v-m%8#&PltYz`c_u;aD5v$koSf*Ze-)eGTB4|-qZm>%gto6xt08I zdT|R0c}oWlB(vpXYb&>LP!6}1Bes*5_I4~p2OD>Ea3}68SEY-UyULOrdFduE-DQ&= zR_+N1UTT5MEKo*Y?q%iPRvfWObHmv@H+SIJhW;y6^>`iozJhf9JhaaF8v5q2u2${~ z>|1FkEVBxUoqd}sc*Lrj`p|GUFo%M{1IIVd9Y2vlc;OZ;L9$ka2(y^XqHTuWJ+u~9 z1~4HgYRrUzW2g8^bjlQeX6OTP;_7v3Ns1H2#!P*IsSRKh6>I7LM1DXQoG?fxA=HZ& zAi|4IIbk)86K!hiRq9x|pOgFZ0GWTli4%eeP6)44Rvzf&K|I*XL!3O6hgo^JlSlAK zC+G4gCy(YaP9DqS#D|@==qOag<9MFngfHi`Y=^P7Vw)??U0qr(oTV493R}bpmmkU| z%gYpbnJO>SX_A#EJ9!FE1sFWd$!7EDgg}D? zJuSis-`#66bG8!>xjE7rKPB3|#?f&SsOOkWmB#lQDyM}B?xxqBJeTJ=`FWl%X8ox} zlyM>k0q`&2g-%|?i=DhgJ}#xVHjHB$m`2h*EdpO5K4j zbq6)6J20p2z%lNu%uR-dm3_>CD#TQgcF57y9TUb~x=`jz%h-(~&?Fiz6j@*C)e{$? 
z)PhHXYF~^i+y%Inl`Dd~OkuT>2JP9Bi3jtlMEQM8SND!FoIyH5R7R3e-WDFrLN%s#Qpv za36%jyKiMdFAvf!P+3Om<;+12J=UO$x6TtSsI9H1T+nS z+*%Ao9RQj>2b$!x>tN{#Mg+(hcbR$xDT$0PVg78u_!{bNM4k@&Wgl#jNtlNjW(cSs z_hqOVPflTOu7`QIK)(@rY|*x8dkK39OcnwG9Y|DlR9z7^7sqU^l9Kbt*#&3{K0qOP z6bc{MZqzZb6rz@7K8AS#q_W^C>_ou=+m3oB=;$4=!k)&CopyO)@fw453Jm56v;EJ} z!e9T_XiG4arC7cfpBl{*7|r}YM}tfW47LP!iNq9dKDiJbQLGiHE+QvuEftFib8MjC zk-m>}R$?@3yR4&twxQ$qk-EBESQ=fG6ToZ3E*pnm+ zP8_EShbd7{H@Qvfc{80?BiQhi=r-k3g&Z(pQU~L)N)h16db-(dcH1^nchvVrrQ2n* zrU%Akcez{3O{ zGlRQAu_A_`7GyVefv-~^P{JT^=5Xk`F~Ih0D9IOa z1h5=O0I$HiyaPu7emdgCX5%QffCsRtIF22|32X&U!I5G+lu61P0X~U5KIlt<}B%EybzNmcbFG|N|NtqhM<$-pF^W?Ja-z#vsHX(@~)t%LSz zNU>o^Nke0P`&3W1|LRGTW(?X(`Rnb8h7y1nbQKE`RC6`C0?CY1l$l4hc56}y8bG%9Hr$*U&9Y|P6%;Ye;8)V!#C*8D;>~52W0PMLs$AVN!ZKQH3NyyqjA<2-e zCLnG**^vaEfSJS8WILaeHRCo?|CE&fqb4xg?JzYxOwCZUq|4iU=4uK{$vH$#O?{BR zor2{TISKP*LFFmSGuNsb(N8%C^_zy^>!0K!;eNt888yg@ZJS_}Rftqfu;KL+nOwRo7kQaFTaC6(!GB9|t;gaIQMLzYA=#uM)QR4J8W1rYZ| ztg78=?f*aP7>#AOWyk^X zh4IhhJWMJ+DXh*D&Yr4r*aI>{I|l!2WQuN#DvziXu8$G2!BNH_crDA2gho#&1px&{ z`kTx)8&(i|*MQ9V(J()?j$n<%Q6;Km+73`Vs0VutEWsnxK0VlDgNbxvJynT0KpmzU zPn#%0C5$K2fYj|2qSC5PY7NVFve;`_z_`wIhuh%}-b_idJQ?ijsHDtGD7TIxyt7^k zk?EB?>&3kPD$B_vrJ;WX2F~eZ1RNUe*yF;oRqE1IhUrkXSliNAh$;vQPC4TDM zVJ=LOf=hD|2wt$=W76HuBhZ|h!of2(rbAE{*=G$_AzZPxaUp(5)eNApc2!M*H`l1|~-h0bs(yrg^*uKwXhu zv4)DPqCjgo`gW2)KAro2!nn%ynuk|1$gUc5x>ie`Ez)NZXoYhUdDgI(esdfnweK*MP(o1&OzA+Y!m=l=@aB-wp;2Th0HlE)(d^m60{AfkP@?SFUNK|} zgx6C5{g39s|J}9nSfFuTvn|bRu3Fmsh(gga*<4JT-SDOv!izd%raD$6LOL$GXD6lT zl~PQVQp}a|sE4Fm!z5QWGtOfz@X$$eFm~?|g=&Q&$S^uBSo)oI%2bEnE2@w>;;nXt|eo)iT#udhI=gUz`@N< zxXO1S8gDmZ0bj?N`5tP=`>7+gdY$-ynwvDJ6QL@kqSW9;Du(UkPv|%n5k@yb)eMhQ zVQD8!>Sx3nVWF{-x0KsZ+S7LXD7-VOqU;ido11ruPQ5OzfT;P%BOHXiisUpK+8>x3mp{#Q_LH{jfBQZ zT01#34y^AO?nz{325@3283%DIw?|L!z7mUil^)FB!S(Xbg8c~mmFVYB>`06$q~b28 zZwBmYmt}NQ#@NLf{TTZ}Gx~uC60PrIA0vHz1wHggWw1S6+1QWCK@E4M98{VtHf($f z*DXyAiN0Ke4^lhVkb^WdEaV_{kb6>Az9+-gRs+;s6T6TMOzu9)G;Eqi;+};3;qb8n 
zm|74TeGy#C*r@lVQK(V@_t99L6x_#<>6t_N254OL7b#Ak)uYADp1MPwx!G$t^8GUh z>jU)Q`53_P@r7cZzYWv7*q5Tq7qK~6CfW?w>#3-!7GECf)GnAv{t}GNW{uPxqCzh{ z)VtXM=SVj@JY(JErH2ETo;K3OCOxnP(Lg6cR)sVv(W%~A1F%(;e{XhFhpL4 z%7eP$W}HQ-KUqG)MkD1t*MSL-#(BMb80&yrbsXMpmW?LZ%Fsj`O_BwZ9q_zPwbC?s zpKhfYR+B)x#O(#oamdYU$B9I1~QduCBnOLthJUJVTPDJRF6-tNOKXwKq*eJ zWReq}p>{ZpXx6xXBYE2Jyp!hB3s!jww2&5wO{4Vkw#Br>2^xJ-TpXpxS6xiY;WBFn4^#Rs(Yfk00WMe-iOy!YTdIzKw11V%r70=A=+@+-fRT zS)Ok+bK)R*EjFS3ri>prF3Sl9SqH!?;zXo<-`v3iop4F5$96&jxlt?{t(Vf<(k5}~ zK9?5lF5^9B)!l?EH~pQV;YVN@Csh!%l z5|;6Lwzn*3fDwHuLr1UzAU#bk2kz1atq7-@@fLGinI4u%?0pwNHjv5XHv!FZ}EeWO8$JEkF^-K?lm0GCZ5Yb3BSRiU& z0mV>{yx6WZECBCe>!dQI;BJK0jq#I+Xo6o;jIS0AfjDM0VllO3`4p9tl#Gdc{1m+p z4K#RF3lY{4RXaU}>|y~R%}_7R6_9cWZ{Zj%(6=#$^rl!*lqJW=G#4VF1Qxzj9>uue z>2k&+fkTc=Y%N?R^^gJppGm_=9 z=xSLEK!&4_5inhHVSbG+!X~2G0*fNg=L}*kjlPYAlGwxxS`m+=#Z~Vq5Ezj?9gJn` z1kZz}rd0ob5%fl~nxsE4tC0F)cEd$g_z=50(^Q!Dm`(!m8Z3_l95pi>L--p#5a zaH`MNcN$+q_;!&#(fFE2H=*}8tI|{J^b4%iq;p=ky1`{}8iZe>p(aC>-p3Dh08GtgG6XgW~D2bdbF9<buOLL&lFE zEjDb@<9!XSjeIF3A`B(12qAN`{#w+yXN?QQN|0&TsG+&O`W<70XN!K_J^@%}>QcL- zD78&5A-dXhmn;bNH{Pn}u?bNsy~WiXdbHoOP0wdrIdm|7?9h>7zfCVLe&4QF;>#j_ z2a00Fl^yzDrkYqi@##+en8}1?6JgEpp6J`J=@D^r5M@th32~+qJN4VJ8sx*cO!PPi znWD{Q*p12{u#V9~5eG5{0VB9fanb^O?4tN?J-OH%o@V9ga%VpilF*5r{VWv*5^0r@ zDYFq7g8e!v*mi zi0yE~UAYoZ)4=s92rya2LWa3Sc;P8Pl*UTD7e)wPLE@MbM>l6xxQ3Gv)h<)7s9+7* z<3^d<8Y1l&v2pBOUsj+R6CM=%p*qAMZ@SKbE>&TUR z^2SLn7&p=tmgK^T1QO09X&S+Ghj%?KIr(Ij=fgO7#-`B$jpxp%AerOL&ZpoU94Unq zYQ1^T2{84fo^3%Tpx`S*t!6^GHiY7o&?G_ARKlq8MJ`5(U7cx4wb2;6mSwm%1awB} zay;QH3?)kG3~R$cS%Frtqyw|~8oilN(meSnd|l*SO%_+!K0Jgo0X|nw%25PW z!e6(B&|!7|Jrf6b+dj|s~7T!fqeRB2(agYB*(D5TIDmbEq%)!K+! 
za>@usa6YC;D{ZpU=28Iy*a8@gP!Nf^(WoldEJ8*Ez!GbfjMIfc+uCH|&pG zOkxyFu+exPqrf$6@dz9QuSP8!rnU~pAZ!eS0}PgXdx`2L z5{x8#dV$IBrIM{A90AC(frwH}tt$rBy#^A%#wm)+G^H5P9`PO!U1?VtU?9n3I!a|V zAgeU+aFohv1f-k^)N_?b6<0zEa8`kF1fO`6k~DOM;o!lEbOCpfdETe-ha&)hHvyBM zh{;dJk-!uLO;5!+#tgL((ju&mZI28^k?;LxAvR>K;)x}Q37CeqAt2~|hznq}9fza; z{g}E$prm7?lVS#s2?u%bfJGXi8P_CU6f@{gF~dr9xeDXFj04-% zu>ID+@>&ZvUk%Io6~G4im!7R?n2N{Yww2_ORRpGtE|6f#ypt%EGnMh&C$Q#jm~)ar zqqz5|(a7Q5K<*e<^h+W7z*zDx4x z{X>YRk06RJV~_C}taKS__7%AKYY^lWc+0P<#pr>_?#1t2EQSFsBr`R{)BDQP0-RWAhcW18^2=qx7CD< zy<8ovfx!TEun=c3f*Je$kFXdTTGvp3>S$Y1(U?J5p=5%q6z{Y#Jf~(Z23EttVq-SLBP+A3gDzqbBLK39eL@v2AhQ4#Ud_b37_J!{ zD~GX3MW|KBv2tCuQ8qhpqT{5&984o|ggS}CajN5@O&m$vI12CMd+_dki6SD{1N6T; zI6wtQim2d7fC_Yk6G@EAedR%gaUBizsf9Gb*4P0WlT;7|D*IefL3P+3Plx~?O_iq% zSYAm4qSPszl*$3#GkLAHJ?Y{cnouZ!D^R5cQ?> zij*&bxuC%xksS`q|2(3SxIVzBhRnYT8vB;MLp!NP@u45E6Vb8*R0}Hu{dbCLw@N-s z(~?|Sh;CR-l~tl{c8Xr*2-T~TVv@IXQ;J#L;V`M46YDm)&3>v6^9;2cDC;dW3AE)! z992W);=H#J=(KTQfW+*SGYAWjWf;kNfSReMB9AD)`*1Arubp6~`4?AaO? 
z4RHXbNm3(k}ecCPI;vGsR^U#nj*ZBSh?#U zU!17HAl?fp!LTGv^1V_8>@plpNQoe^;33N+YKHayB_hh|kRAsHUCPTXh%uP_$dNjC zBi3J%Wi83Z2x_6KNjl8ms$pSAsCn%qGeSR%bXJlz48q-bs3TV$N3U4(ef&g#DU$;m z2s?MhE0Z}C@8xcI*Sb3hUVc-^upHc&t(+rc-tx^N$#9a8KebEXb9?NC5p3A+**NNNt?!m5J#2Qnw8^jo z!Lwn^=TV#8fG7d$UR626<|%05sN?}aGQPpYgYf;9Awcv9#5xSccVvg-gli<^P%d6; zA6+Dus{v5F&9 zOTHcX0nl_`;_89+08Zp5w!aN=hHAS5xW=jOP%EF>=7bIKQ@evyE1%j!DR($Xjf#=w zAbC52@Lio;oej#C4_#bcpFHHax;=U5?&|U60jGyg9(ud_JbCEr>i6WKziYsghk>p^ zPaX!lh8!iY}mP}i_0MZ;Yqo;-|nH#73p?4Z!h({PmqsPy<+!}ayjvO`=Hl6&*Fx*jN2dDP) z_qeF9)0C_2;fMNTHd2@2*HEcHKGzF0^&VJgY?Vr@AaGWRg0J*+YlKzq`dp&+*Lov- zIC1vZ`T)_hj~*);UC~!plP?1qyFr_YAqzqPQ&A4mh|>mqVb8!P%6y||s6l&wqi0)m zd~aRs{Xwr;^u=}Y+YfrCtRPZ;)YV6&MXw+A>MCc|k9w9$eEOr_MJ3X%>4Q{a^)%`N^-U<70PvW0k$_ranw1iv1i&%=$Ud;>OQPddlR^0zP04^p_LX5nxS5Rg0tFPjf`+5)cB){}3_(5QG1$a$!@R^_`M24*2YODL?5^ggP~d-$@S81+aWsGj79`o!0d^ls`& zekD)Tc^p{c<&X7NDqDV#P5k&c(4G9mnP~8j-dB~$4~L1@{?U7>r(k?3ON1BbIqFG% zF-VLp(1)rg`E4NaeStn$J;_hwtZZ*Gn#qsRtUO}sq2P{WQwvo^ejr8+Wm9j({zaeu z5TCHg;DbcqS6h5V%w|)Hxb+EJ#lT#OKY3$Ys_7?xTW7a_Yz!h_;$CesssDk1Sfpe4 z1^*Nd?-D2f0dzKeVnX#M-|Wthz-|h^nol~Tt+RhGC~OlasulLu+=Sk6YRgtrgjiM* zhueLM!*N_R+8$Y6-eZ)jcj|h1bgOs(;MHo%w~IxH*;HKX8C_H#m8%y7qg1(k%~xEK<+Un2RgRY^BUQP4(^sTgQ2ssM^_M^@AVyhC9yQ&f*Laf& zN4rn{w$5Hv*{F|66%!XNCRZuB*iULmd%5s0jBeHcbd}XqUiB#>9Ztg_lE2KtD!6Z* z9n?$qsiFWJ0*B}2RQ+cH9iFLQw%=3rhgbs`%(P)m-xy^SjkcMh6oD?1xsj8M+-3XV zto_^;XmCpwy*1k?x?PQjceC(+#YtWm$^H@ZFe8O9YSoEUm>pnU!}db#3lTbkt-2~Nz>bUpzu KfqWqC - + diff --git a/settings/repository/org.broadinstitute/variant-1.88.1401.jar b/settings/repository/org.broadinstitute/variant-1.90.1442.jar similarity index 96% rename from settings/repository/org.broadinstitute/variant-1.88.1401.jar rename to settings/repository/org.broadinstitute/variant-1.90.1442.jar index 688c6ae17c73fdb788c2c83f32a347bbcfd3870b..cf06f592eea3a230403c301e1d09305cdb76ebe9 100644 GIT binary patch delta 3114 zcmYL~c~DeG9>?d+>o?up4CsJiQAorLy2B9>A&@}^g>Z<29Fap{KtOO94iObRz@#>t 
z9oR^~JV-QdHdR4m+^EDniLA+nC|af@?5@PJQd@Y%in~Fn$hyQ-+}FSEsr>P3KK1<_ z{p(-K+~Z@myyHLj*tjO(|Vx`3Xn&tgf-xWss-OLpa7>Jgn(Pn zF`*s`R-B&*PvXGZeP0+_3f7dTv>|M*>R@eTIB5N;+7nQ|l9{EQW%a-DR_FMv+WlGi z#F!K>X{k1ir7{)7tuh{=H9}9}Vk>AB)K> zdzvD)#DjM3fjGdlqKBe|_4n}q#5IiW{wyM()6*!CNhS#mAvmb5Q@)02QKOlbXMPq} zE79)gi3ow>J>JCx*EvVE3eDB*MglnoF{c?Zxm8VOIUYTq~H(9{hPL$22#Iqa4c|wx3 z0p!DGsBjn^cLU9B6q)7$#eaiD7MpYfo$&?jrx{e>2b7DXT$b;sj}*){b;)0!onzL)Wzu;g#5UwfBS}CJebQaFTHsOXBx}Q0A4obDarH0KkF2YgPf6EV z<|AjMGDhy_r2UKzjk+S%nJDL9qA>O*g7p$K?_cw(H+i)fTDG}Pgy`fj?QAB znp{$&dmfAfHM-Ssv6U%@?#n9B2LGu$$Qqh@M`vYC9=NZIWhZ*MBtHZDX*BdR$+i!1 zSg+5slH&eY7r=VI*bBeNif+*30#@|J04y$oZ^_94K1!QgO$A9w%G7n-u335Gi zaPg!;eKQPLj`YIDH{v|r499NrN88oL{^>&;g28Jd$KO1o_I?J*KGnk0%7Y<1pPd^S z#>s3-vMyjGfVcF$df(N5A(1fejxqc!Yy1b}_#@Wkk_p_ahqJZcOH2VKcS!c5{^|9*0d znZW;*e9BBB>BA)7HWM?=B8MfBQ~*a3$pq6#5-9-)T16@WI#-bffFDYLDFry0 z%$W-Sd0tYS9$58GJuw89eH*83Z0V z*)vn!A@!-xB>7O!`AlL12+tyL0py-6QUgAZvv{X$Yk8?x){-tTZOkSPfGgSDIc^=P z2j$2*(hji1O11#>SxE)JoR#bVXvraM05duK3@X>F&h!W;)~lH*SaNxzhI7^aP$evn zG(uE=9@z~Ln9s|2Enm$;_w_iR`#1_X+$i9t+?P0BiRn%SIV2vT}t+V;$6l&)>TGc0mZ*u zohTaq`*QL!^Re-cIc;3gSCCFH9jM?@zp3P>bF7jtx^xqH9ehSMktTqsDxT+f6)(eB zt&TWdORnbi1lI7K_ScZ#fhlw|U*gDS@&`~>)N<1~7o^p3I{Y+T+{U4n0-eM0ZE84$_*TBw zfmX5+oM&74s26SLOMJOqyn`S0xjEPFjl}Jq*qv)8CkXGV2oesX@#5O1ssU>D&cYo)%lRxg<&wPK6 z^EE-!rj$~(!YtM?V*W(D+t6MKDhqqOTb-f1< z3nl?zLfs~MJC#IRPveqUs#3*Lx$KHMwA(^_2H)0{QkT~$@9-q>ej#fG0jSD-JRhK} zX%A0ybK4Idpp5Ok%VFp>kvo6Z`Q|Jz<@cd!)w_Z7Kq(rQxT#_U!?VY9T(SN06Mygt zdsxBqI3A5~$eqziV6y-6isa?0-lykYbTCn=g1aLhMkx_QhyYKelnK?GF3%Q9wXPs# z@qFQP5oj`0=+Fc0P7~~~Re3T?hy%&6Ug&@VljknrIr=8uOmQRdGM!Bq7 z?W_E8y?QQ#i1!NB#sr`ne^T#S2vqQe`T=ISq^5k0+$-vDW`lLnb#DV^k$3gVDd`5!eB_K& z%E|Mbbb!;L^X`cEMkLB7_Qa^PSAqsDh}-?kJIBgmjsg(tArP;syNouJkfbY?+iQp#`VUXbxw>E(+IaKIH)cS(-$hOOuPM zH7|m(uUfMjF19jZ*L+zC+Q2_F{k)+mcQqE?;d3DS(Rb!Bwsz0|e@Evi zD{oUh_uY7hHo!+zx&D~82Ez{W=CT#sdO*kS^G@y#VyB-Io&P?0nTf@?|9*0liNXI$ zK4oH&u3-6&iJ1Vx;#mrSJ)Wfl49Bx#fS^^Z0-${rs{{CP6)Oa=B(M^IlL_eTo5*aS 
z>`i1%%=Mi-YdL;iE?LcVvNnkofUP!(H3Hm9Vod-As}a6-gE^V)1k?FsG`*O@YCsuC zVVeQ`&8!HZ!40EsFswn7!wuiN!JLX^yqk(etJ82lrD+)T*E9wnIeB(E?yxqU<-%GQ z(wP+?JcGdnkh?NiH7HLquv6BxSnAcatOHCNGMOFVY9>0ztYfvH99hR&0Tx==7JwcL zD+iddupIzRS*!(MDhtn`Vm&&Kt;fuX+1RL|Z0t{14y%W#-W;|YATSr5U(3ZjPjb=6 zo`-NV4^7!GBV2I9ihNX#x*@0ll}4OR-}erEEVa{$)5(zb|92a33pn%wa`ETh7|Sbg&$wep7*` zbF2avU9^$C4nD&hSpz^+CFVI^iDl@kaKz;*tS7JqW)xZW;21;uKUWacR@D@ZjB6KyfeDL{CBMxoeR!lg)6^Gx{glBrF35Q?Y zhOpTUquVfiWizgIu$gUu%+t*{>V@0U=gaMQzw|q>LdOnP3eG?5KvULE3_rOO*9vH1 co4{vZ3y$%03#$U9q!qVwwiSnM@h(>We^8tVZ~y=R diff --git a/settings/repository/org.broadinstitute/variant-1.88.1401.xml b/settings/repository/org.broadinstitute/variant-1.90.1442.xml similarity index 71% rename from settings/repository/org.broadinstitute/variant-1.88.1401.xml rename to settings/repository/org.broadinstitute/variant-1.90.1442.xml index 5db78b1e4..3838b8b6f 100644 --- a/settings/repository/org.broadinstitute/variant-1.88.1401.xml +++ b/settings/repository/org.broadinstitute/variant-1.90.1442.xml @@ -1,3 +1,3 @@ - + From ba2c3b57edff4f8b4186ffacd499ba59794b0f7c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 24 Apr 2013 11:52:26 -0400 Subject: [PATCH 214/226] Extended the allele-biased down-sampling functionality to handle reduced reads. Note that this works only in the case of pileups (i.e. coming from UG); allele-biased down-sampling for RR just cannot work for haplotypes. Added lots of unit tests for new functionality. 
--- .../StandardCallerArgumentCollection.java | 6 - ...elGenotypeLikelihoodsCalculationModel.java | 2 +- ...NPGenotypeLikelihoodsCalculationModel.java | 2 +- .../haplotypecaller/GenotypingEngine.java | 10 +- .../indels/PairHMMIndelErrorModel.java | 6 +- ...dGenotyperReducedReadsIntegrationTest.java | 6 +- .../PerReadAlleleLikelihoodMapUnitTest.java | 2 +- .../arguments/GATKArgumentCollection.java | 5 +- .../AlleleBiasedDownsamplingUtils.java | 181 +++++++++++------- .../genotyper/PerReadAlleleLikelihoodMap.java | 19 +- .../sting/utils/pileup/PileupElement.java | 15 +- .../sting/utils/sam/GATKSAMRecord.java | 35 +++- ...AlleleBiasedDownsamplingUtilsUnitTest.java | 56 +++++- .../utils/sam/GATKSAMRecordUnitTest.java | 34 +++- 14 files changed, 268 insertions(+), 111 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 03698489c..63b8717c0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -54,7 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.io.PrintStream; import java.util.Collections; import java.util.Map; @@ -170,10 +169,6 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); - @Hidden - @Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false) - public PrintStream contaminationLog = null; - @Hidden @Argument(shortName = 
"logExactCalls", doc="x", required=false) public File exactCallsLog = null; @@ -192,7 +187,6 @@ public class StandardCallerArgumentCollection { this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION; this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE; - this.contaminationLog = SCAC.contaminationLog; this.exactCallsLog = SCAC.exactCallsLog; this.sampleContamination=SCAC.sampleContamination; this.AFmodel = SCAC.AFmodel; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 8a766ba48..c6e9ea379 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -145,7 +145,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) { final GenotypeBuilder b = new GenotypeBuilder(sample.getKey()); - final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()), UAC.contaminationLog); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey())); b.PL(genotypeLikelihoods); b.DP(getFilteredDepth(pileup)); genotypes.add(b.make()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 7d2f794ec..ce5f94478 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -105,7 +105,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); final Double contamination = UAC.getSampleContamination().get(sample.getKey()); if( contamination > 0.0 ) //no need to enter if no contamination reduction - pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup,contamination, UAC.contaminationLog); + pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, contamination); if ( useBAQedPileup ) pileup = createBAQedPileup(pileup); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 5fe98649f..419ea378f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -64,7 +64,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; -import java.io.PrintStream; import java.util.*; public class GenotypingEngine { @@ -196,13 +195,13 @@ public class GenotypingEngine { logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); } - final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 
UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); + final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION ); final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL); if( call != null ) { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : - convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) ); + convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) ); final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); VariantContext annotatedCall = call; @@ -406,8 +405,7 @@ public class GenotypingEngine { // BUGBUG: ugh, too complicated protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, final Map> alleleMapper, - final double downsamplingFraction, - final PrintStream downsamplingLog ) { + final double downsamplingFraction ) { final Map alleleReadMap = new LinkedHashMap(); for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample @@ -424,7 +422,7 @@ public class GenotypingEngine { perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); } } - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); } diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 5702fff2a..93a015718 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -61,7 +61,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; -import java.io.PrintStream; import java.util.Arrays; import java.util.LinkedHashMap; import java.util.Map; @@ -213,13 +212,12 @@ public class PairHMMIndelErrorModel { final ReferenceContext ref, final int eventLength, final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, - final double downsamplingFraction, - final PrintStream downsamplingLog) { + final double downsamplingFraction) { final int numHaplotypes = haplotypeMap.size(); final int readCounts[] = new int[pileup.getNumberOfElements()]; final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts); - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index 21b7d0f86..c3597b6aa 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -63,18 +63,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8")); + Arrays.asList("f82389f2bc6d3f932a36be65b60af648")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "b424779c6609cb727a675bdd301290e6"); + testReducedCalling("SNP", "c87f89af948a554cc66bc3afa5251c3b"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "38c3d14cb9086f7355788d3db9b8ff16"); + testReducedCalling("INDEL", "ed4ddc42447ec037c1e14757b6cf0515"); } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index 6ca49d3e5..9530ea41f 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -211,7 +211,7 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest { Assert.assertEquals(perReadAlleleLikelihoodMap.size(),pileup.depthOfCoverage()+10); Assert.assertEquals(perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap().get(base_A).size(),60); - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1,null); + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1); Assert.assertEquals(perReadAlleleLikelihoodMap.size(),(int) (0.9*(pileup.depthOfCoverage()+10))); Map> downsampledStrat = 
perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index e98dcfe9e..8d1fa4638 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -104,8 +104,9 @@ public class GATKArgumentCollection { @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; - @Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.") - public boolean disableRandomization = false; + @Hidden + @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.") + public boolean disableDithering = false; @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. 
By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false) public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index 26e9febe7..fb7a16bfd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.downsampling; -import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -38,65 +37,62 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.io.File; import java.io.IOException; -import java.io.PrintStream; import java.util.*; import org.apache.log4j.Logger; public class AlleleBiasedDownsamplingUtils { + // define this class so that we can use Java generics below + private final static class PileupElementList extends ArrayList {} + /** * Computes an allele biased version of the given pileup * * @param pileup the original pileup * @param downsamplingFraction the fraction of total reads to remove per allele - * @param log logging output * @return allele biased pileup */ - public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { + public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { // special case removal of all or no reads if ( downsamplingFraction <= 0.0 ) return pileup; if ( downsamplingFraction >= 1.0 ) return new ReadBackedPileupImpl(pileup.getLocation(), new 
ArrayList()); - final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + final PileupElementList[] alleleStratifiedElements = new PileupElementList[4]; for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i] = new ArrayList(); + alleleStratifiedElements[i] = new PileupElementList(); // start by stratifying the reads by the alleles they represent at this position + boolean sawReducedRead = false; for ( final PileupElement pe : pileup ) { - // we do not want to remove a reduced read - if ( !pe.getRead().isReducedRead() ) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); - } + if ( pe.getRead().isReducedRead() ) + sawReducedRead = true; + + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); } - // make a listing of allele counts - final int[] alleleCounts = new int[4]; - for ( int i = 0; i < 4; i++ ) - alleleCounts[i] = alleleStratifiedElements[i].size(); + // make a listing of allele counts and calculate the total count + final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead); + final int totalAlleleCount = (int)MathUtils.sum(alleleCounts); // do smart down-sampling - int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor + final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); final HashSet readsToRemove = new HashSet(numReadsToRemove); for ( int i = 0; i < 4; i++ ) { - final ArrayList alleleList = alleleStratifiedElements[i]; + final PileupElementList alleleList = alleleStratifiedElements[i]; // if we don't need to remove any reads, then don't - if ( alleleList.size() > targetAlleleCounts[i] ) - readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - 
targetAlleleCounts[i], log)); + if ( alleleCounts[i] > targetAlleleCounts[i] ) + readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i])); } - // clean up pointers so memory can be garbage collected if needed - for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i].clear(); - // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise - final List readsToKeep = new ArrayList(pileup.getNumberOfElements() - numReadsToRemove); + final List readsToKeep = new ArrayList(totalAlleleCount - numReadsToRemove); for ( final PileupElement pe : pileup ) { if ( !readsToRemove.contains(pe) ) { readsToKeep.add(pe); @@ -106,6 +102,26 @@ public class AlleleBiasedDownsamplingUtils { return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(readsToKeep)); } + /** + * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present) + * + * @param alleleStratifiedElements pileup elements stratified by allele + * @param sawReducedRead is at least one read a reduced read? 
+ * @return non-null int array representing allele counts + */ + private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) { + final int[] alleleCounts = new int[alleleStratifiedElements.length]; + for ( int i = 0; i < alleleStratifiedElements.length; i++ ) { + if ( !sawReducedRead ) { + alleleCounts[i] = alleleStratifiedElements[i].size(); + } else { + for ( final PileupElement pe : alleleStratifiedElements[i] ) + alleleCounts[i] += pe.getRepresentativeCount(); + } + } + return alleleCounts; + } + private static int scoreAlleleCounts(final int[] alleleCounts) { if ( alleleCounts.length < 2 ) return 0; @@ -128,11 +144,11 @@ public class AlleleBiasedDownsamplingUtils { } /** - * Computes an allele biased version of the given pileup + * Computes an allele biased version of the allele counts for a given pileup * - * @param alleleCounts the original pileup - * @param numReadsToRemove fraction of total reads to remove per allele - * @return allele biased pileup + * @param alleleCounts the allele counts for the original pileup + * @param numReadsToRemove number of total reads to remove per allele + * @return non-null array of new counts needed per allele */ protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { final int numAlleles = alleleCounts.length; @@ -169,36 +185,50 @@ public class AlleleBiasedDownsamplingUtils { /** * Performs allele biased down-sampling on a pileup and computes the list of elements to remove * - * @param elements original list of records + * @param elements original list of pileup elements + * @param originalElementCount original count of elements (taking reduced reads into account) * @param numElementsToRemove the number of records to remove - * @param log logging output * @return the list of pileup elements TO REMOVE */ - private static List downsampleElements(final List elements, final int numElementsToRemove, final PrintStream log) 
{ - ArrayList elementsToRemove = new ArrayList(numElementsToRemove); - + protected static List downsampleElements(final List elements, final int originalElementCount, final int numElementsToRemove) { // are there no elements to remove? if ( numElementsToRemove == 0 ) - return elementsToRemove; + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); // should we remove all of the elements? - final int pileupSize = elements.size(); - if ( numElementsToRemove == pileupSize ) { - logAllElements(elements, log); + if ( numElementsToRemove >= originalElementCount ) { elementsToRemove.addAll(elements); return elementsToRemove; } // create a bitset describing which elements to remove - final BitSet itemsToRemove = new BitSet(pileupSize); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); } - for ( int i = 0; i < pileupSize; i++ ) { - if ( itemsToRemove.get(i) ) { - final T element = elements.get(i); - logElement(element, log); + int currentBitSetIndex = 0; + for ( final PileupElement element : elements ) { + + final int representativeCount = element.getRepresentativeCount(); + + // if it's a reduced read, we need to be smart about how we down-sample + if ( representativeCount > 1 ) { + // count how many bits are set over the span represented by this read + int setBits = 0; + for ( int i = 0; i < representativeCount; i++ ) + setBits += itemsToRemove.get(currentBitSetIndex++) ? 
1 : 0; + + // remove that count from the count of the reduced read + if ( setBits == representativeCount ) + elementsToRemove.add(element); + else + element.adjustRepresentativeCount(-1 * setBits); + } + // otherwise it's trivial: remove if the corresponding bit is set + else if ( itemsToRemove.get(currentBitSetIndex++) ) { elementsToRemove.add(element); } } @@ -211,10 +241,9 @@ public class AlleleBiasedDownsamplingUtils { * * @param alleleReadMap original list of records per allele * @param downsamplingFraction the fraction of total reads to remove per allele - * @param log logging output * @return list of reads TO REMOVE from allele biased down-sampling */ - public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction, final PrintStream log) { + public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { int totalReads = 0; for ( final List reads : alleleReadMap.values() ) totalReads += reads.size(); @@ -225,6 +254,8 @@ public class AlleleBiasedDownsamplingUtils { final List alleles = new ArrayList(alleleReadMap.keySet()); alleles.remove(Allele.NO_CALL); // ignore the no-call bin final int numAlleles = alleles.size(); + + // TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead final int[] alleleCounts = new int[numAlleles]; for ( int i = 0; i < numAlleles; i++ ) alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); @@ -234,38 +265,52 @@ public class AlleleBiasedDownsamplingUtils { final List readsToRemove = new ArrayList(numReadsToRemove); for ( int i = 0; i < numAlleles; i++ ) { - final List alleleBin = alleleReadMap.get(alleles.get(i)); - - if ( alleleBin.size() > targetAlleleCounts[i] ) { - readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log)); + if ( alleleCounts[i] > targetAlleleCounts[i] ) { + 
readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i])); } } return readsToRemove; } - private static void logAllElements(final List elements, final PrintStream log) { - if ( log != null ) { - for ( final T obj : elements ) { - logElement(obj, log); - } + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove + * + * @param reads original list of records + * @param numElementsToRemove the number of records to remove + * @return the list of pileup elements TO REMOVE + */ + protected static List downsampleElements(final List reads, final int numElementsToRemove) { + // are there no elements to remove? + if ( numElementsToRemove == 0 ) + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); + final int originalElementCount = reads.size(); + + // should we remove all of the elements? + if ( numElementsToRemove >= originalElementCount ) { + elementsToRemove.addAll(reads); + return elementsToRemove; } - } - private static void logElement(final T obj, final PrintStream log) { - if ( log != null ) { - - final GATKSAMRecord read; - if ( obj instanceof PileupElement ) - read = ((PileupElement)obj).getRead(); - else - read = (GATKSAMRecord)obj; - - final SAMReadGroupRecord readGroup = read.getReadGroup(); - log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit())); + // create a bitset describing which elements to remove + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); } - } + int currentBitSetIndex = 0; + for ( final GATKSAMRecord read : reads ) { + if ( read.isReducedRead() ) + throw new IllegalStateException("Allele-biased downsampling of reduced reads has 
not been implemented for a list of GATKSAMRecords"); + + if ( itemsToRemove.get(currentBitSetIndex++) ) + elementsToRemove.add(read); + } + + return elementsToRemove; + } /** * Create sample-contamination maps from file diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 8134b1257..150e24c51 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.genotyper; import com.google.java.contract.Ensures; -import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; -import java.io.PrintStream; import java.util.*; /** @@ -44,9 +42,8 @@ import java.util.*; * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. 
*/ public class PerReadAlleleLikelihoodMap { - private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class); - protected List alleles; - protected Map> likelihoodReadMap; + protected final List alleles; + protected final Map> likelihoodReadMap; public PerReadAlleleLikelihoodMap() { likelihoodReadMap = new LinkedHashMap>(); @@ -78,17 +75,16 @@ public class PerReadAlleleLikelihoodMap { } - public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { - return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log); + public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { + return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction); } /** * For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion * of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination * @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed. 
- * @param log - a PrintStream to log the removed reads to (passed through to the utility function) */ - public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) { + public void performPerAlleleDownsampling(final double downsamplingFraction) { // special case removal of all or no reads if ( downsamplingFraction <= 0.0 ) return; @@ -101,7 +97,7 @@ public class PerReadAlleleLikelihoodMap { final Map> alleleReadMap = getAlleleStratifiedReadMap(); // compute the reads to remove and actually remove them - final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log); + final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction); for ( final GATKSAMRecord read : readsToRemove ) likelihoodReadMap.remove(read); } @@ -117,7 +113,8 @@ public class PerReadAlleleLikelihoodMap { alleleReadMap.put(allele, new ArrayList()); for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { - // do not remove reduced reads! + // TODO -- come up with a strategy for down-sampling reduced reads + // Currently we are unable to remove reduced reads because their representative base count differs throughout the read if ( !entry.getKey().isReducedRead() ) { final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); if ( bestAllele.isInformative() ) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 51753ca5e..ba5dee34c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -303,7 +303,7 @@ public class PileupElement implements Comparable { * this being a reduced read and a deletion, we return the average number of elements between the left * and right elements to the deletion. 
We assume the deletion to be left aligned. * - * @return + * @return the representative count */ public int getRepresentativeCount() { if (read.isReducedRead()) { @@ -318,6 +318,19 @@ public class PileupElement implements Comparable { } } + /** + * Adjusts the representative count of this pileup element. + * Throws an exception if this element does not represent a reduced read. + * + * @param adjustmentFactor how much to adjust the representative count (can be positive or negative) + */ + public void adjustRepresentativeCount(final int adjustmentFactor) { + if ( read.isReducedRead() ) + read.adjustReducedCount(offset, adjustmentFactor); + else + throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced"); + } + /** * Get the cigar element aligning this element to the genome * @return a non-null CigarElement diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 0e672b3d7..e4a2bed44 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -400,7 +400,40 @@ public class GATKSAMRecord extends BAMRecord { * @return the number of bases corresponding to the i'th base of the reduced read */ public final byte getReducedCount(final int i) { - return getReducedReadCounts()[i]; + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced"); + final byte[] reducedCounts = getReducedReadCounts(); + return reducedCounts[i]; + } + + /** + * Sets the number of bases corresponding the i'th base of the reduced read. 
+ * + * @param i the read based coordinate inside the read + * @param count the new count + */ + public final void setReducedCount(final int i, final byte count) { + if ( count < 0 ) + throw new IllegalArgumentException("the reduced count cannot be set to a negative value"); + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); + final byte[] reducedCounts = getReducedReadCounts(); + reducedCounts[i] = count; + setReducedReadCountsTag(reducedCounts); + } + + /** + * Sets the number of bases corresponding the i'th base of the reduced read. + * + * @param i the read based coordinate inside the read + * @param adjustmentFactor how much to add/subtract to the current count + */ + public final void adjustReducedCount(final int i, final int adjustmentFactor) { + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); + final byte[] reducedCounts = getReducedReadCounts(); + final byte newCount = (byte)(reducedCounts[i] + adjustmentFactor); + setReducedCount(i, newCount); } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java index 23b940491..6314d4681 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -25,18 +25,21 @@ package org.broadinstitute.sting.gatk.downsampling; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.UserException; +import 
org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.*; /** @@ -115,6 +118,51 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { return true; } + @DataProvider(name = "BiasedDownsamplingTest") + public Object[][] makeBiasedDownsamplingTest() { + final List tests = new LinkedList(); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) { + for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) { + for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) { + if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount ) + continue; + for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) { + if ( toRemove <= originalNormalCount + originalReducedCount ) + tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BiasedDownsamplingTest") + public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) { + + final LinkedList elements = new LinkedList(); + for ( int i = 0; i < originalNormalCount; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); + elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); + } + if ( originalReducedCount > 0 ) { + final GATKSAMRecord read = 
ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); + read.setReducedReadCountsTag(new byte[]{(byte)originalReducedCount}); + elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); + } + + final List result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + originalReducedCount, toRemove); + int pileupCount = 0; + for ( final PileupElement pe : elements ) // reduced reads may have gotten modified + pileupCount += pe.getRepresentativeCount(); + for ( final PileupElement pe : result ) + pileupCount -= pe.getRepresentativeCount(); + + Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove); + } @Test public void testLoadContaminationFileDetails(){ diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 57a7946ae..06cdb366e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -39,7 +39,7 @@ public class GATKSAMRecordUnitTest extends BaseTest { final static String BASES = "ACTG"; final static String QUALS = "!+5?"; final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; - final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets + final private static byte[] REDUCED_READ_COUNTS_OFFSETS = new byte[]{10, 10, 20, 30, -9}; // just the offsets @BeforeClass public void init() { @@ -52,7 +52,7 @@ public class GATKSAMRecordUnitTest extends BaseTest { reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); reducedRead.setReadBases(BASES.getBytes()); reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); + 
reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS_OFFSETS); } @Test @@ -66,6 +66,36 @@ public class GATKSAMRecordUnitTest extends BaseTest { } } + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetReducedCountOnNormalRead() { + read.getReducedCount(0); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testSetReducedTagOnNormalRead() { + read.setReducedCount(0, (byte)2); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testSetReducedCountToNegativeNumber() { + reducedRead.setReducedCount(0, (byte)1000); + } + + @Test + public void testSetReducedTagOnReducedRead() { + for (int i = 0; i < reducedRead.getReadLength(); i++) { + final byte newCount = (byte)i; + reducedRead.setReducedCount(i, newCount); + Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i); + } + + for (int i = 0; i < reducedRead.getReadLength(); i++) { + final int newCount = reducedRead.getReducedCount(i) + i; + reducedRead.adjustReducedCount(i, i); + Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i); + } + } + @Test public void testReducedReadPileupElement() { PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0); From 021adf4220392c999ec203ca567d55a5267d070e Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 25 Apr 2013 15:39:42 -0400 Subject: [PATCH 215/226] WTF - I thought we had disabled the randomized dithering of rank sum tests for integration tests?! Well, it wasn't done so I went ahead and did so. Lots of MD5 changes accordingly. 
--- .../gatk/walkers/annotator/RankSumTest.java | 2 +- ...dGenotyperIndelCallingIntegrationTest.java | 22 +++++----- .../UnifiedGenotyperIntegrationTest.java | 40 +++++++++---------- ...GenotyperNormalCallingIntegrationTest.java | 24 +++++------ ...dGenotyperReducedReadsIntegrationTest.java | 10 ++--- ...lexAndSymbolicVariantsIntegrationTest.java | 6 +-- .../HaplotypeCallerIntegrationTest.java | 16 ++++---- 7 files changed, 60 insertions(+), 60 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index ec107512a..ef456824e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -183,6 +183,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR * @param headerLines */ public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { - useDithering = ! toolkit.getArguments().disableRandomization; + useDithering = ! 
toolkit.getArguments().disableDithering; } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index ece92f50f..c33e89b99 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -56,8 +56,8 @@ import java.util.List; public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + private final static String baseCommandIndels = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- // @@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("118ed5b54fc9ce1cde89f06a20afebef")); + Arrays.asList("d8b0c5be39ec6b239641c2f2646d2bc3")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -88,7 +88,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -minIndelCnt 
1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("6ef59013331bc031ea37807b325d7d2c")); + Arrays.asList("d9572a227ccb13a6baa6dc4fb65bc1e5")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("dd3ee4675377191e34aaf67335e0219a")); + Arrays.asList("54e13f696f56eb742bf449ad11d0dc5f")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("bb06ef8262f91664b7d2fe7e1e5df195")); + Arrays.asList("16d975480ff1e689113171805b916b62")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("0a2a8cc2d1a79e84624836a31de5491c")); + Arrays.asList("60ed3f8d5bc3f765e6ce3fa698b68bb7")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + 
validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("939f80c6d2dfb592956aed3bdeaf319d")); + Arrays.asList("e87f5c76661527ef7aa44e528fe19573")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("fc937f92e59dfe07b894411b5dfc166a")); + Arrays.asList("264325878b988acc11d8e5d9d2ba0b7f")); executeTest("test minIndelFraction 0.0", spec); } @@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("41ad9e0edca4b9987390ba5c07f39e4a")); + Arrays.asList("98abcfb0a008050eba8b9c285a25b2a0")); executeTest("test minIndelFraction 0.25", spec); } @@ -200,7 +200,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { @Test public void testHaplotype0Length() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null", + "-T UnifiedGenotyper --disableDithering -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null", 0, Collections.emptyList()); executeTest("testHaplotype0Length", spec); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index c89b1dfbf..605be3b2d 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -60,8 +60,8 @@ import java.util.Collections; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; + private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; // -------------------------------------------------------------------------------------------------------------- // @@ -73,15 +73,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff")); + Arrays.asList("30be17df00acc8a92223f51fe7c1bdf7")); executeTest("test min_base_quality_score 26", spec); } @Test public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + 
"NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("55760482335497086458b09e415ecf54")); + "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("4aa226c00a242047cf427d0919003048")); executeTest("test SLOD", spec); } @@ -89,15 +89,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("938e888a40182878be4c3cc4859adb69")); + Arrays.asList("17f65eca1e6c1f06919a58f230b6d8d3")); executeTest("test NDA", spec); } @Test public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("7dc186d420487e4e156a24ec8dea0951")); + "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("50937942e3d228614d2531c3be237709")); executeTest("test using comp track", spec); } @@ -111,17 +111,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9"); + testOutputParameters("-sites_only", "48cd40d3994911a6f2609bfd375e1d2d"); } @Test public void testOutputParameterAllConfident() { - 
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "5649f72de04e1391e0f2bb86843d3d72"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "28f40ce47651f504158fc4e5bb58df4b"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "cb151bb9e90680b12714d481091ed209"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "5259dafaa1b57d9489003b16a48e35f8"); } private void testOutputParameters(final String args, final String md5) { @@ -135,7 +135,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb")); + Arrays.asList("918109938ef355d759dafc3ebb47d8a5")); executeTest("test confidence 1", spec1); } @@ -143,7 +143,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNoPrior() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1, - Arrays.asList("422656266117f8d01e17e5c491c49a24")); + Arrays.asList("7ac60bdc355d97c0939e644b58de47d7")); executeTest("test no prior 1", spec1); } @@ -155,12 +155,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "ffc1f83a045dc09360e11de7a8efd159" ); + testHeterozosity( 0.01, "3b66f82dbb746875638e076bf51a1583" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "5426a98df9f5fd70aef295d889c4e4f1" ); + testHeterozosity( 1.0 / 1850, 
"714c1795334c7c62c046a75479381ae6" ); } private void testHeterozosity(final double arg, final String md5) { @@ -176,7 +176,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "d5a7326fdcf6d441b73c381912ad3a2a"; + private final static String COMPRESSED_OUTPUT_MD5 = "6f79205f7ed8006470f056f6805db6c8"; @Test public void testCompressedOutput() { @@ -232,7 +232,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("0a4a78da876bfa3d42170249a94357b4")); + Arrays.asList("d995b76adc3766889f7c2c88da14055c")); executeTest(String.format("test multiple technologies"), spec); } @@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("89182fd4d9532ab4b2a0a84bfb557089")); + Arrays.asList("9669e1643d22d5ad047b3941aeefd6db")); executeTest(String.format("test calling with BAQ"), spec); } @@ -281,8 +281,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6")); + "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, + Arrays.asList("2ae3fd39c53a6954d32faed8703adfe8")); executeTest("test calling on reads with Ns in CIGAR", spec); } } diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index a58d3f3a8..a35cb1ecc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -53,7 +53,7 @@ import java.util.Arrays; public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; // -------------------------------------------------------------------------------------------------------------- // @@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("52b6086f4597da5b35ab902bea4066fc")); + Arrays.asList("e3efd1917192ea743ac1e9958aa0a98f")); executeTest("test MultiSample Pilot1", spec); } @@ -72,7 +72,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 
1, - Arrays.asList("5b31b811072a4df04524e13604015f9b")); + Arrays.asList("ebfcc3dd8c1788929cb50050c5d456df")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -80,7 +80,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("d9992e55381afb43742cc9b30fcd7538")); + Arrays.asList("698e54aeae3130779d246b9480a4052c")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -88,22 +88,22 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("33ab66c2f062cfa1f7fcc077165f778c")); + Arrays.asList("aaadb2a355d87344eabb6ac4495a11e4")); executeTest("test SingleSample Pilot2", spec); } @Test public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("28bfbff3da3af43d6a1eff673e5cb0f8")); + "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, + Arrays.asList("09a1a4d4bf0289bcc5e8a958f783a989")); 
executeTest("test Multiple SNP alleles", spec); } @Test public void testBadRead() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, + "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, Arrays.asList("d915535c1458733f09f82670092fcab6")); executeTest("test bad read", spec); } @@ -111,16 +111,16 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ @Test public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("a9edd04374ee9c42970291f39a50c191")); + "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("57a1bb44967988f2b7ae7779127990ae")); executeTest("test reverse trim", spec); } @Test public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("6fc32ca9de769060f3c2a3d94f8f2f91")); + "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, + Arrays.asList("3011c20165951ca43c8a4e86a5835dbd")); executeTest("test mismatched PLs", spec); } } diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index c3597b6aa..191bf3c27 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -62,25 +62,25 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { @Test public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("f82389f2bc6d3f932a36be65b60af648")); + "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + Arrays.asList("da3fd775c8add1f7962baabf06b7d372")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "c87f89af948a554cc66bc3afa5251c3b"); + testReducedCalling("SNP", "76244ab1be60814f1412e6cd09e546cc"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "ed4ddc42447ec037c1e14757b6cf0515"); + testReducedCalling("INDEL", "9a986b98ed014576ce923e07452447f4"); } private void testReducedCalling(final String model, final String md5) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, + "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + 
" --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, Arrays.asList(md5)); executeTest("test calling on a ReducedRead BAM with " + model, spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 5fe4e6dfa..d3f3a9936 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -57,7 +57,7 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); } @@ -68,7 +68,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } private void HCTestSymbolicVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s 
-I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); } @@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } private void HCTestComplexGGA(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index fff1c0bb9..02567f188 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -73,7 +73,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String INTERVALS_FILE = 
validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; private void HCTest(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCaller: args=" + args, spec); } @@ -105,7 +105,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } private void HCTestIndelQualityScores(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); } @@ -120,7 +120,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); for( final File vcf : 
executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { if( containsDuplicateRecord(vcf, parser) ) { @@ -158,14 +158,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { // any of the calls in that region because it is so messy. @Test public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("cb190c935541ebb9f660f713a882b922")); executeTest("HCTestStructuralIndels: ", spec); } @@ -173,7 +173,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDoesNotFailOnBadRefBase() { // don't care about the output - just want to make sure it doesn't fail - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 
-stand_call_conf 2 -stand_emit_conf 2"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); } @@ -187,7 +187,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, Arrays.asList("3c87eb93ffe3a0166aca753050b981e1")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -195,7 +195,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, + "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, Arrays.asList("8adfa8a27a312760dab50787da595c57")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } From 360e2ba87e10c00785f75046d7197cdec57fdbfb Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 26 Apr 2013 12:23:11 -0400 Subject: [PATCH 216/226] Fixed bug reported on the 
forum where using the --exclude_sample_file argument in SV was giving bad results. Added integration test. https://www.pivotaltracker.com/s/projects/793457/stories/47399245 --- .../SelectVariantsIntegrationTest.java | 17 ++++++++++++++++- .../walkers/variantutils/SelectVariants.java | 4 ++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 303a2871a..4b1483cb6 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -144,7 +144,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { } @Test - public void testSampleExclusion() { + public void testSampleExclusionFromFileAndSeparateSample() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; @@ -158,6 +158,21 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testSampleExclusion--" + testfile, spec); } + @Test + public void testSampleExclusionJustFromFile() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sf " + samplesFile + " --variant " + testfile, + 1, + Arrays.asList("875d7e00ac8081e87ab9fb1b20c83677") + ); + spec.disableShadowBCF(); + + executeTest("testSampleExclusion--" + testfile, spec); + } + @Test public void testSampleInclusionWithNonexistingSamples() { String testfile = validationDataLocation + 
"test.filtered.maf_annotated.vcf"; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 02c8ed8d8..1f2b6d09b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -377,10 +377,10 @@ public class SelectVariants extends RodWalker implements TreeR } // now, exclude any requested samples - Collection XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); + final Collection XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); samples.removeAll(XLsamplesFromFile); samples.removeAll(XLsampleNames); - NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty(); + NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty() && XLsamplesFromFile.isEmpty(); if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED ) throw new UserException("All samples requested to be included were also requested to be excluded."); From 7d90bbab082b1d06a858c1eeb33c30444bcb15c5 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 26 Apr 2013 13:42:01 -0400 Subject: [PATCH 218/226] Add support for snpEff "GATK compatibility mode" (-o gatk) -Do not throw an exception when parsing snpEff output files generated by not-officially-supported versions of snpEff, PROVIDED that snpEff was run with -o gatk -Requested by the snpEff author -Relevant integration tests updated/expanded --- .../VariantAnnotatorIntegrationTest.java | 24 ++++++++-- .../sting/gatk/walkers/annotator/SnpEff.java | 48 +++++++++++-------- 2 files changed, 48 insertions(+), 24 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java 
index d82d920a8..5866075a7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -226,15 +226,29 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { } @Test - public void testSnpEffAnnotationsUnsupportedVersion() { + public void testSnpEffAnnotationsUnsupportedVersionGATKMode() { WalkerTestSpec spec = new WalkerTestSpec( - "-T VariantAnnotator -R " + hg19Reference + " --no_cmdline_in_header -o %s -A SnpEff --variant " + - validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + - "snpEff.AFR.unfiltered.unsupported.version.vcf -L 1:1-1,500,000", + "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + + "--variant " + privateTestDir + "vcf4.1.example.vcf " + + "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_gatk_mode.vcf " + + "-L 1:10001292-10012424", + 1, + Arrays.asList("7352cf23a4d45d3d2bb34ab44a4100ae") + ); + executeTest("Testing SnpEff annotations (unsupported version, GATK mode)", spec); + } + + @Test + public void testSnpEffAnnotationsUnsupportedVersionNoGATKMode() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " --no_cmdline_in_header -o %s -A SnpEff " + + "--variant " + privateTestDir + "vcf4.1.example.vcf " + + "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_no_gatk_mode.vcf " + + "-L 1:10001292-10012424", 1, UserException.class ); - executeTest("Testing SnpEff annotations (unsupported version)", spec); + executeTest("Testing SnpEff annotations (unsupported version, no GATK mode)", spec); } @Test diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index bc365c59c..288196d1b 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; +import java.util.regex.Pattern; /** * A set of genomic annotations based on the output of the SnpEff variant effect predictor tool @@ -63,6 +64,8 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.5" }; public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd"; + public static final String SNPEFF_GATK_COMPATIBILITY_ARGUMENT = "-o gatk"; + public static final Pattern SNPEFF_GATK_COMPATIBILITY_ARGUMENT_PATTERN = Pattern.compile("-o\\s+gatk"); // When we write the SnpEff version number and command line to the output VCF, we change // the key name slightly so that the output VCF won't be confused in the future for an @@ -219,8 +222,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY); VCFHeaderLine snpEffCommandLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_COMMAND_LINE_KEY); - checkSnpEffVersion(snpEffVersionLine); - checkSnpEffCommandLine(snpEffCommandLine); + checkSnpEffVersionAndCommandLine(snpEffVersionLine, snpEffCommandLine); // If everything looks ok, add the SnpEff version number and command-line header lines to the // header of the VCF output file, changing the key names so that our output file won't be @@ -267,37 +269,45 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio } } - private void checkSnpEffVersion ( VCFHeaderLine snpEffVersionLine ) { + private void 
checkSnpEffVersionAndCommandLine( final VCFHeaderLine snpEffVersionLine, final VCFHeaderLine snpEffCommandLine ) { if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) { - throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " + - "input file, and so could not verify that the file was generated by a supported version of SnpEff (" + - Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")"); + throw new UserException(String.format("Could not find a %s entry in the VCF header for the SnpEff input file, " + + "and so could not verify that the file was generated by a supported version of SnpEff (%s)", + SNPEFF_VCF_HEADER_VERSION_LINE_KEY, supportedSnpEffVersionsString())); + } + + if ( snpEffCommandLine == null || snpEffCommandLine.getValue() == null || snpEffCommandLine.getValue().trim().length() == 0 ) { + throw new UserException(String.format("Could not find a %s entry in the VCF header for the SnpEff input file, " + + "which should be added by all supported versions of SnpEff (%s)", + SNPEFF_VCF_HEADER_COMMAND_LINE_KEY, supportedSnpEffVersionsString())); } String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0]; - if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) { - throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " + - "is not currently supported by the GATK. Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS)); + if ( ! isSupportedSnpEffVersion(snpEffVersionString, snpEffCommandLine.getValue()) ) { + throw new UserException(String.format("The version of SnpEff used to generate the SnpEff input file (%s) " + + "is not currently supported by the GATK, and was not run in GATK " + + "compatibility mode. 
Supported versions are: %s", + snpEffVersionString, supportedSnpEffVersionsString())); } } - private void checkSnpEffCommandLine ( VCFHeaderLine snpEffCommandLine ) { - if ( snpEffCommandLine == null || snpEffCommandLine.getValue() == null || snpEffCommandLine.getValue().trim().length() == 0 ) { - throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_COMMAND_LINE_KEY + " entry in the VCF header for the SnpEff " + - "input file, which should be added by all supported versions of SnpEff (" + - Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")"); - } - } - - private boolean isSupportedSnpEffVersion ( String versionString ) { + private boolean isSupportedSnpEffVersion( final String versionString, final String commandLine ) { + // first check to see if it's an officially-supported version for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) { if ( supportedVersion.equals(versionString) ) { return true; } } - return false; + // if it's not an officially-supported version, check to see whether the + // "-o gatk" compatibility option was specified + return SNPEFF_GATK_COMPATIBILITY_ARGUMENT_PATTERN.matcher(commandLine).find(); + } + + private String supportedSnpEffVersionsString() { + return String.format("%s, as well as later versions when run with the option %s", + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS), SNPEFF_GATK_COMPATIBILITY_ARGUMENT); } private VariantContext getMatchingSnpEffRecord ( List snpEffRecords, VariantContext vc ) { From 4168aaf2806f5b6925eb6eea312c65b985663cde Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 26 Apr 2013 15:49:31 -0400 Subject: [PATCH 219/226] Add feature to specify Allele frequency priors by command line when calling variants. Use case: The default AF priors used (infinite sites model, neutral variation) is appropriate in the case where the reference allele is ancestral, and the called allele is a derived allele. 
Most of the times this is true but in several population studies and in ancient DNA analyses this might introduce reference biases, and in some other cases it's hard to ascertain what the ancestral allele is (normally requiring to look up homologous chimp sequence). Specifying no prior is one solution, but this may introduce a lot of artifactual het calls in shallower coverage regions. With this option, users can specify what the prior for each AC should be according to their needs, subject to the restrictions documented in the code and in GATK docs. -- Updated ancient DNA single sample calling script with filtering options and other cleanups. -- Added integration test. Removed old -noPrior syntax. --- .../StandardCallerArgumentCollection.java | 27 +++++++++--- .../genotyper/UnifiedGenotyperEngine.java | 42 ++++++++++++------- .../genotyper/afcalc/AFCalcTestBuilder.java | 3 +- .../UnifiedGenotyperIntegrationTest.java | 10 ++++- .../genotyper/afcalc/AFCalcUnitTest.java | 10 +++-- 5 files changed, 67 insertions(+), 25 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 63b8717c0..5016526c0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -54,7 +54,10 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; import java.util.Collections; +import java.util.List; import java.util.Map; /** @@ -118,13 +121,27 @@ public class StandardCallerArgumentCollection { public int MAX_ALTERNATE_ALLELES = 6; /** - * By default, the prior specified with the argument --heterozygosity/-hets is used 
for variant discovery at a particular locus. - * If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate, + * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model, + * see e.g. Waterson (1975) or Tajima (1996). + * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N + * + * There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate, * as for example when the ancestral status of the reference allele is not known. + * By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns: + * a) User must specify 2N values, where N is the number of samples. + * b) Only diploid calls supported. + * c) Probability values are specified in double format, in linear space. + * d) No negative values allowed. + * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one. + * f) If user-defined values add to more than one, an error will be produced. + * + * If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g. + * -inputPrior 0.33 -inputPrior 0.33 + * for the single-sample diploid case. 
*/ @Advanced - @Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false) - public boolean ignoreHeterozygosityPrior = false; + @Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false) + public List inputPrior = Collections.emptyList(); /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. @@ -190,6 +207,6 @@ public class StandardCallerArgumentCollection { this.exactCallsLog = SCAC.exactCallsLog; this.sampleContamination=SCAC.sampleContamination; this.AFmodel = SCAC.AFmodel; - this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior; + this.inputPrior = SCAC.inputPrior; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 55db44052..3380efcc9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine { this.N = samples.size() * ploidy; log10AlleleFrequencyPriorsSNPs = new double[N+1]; log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior); filter.add(LOW_QUAL_FILTER_NAME); @@ -744,27 +744,39 @@ public class UnifiedGenotyperEngine { 
* where Pr(AC=i) = theta/i where theta is heterozygosity * @param N Number of chromosomes * @param priors (output) array to be filled with priors - * @param theta Heterozygosity - * @param ignorePriors If true, priors are ignored and zeros returned + * @param heterozygosity default heterozygosity to use, if inputPriors is empty + * @param inputPriors Input priors to use (in which case heterozygosity is ignored) */ - public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) { + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { + - if (ignorePriors) { - Arrays.fill(priors, 0,N,0.0); - return; - } double sum = 0.0; - // for each i - for (int i = 1; i <= N; i++) { - final double value = theta / (double)i; - priors[i] = Math.log10(value); - sum += value; + if (!inputPriors.isEmpty()) { + // user-specified priors + if (inputPriors.size() != N) + throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); + + int idx = 1; + for (final double prior: inputPriors) { + if (prior < 0.0) + throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); + priors[idx++] = Math.log10(prior); + sum += prior; + } + } + else { + // for each i + for (int i = 1; i <= N; i++) { + final double value = heterozygosity / (double)i; + priors[i] = Math.log10(value); + sum += value; + } } // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions) if (sum > 1.0) { - throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument"); + throw new 
UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); } // null frequency for AF=0 is (1 - sum(all other frequencies)) priors[0] = Math.log10(1.0 - sum); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index a4224bf6c..042e04767 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; @@ -111,7 +112,7 @@ public class AFCalcTestBuilder { return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors case human: final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); return humanPriors; default: throw new RuntimeException("Unexpected type " + priorType); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 605be3b2d..253467a76 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -142,10 +142,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testNoPrior() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1, + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1, Arrays.asList("7ac60bdc355d97c0939e644b58de47d7")); executeTest("test no prior 1", spec1); + } + @Test + public void testUserPrior() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1, + Arrays.asList("04d05900849d5a3f6f3f98bd0f262369")); + executeTest("test user prior 1", spec1); + } // -------------------------------------------------------------------------------------------------------------- diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 5eebe9670..2bdf5078d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest { final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat 
priors final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { for ( AFCalc model : calcs ) { @@ -583,8 +583,12 @@ public class AFCalcUnitTest extends BaseTest { final double[] flatPriors = new double[]{0.0,0.0,0.0}; final double[] noPriors = new double[3]; - // test that function computeAlleleFrequency correctly operates when the -noPrior option is set - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true); + // test that function computeAlleleFrequency correctly operates when the flat prior option is set + // computeAlleleFrequencyPriors takes linear priors + final ArrayList inputPrior = new ArrayList(); + inputPrior.add(1.0/3); + inputPrior.add(1.0/3); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0,inputPrior); GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior"); GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior"); From 76e997895e2549f5b10ff1f6040fad7b7b6e5b83 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 26 Apr 2013 23:33:09 -0400 Subject: [PATCH 220/226] Updates GATKDocs for ReduceReads downsampling [fixes #48258295] --- .../gatk/walkers/compression/reducereads/ReduceReads.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 82a02ca55..71910e566 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -64,7 +64,6 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -229,7 +228,9 @@ public class ReduceReads extends ReadWalker, Redu public double minIndelProportionToTriggerVariant = 0.05; /** - * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this). + * The number of reads emitted per sample in a variant region can be downsampled for better compression. + * This level of downsampling only happens after the region has been evaluated, therefore it can + * be combined with the engine level downsampling. * A value of 0 turns downsampling off. */ @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false) From 0387ea8df96a8a8fa7cb8a6e2c3e5c590cbe8715 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 29 Apr 2013 10:05:18 -0400 Subject: [PATCH 221/226] Bugfix for ReadClipper with ReducedReads -- The previous version of the read clipping operations wouldn't modify the reduced reads counts, so hardClipToRegion would result in a read with, say, 50 bp of sequence and base qualities but 250 bp of reduced read counts. Updated the hardClip operation to handle reduce reads, and added a unit test to make sure this works properly. 
Also had to update GATKSAMRecord.emptyRead() to set the reduced count to new byte[0] if the template read is a reduced read -- Update md5s, where the new code recovers a TP variant with count 2 that was missed previously --- .../HaplotypeCallerIntegrationTest.java | 2 +- .../sting/utils/clipping/ClippingOp.java | 8 +++++++- .../sting/utils/sam/GATKSAMRecord.java | 17 +++++++++++++++++ .../utils/clipping/ReadClipperUnitTest.java | 17 +++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 02567f188..50165bd01 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("3c87eb93ffe3a0166aca753050b981e1")); + Arrays.asList("0df626cd0d76aca8a05a545d0b36bf23")); executeTest("HC calling on a ReducedRead BAM", spec); } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index ad6f05563..ccc847092 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -378,7 +378,13 @@ public class ClippingOp { hardClippedRead.setBaseQualities(newBaseInsertionQuals, 
EventType.BASE_INSERTION); hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION); } - + + if (read.isReducedRead()) { + final byte[] reducedCounts = new byte[newLength]; + System.arraycopy(read.getReducedReadCounts(), copyStart, reducedCounts, 0, newLength); + hardClippedRead.setReducedReadCounts(reducedCounts); + } + return hardClippedRead; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index e4a2bed44..c0ea81563 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -393,6 +393,22 @@ public class GATKSAMRecord extends BAMRecord { setAttribute(REDUCED_READ_CONSENSUS_TAG, counts); } + /** + * Set the reduced read counts tag for this record to counts + * + * Note that this function does not set the REDUCED_READ_CONSENSUS_TAG value, it's purely for manipulating + * the underlying reduced reads count + * + * TODO -- this function needs to be fixed when the RR spec is set to 2.0 + * + * @param counts the count array + */ + public void setReducedReadCounts(final byte[] counts) { + if ( counts.length != getReadBases().length ) throw new IllegalArgumentException("Reduced counts length " + counts.length + " != bases length " + getReadBases().length); + retrievedReduceReadCounts = true; + reducedReadCounts = counts; + } + /** * The number of bases corresponding the i'th base of the reduced read. 
* @@ -678,6 +694,7 @@ public class GATKSAMRecord extends BAMRecord { emptyRead.setCigarString(""); emptyRead.setReadBases(new byte[0]); emptyRead.setBaseQualities(new byte[0]); + if ( read.isReducedRead() ) emptyRead.setReducedReadCounts(new byte[0]); SAMReadGroupRecord samRG = read.getReadGroup(); emptyRead.clearAttributes(); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index a85ed2ce0..e1cf82850 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -35,6 +35,7 @@ import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -348,4 +349,20 @@ public class ReadClipperUnitTest extends BaseTest { } + @Test(enabled = true) + public void testHardClipReducedRead() { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("10M"); + final byte[] counts = new byte[read.getReadLength()]; + for ( int i = 0; i < counts.length; i++ ) counts[i] = (byte)i; + read.setReducedReadCounts(counts); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + int readLength = read.getReadLength(); + for (int i = 0; i < readLength / 2; i++) { + GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); + final byte[] expectedReducedCounts = Arrays.copyOfRange(counts, i + 1, readLength - i - 1); + Assert.assertEquals(clippedRead.getReducedReadCounts(), expectedReducedCounts); + } + } + } \ No newline at end of file From 0e7e6d35d852edc837e75a0c93efd49bda192826 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Mon, 29 Apr 2013 12:49:02 -0400 Subject: [PATCH 222/226] GATKBAMIndex calls buffer.length() on every read. 
This is causing much pain. Optimized by getting the read of the file upon opening the index-file and using that instead. --- .../gatk/datasources/reads/GATKBAMIndex.java | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java index 57b409dcd..9a6b9b521 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java @@ -25,16 +25,17 @@ package org.broadinstitute.sting.gatk.datasources.reads; +import net.sf.samtools.Bin; +import net.sf.samtools.GATKBin; +import net.sf.samtools.GATKChunk; +import net.sf.samtools.LinearIndex; import net.sf.samtools.seekablestream.SeekableBufferedStream; import net.sf.samtools.seekablestream.SeekableFileStream; - -import net.sf.samtools.*; - import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import java.io.*; +import java.io.File; +import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.ArrayList; @@ -86,6 +87,7 @@ public class GATKBAMIndex { private SeekableFileStream fileStream; private SeekableBufferedStream bufferedStream; + private Long fileLength; public GATKBAMIndex(final File file) { mFile = file; @@ -307,6 +309,7 @@ public class GATKBAMIndex { try { fileStream = new SeekableFileStream(mFile); bufferedStream = new SeekableBufferedStream(fileStream,BUFFERED_STREAM_BUFFER_SIZE); + fileLength=bufferedStream.length(); } catch (IOException exc) { throw new ReviewedStingException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc); @@ -317,6 +320,7 @@ public class GATKBAMIndex { try { bufferedStream.close(); 
fileStream.close(); + fileLength=null; } catch (IOException exc) { throw new ReviewedStingException("Unable to close index file " + mFile, exc); @@ -368,7 +372,7 @@ public class GATKBAMIndex { // We have a rigid expectation here to read in exactly the number of bytes we've limited // our buffer to -- if there isn't enough data in the file, the index // must be truncated or otherwise corrupt: - if(bytesRequested > bufferedStream.length() - bufferedStream.position()){ + if(bytesRequested > fileLength - bufferedStream.position()){ throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + "It's likely that this file is truncated or corrupt -- " + "Please try re-indexing the corresponding BAM file.", From 20d3137928abb2ee2031e41f69d8e672b8df7c64 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sun, 28 Apr 2013 09:14:35 -0400 Subject: [PATCH 223/226] Fix for indel calling with UG in presence of reduced reads: When a read is long enough so that there's no reference context available, the reads gets clipped so that it falls again within the reference context range. However, the clipping is incorrect, as it makes the read end precisely at the end of the reference context coordinates. This might lead to a case where a read might span beyond the haplotype if one of the candidate haplotypes is shorter than the reference context (As in the case e.g. with deletions). In this case, the HMM will not work properly and the likelihood will be bad, since "insertions" at end of reads when haplotype is done will be penalized and likelihood will be much lower than it should. -- Added check to see if read spans beyond reference window MINUS padding and event length. This guarantees that read will always be contained in haplotype. -- Changed md5's that happen when long reads from old 454 data have their likelihoods changed because of the extra base clipping. 
--- .../walkers/indels/PairHMMIndelErrorModel.java | 16 ++++++++++------ ...fiedGenotyperIndelCallingIntegrationTest.java | 4 ++-- .../UnifiedGenotyperIntegrationTest.java | 4 ++-- ...iedGenotyperNormalCallingIntegrationTest.java | 2 +- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 93a015718..363f7a357 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -245,8 +245,13 @@ public class PairHMMIndelErrorModel { } } else { - final int refWindowStart = ref.getWindow().getStart(); - final int refWindowStop = ref.getWindow().getStop(); + // extra padding on candidate haplotypes to make sure reads are always strictly contained + // in them - a value of 1 will in theory do but we use a slightly higher one just for safety sake, mostly + // in case bases at edge of reads have lower quality. 
+ final int trailingBases = 3; + final int extraOffset = Math.abs(eventLength); + final int refWindowStart = ref.getWindow().getStart()+(trailingBases+extraOffset); + final int refWindowStop = ref.getWindow().getStop()-(trailingBases+extraOffset); if (DEBUG) { System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); @@ -255,10 +260,10 @@ public class PairHMMIndelErrorModel { GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); if (!read.isEmpty() && (read.getSoftEnd() > refWindowStop && read.getSoftStart() < refWindowStop)) - read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop()); + read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, refWindowStop); if (!read.isEmpty() && (read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart)) - read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart()); + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, refWindowStart); if (read.isEmpty()) continue; @@ -270,7 +275,6 @@ public class PairHMMIndelErrorModel { continue; // get bases of candidate haplotypes that overlap with reads - final int trailingBases = 3; final long readStart = read.getSoftStart(); final long readEnd = read.getSoftEnd(); @@ -286,7 +290,6 @@ public class PairHMMIndelErrorModel { final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ; final byte [] unclippedReadBases = read.getReadBases(); final byte [] unclippedReadQuals = read.getBaseQualities(); - final int extraOffset = Math.abs(eventLength); /** * Compute genomic locations that candidate haplotypes will span. 
@@ -313,6 +316,7 @@ public class PairHMMIndelErrorModel { startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely; } + // candidate haplotype cannot go beyond reference context if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index c33e89b99..52970d70d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("54e13f696f56eb742bf449ad11d0dc5f")); + Arrays.asList("8d9b8f8a1479322961c840e461b6dba8")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("e87f5c76661527ef7aa44e528fe19573")); + Arrays.asList("3d4d66cc253eac55f16e5b0a36f17d8d")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 253467a76..d55a923dc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -240,7 +240,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("d995b76adc3766889f7c2c88da14055c")); + Arrays.asList("31be725b2a7c15e9769391ad940c0587")); executeTest(String.format("test multiple technologies"), spec); } @@ -259,7 +259,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("9669e1643d22d5ad047b3941aeefd6db")); + Arrays.asList("dcc5cec42730567982def16da4a7f286")); executeTest(String.format("test calling with BAQ"), spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index a35cb1ecc..8256a8496 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - 
Arrays.asList("e3efd1917192ea743ac1e9958aa0a98f")); + Arrays.asList("a6c224235c21b4af816b1512eb0624df")); executeTest("test MultiSample Pilot1", spec); } From 73fcacbf1b88978bf542a8b31113a565b89c47c3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 30 Apr 2013 09:21:10 -0400 Subject: [PATCH 224/226] Change Long to long --- .../sting/gatk/datasources/reads/GATKBAMIndex.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java index 9a6b9b521..6c7a6c867 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java @@ -87,7 +87,7 @@ public class GATKBAMIndex { private SeekableFileStream fileStream; private SeekableBufferedStream bufferedStream; - private Long fileLength; + private long fileLength; public GATKBAMIndex(final File file) { mFile = file; @@ -320,7 +320,7 @@ public class GATKBAMIndex { try { bufferedStream.close(); fileStream.close(); - fileLength=null; + fileLength = -1; } catch (IOException exc) { throw new ReviewedStingException("Unable to close index file " + mFile, exc); From 58424e56be1bd59385466f96257befbd3a3ae9bf Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 29 Apr 2013 11:17:38 -0400 Subject: [PATCH 226/226] Setting the reduce reads count tag was all wrong in a previous commit; fixing. RR counts are represented as offsets from the first count, but that wasn't being done correctly when counts are adjusted on the fly. Also, we were triggering the expensive conversion and writing to binary tags even when we weren't going to write the read to disk. The code has been updated so that unconverted counts are passed to the GATKSAMRecord and it knows how to encode the tag correctly. 
Also, there are now methods to write to the reduced counts array without forcing the conversion (and methods that do force the conversion). Also: 1. counts are now maintained as ints whenever possible. Only the GATKSAMRecord knows about the internal encoding. 2. as discussed in meetings today, we updated the encoding so that it can now handle a range of values that extends to 255 instead of 127 (and is backwards compatible). 3. tests have been moved from SyntheticReadUnitTest to GATKSAMRecordUnitTest accordingly. --- .../reducereads/SyntheticRead.java | 49 ++---- .../haplotypecaller/DeBruijnAssembler.java | 2 +- .../reducereads/SyntheticReadUnitTest.java | 127 -------------- ...dGenotyperReducedReadsIntegrationTest.java | 4 +- .../broadinstitute/sting/utils/MathUtils.java | 2 +- .../sting/utils/clipping/ClippingOp.java | 2 +- .../sting/utils/pileup/PileupElement.java | 2 + .../sting/utils/sam/GATKSAMRecord.java | 158 +++++++++++------- .../sting/utils/sam/ReadUtils.java | 2 +- ...AlleleBiasedDownsamplingUtilsUnitTest.java | 2 +- .../utils/clipping/ReadClipperUnitTest.java | 6 +- .../utils/sam/GATKSAMRecordUnitTest.java | 72 +++++++- 12 files changed, 188 insertions(+), 240 deletions(-) delete mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java index ae4366768..9d16ea06f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -47,13 +47,11 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import com.google.java.contract.Requires; -import it.unimi.dsi.fastutil.bytes.ByteArrayList; import 
it.unimi.dsi.fastutil.objects.ObjectArrayList; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; @@ -89,12 +87,12 @@ public class SyntheticRead { // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint. private static class SingleBaseInfo { byte baseIndexOrdinal; // enum BaseIndex.ordinal - byte count; + int count; byte qual; byte insertionQual; byte deletionQual; - SingleBaseInfo(byte baseIndexOrdinal, byte count, byte qual, byte insertionQual, byte deletionQual) { + SingleBaseInfo(byte baseIndexOrdinal, int count, byte qual, byte insertionQual, byte deletionQual) { this.baseIndexOrdinal = baseIndexOrdinal; this.count = count; this.qual = qual; @@ -170,22 +168,6 @@ public class SyntheticRead { this.strandType = strandType; } - public SyntheticRead(ObjectArrayList bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) { - basesCountsQuals = new ObjectArrayList(bases.size()); - for (int i = 0; i < bases.size(); ++i) { - basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i))); - } - this.mappingQuality = mappingQuality; - this.header = header; - this.readGroupRecord = readGroupRecord; - this.contig = contig; - this.contigIndex = contigIndex; - this.readName = readName; - this.refStart = refStart; - this.hasIndelQualities = 
hasIndelQualities; - this.strandType = strandType; - } - /** * Easy access to keep adding to a running consensus that has already been * initialized with the correct read name and refStart @@ -194,7 +176,7 @@ public class SyntheticRead { * @param count number of reads with this base */ @Requires("count <= Byte.MAX_VALUE") - public void add(BaseIndex base, byte count, byte qual, byte insQual, byte delQual, double mappingQuality) { + public void add(BaseIndex base, int count, byte qual, byte insQual, byte delQual, double mappingQuality) { basesCountsQuals.add(new SingleBaseInfo(base.getOrdinalByte(), count, qual, insQual, delQual)); this.mappingQuality += mappingQuality; } @@ -285,22 +267,14 @@ public class SyntheticRead { }); } - protected byte [] convertBaseCounts() { - byte[] countsArray = convertVariableGivenBases(new SingleBaseInfoIterator() { - public Byte next() { - return it.next().count; - } - }); - - if (countsArray.length == 0) - throw new ReviewedStingException("Reduced read has counts array of length 0"); - - byte[] compressedCountsArray = new byte [countsArray.length]; - compressedCountsArray[0] = countsArray[0]; - for (int i = 1; i < countsArray.length; i++) - compressedCountsArray[i] = (byte) MathUtils.bound(countsArray[i] - compressedCountsArray[0], Byte.MIN_VALUE, Byte.MAX_VALUE); - - return compressedCountsArray; + protected int[] convertBaseCounts() { + int[] variableArray = new int[getReadLengthWithNoDeletions()]; + int i = 0; + for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { + if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte()) + variableArray[i++] = singleBaseInfo.count; + } + return variableArray; } private byte [] convertReadBases() { @@ -376,7 +350,6 @@ public class SyntheticRead { variableArray[i++] = count; } return variableArray; - } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index 0e1d49d81..12a4841bf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -277,7 +277,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { for( final GATKSAMRecord read : reads ) { final byte[] sequence = read.getReadBases(); final byte[] qualities = read.getBaseQualities(); - final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced + final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced if ( sequence.length > kmerLength + KMER_OVERLAP ) { int lastGood = -1; // the index of the last good base we've seen for( int end = 0; end < sequence.length; end++ ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java deleted file mode 100644 index 6886568e8..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java +++ /dev/null @@ -1,127 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.bytes.ByteArrayList; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.Test; - -public class SyntheticReadUnitTest extends BaseTest { - final SAMFileHeader artificialSAMHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1); - final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic"); - final String artificialContig = "1"; - final int artificialContigIndex = 0; - final String artificialReadName = "synth"; - final int artificialRefStart = 1; - final double artificialMappingQuality = 60; - -@Test -public void testBaseCounts() { - BaseIndex [] bases = new BaseIndex[] {BaseIndex.A,BaseIndex.A,BaseIndex.A,BaseIndex.A}; - byte[] quals = new byte[] {20, 20, 20, 20 }; - - TestRead [] testReads = new TestRead [] { - new TestRead(bases, quals, new byte[] {100, 100, 100, 101}, new byte [] {100, 0, 0, 1}), - new 
TestRead(bases, quals, new byte[] {1, 100, 100, 0}, new byte [] {1, 99, 99, -1}), - new TestRead(bases, quals, new byte[] {127, 100, 0, 1}, new byte [] {127, -27, -127, -126}), - new TestRead(bases, quals, new byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})}; - - for (TestRead testRead : testReads) { - SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, SyntheticRead.StrandType.STRANDLESS); - Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts()); - } -} - -private class TestRead { - BaseIndex[] bases; - byte[] quals; - byte[] insQuals; - byte[] delQuals; - byte[] counts; - byte[] expectedCounts; - - private TestRead(BaseIndex[] bases, byte[] quals, byte[] counts, byte[] expectedCounts) { - this.bases = bases; - this.quals = quals; - this.insQuals = quals; - this.delQuals = quals; - this.counts = counts; - this.expectedCounts = expectedCounts; - } - - public BaseIndex[] getBases() { - return bases; - } - - public byte[] getQuals() { - return quals; - } - - public byte[] getInsQuals() { - return insQuals; - } - - public byte[] getDelQuals() { - return delQuals; - } - - public byte[] getCounts() { - return counts; - } - - public byte[] getExpectedCounts() { - return expectedCounts; - } -} - -} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index 191bf3c27..f7ac87cda 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -63,13 +63,13 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("da3fd775c8add1f7962baabf06b7d372")); + Arrays.asList("e6565060b44a7804935973efcd56e596")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "76244ab1be60814f1412e6cd09e546cc"); + testReducedCalling("SNP", "ab776d74c41ce2b859e2b2466a76204a"); } @Test diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 1cc798e36..38c131bc6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -698,7 +698,7 @@ public class MathUtils { return maxI; } - public static byte arrayMax(final byte[] array) { + public static int arrayMax(final int[] array) { return array[maxElementIndex(array)]; } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index ccc847092..f51881e0b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -380,7 +380,7 @@ public class ClippingOp { } if (read.isReducedRead()) { - final byte[] reducedCounts = new byte[newLength]; + final int[] reducedCounts = new 
int[newLength]; System.arraycopy(read.getReducedReadCounts(), copyStart, reducedCounts, 0, newLength); hardClippedRead.setReducedReadCounts(reducedCounts); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index ba5dee34c..f4c673e61 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -322,6 +322,8 @@ public class PileupElement implements Comparable { * Adjusts the representative count of this pileup element. * Throws an exception if this element does not represent a reduced read. * + * See GATKSAMRecord.adjustReducedCount() for warnings on the permanency of this operation. + * * @param adjustmentFactor how much to adjust the representative count (can be positive or negative) */ public void adjustRepresentativeCount(final int adjustmentFactor) { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c0ea81563..c39245730 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.sam; import com.google.java.contract.Ensures; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; @@ -69,7 +70,7 @@ public class GATKSAMRecord extends BAMRecord { // the SAMRecord data we're caching private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; - private byte[] reducedReadCounts = null; + private int[] reducedReadCounts = null; private final static int 
UNINITIALIZED = -1; private int softStart = UNINITIALIZED; private int softEnd = UNINITIALIZED; @@ -360,18 +361,34 @@ public class GATKSAMRecord extends BAMRecord { * * as one might expect. * - * @return a byte[] holding the depth of the bases in this reduced read, or null if this isn't a reduced read + * @return a int[] holding the depth of the bases in this reduced read, or null if this isn't a reduced read */ - public byte[] getReducedReadCounts() { + public int[] getReducedReadCounts() { if ( ! retrievedReduceReadCounts ) { final byte[] tag = getByteArrayAttribute(REDUCED_READ_CONSENSUS_TAG); - if ( tag != null ) reducedReadCounts = decodeReadReadCounts(tag); + if ( tag != null ) reducedReadCounts = decodeReduceReadCounts(tag); retrievedReduceReadCounts = true; } return reducedReadCounts; } + /** + * The number of bases corresponding the i'th base of the reduced read. + * + * @param i the read based coordinate inside the read + * @return the number of bases corresponding to the i'th base of the reduced read + */ + public final int getReducedCount(final int i) { + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced"); + if ( i < 0 || i >= getReadBases().length ) + throw new IllegalArgumentException("illegal offset used when retrieving reduced counts: " + i); + + final int[] reducedCounts = getReducedReadCounts(); + return reducedCounts[i]; + } + /** * Is this read a reduced read? * @return true if yes @@ -381,65 +398,69 @@ public class GATKSAMRecord extends BAMRecord { } /** - * Set the reduced read counts tag for this record to counts - * - * WARNING -- this function assumes that counts is encoded as a difference in value count - * of count[i] - count[0]. It is not a straight counting of the bases in the read. + * Set the reduced read counts tag for this record. 
+ * Note that this method is slightly expensive as it converts to the correct reduced counts representation and sets the + * appropriate binary tag. If you want to modify the reduced count in place without triggering the permanent conversion + * internally, use the #setReducedCount() method. * * @param counts the count array */ - public void setReducedReadCountsTag(final byte[] counts) { - retrievedReduceReadCounts = false; - setAttribute(REDUCED_READ_CONSENSUS_TAG, counts); + public void setReducedReadCountsTag(final int[] counts) { + setAttribute(REDUCED_READ_CONSENSUS_TAG, encodeReduceReadCounts(counts)); + retrievedReduceReadCounts = false; // need to force new decode in case we had to handle precision problems with the counts + } + + /** + * @see #setReducedReadCountsTag() and uses the currently stored values of the internal array. + * Useful if you've been using #setReducedCount() to modify the reduced count and now want to trigger the expensive conversion. + */ + public void setReducedReadCountsTag() { + if ( !retrievedReduceReadCounts ) + throw new IllegalStateException("Trying to write the reduced reads counts using an uninitialized internal array of counts"); + setReducedReadCountsTag(reducedReadCounts); + } + + /** + * Sets the reduced read count corresponding the i'th base of the reduced read. + * + * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion + * and push that value into the read's binary tags, use #setReducedReadCountsTag(). 
+ * + * @param i the read based coordinate inside the read + * @param count the new count + */ + public final void setReducedCount(final int i, final int count) { + if ( count < 0 ) + throw new IllegalArgumentException("the reduced count cannot be set to a negative value"); + if ( !isReducedRead() ) + throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); + if ( i < 0 || i >= getReadBases().length ) + throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i); + + // force the initialization of the counts array if it hasn't happened yet + getReducedReadCounts()[i] = count; } /** * Set the reduced read counts tag for this record to counts * - * Note that this function does not set the REDUCED_READ_CONSENSUS_TAG value, it's purely for manipulating - * the underlying reduced reads count - * - * TODO -- this function needs to be fixed when the RR spec is set to 2.0 + * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion + * and push that value into the read's binary tags, use #setReducedReadCountsTag(). * * @param counts the count array */ - public void setReducedReadCounts(final byte[] counts) { - if ( counts.length != getReadBases().length ) throw new IllegalArgumentException("Reduced counts length " + counts.length + " != bases length " + getReadBases().length); + public void setReducedReadCounts(final int[] counts) { + if ( counts.length != getReadBases().length ) + throw new IllegalArgumentException("Reduced counts length " + counts.length + " != bases length " + getReadBases().length); retrievedReduceReadCounts = true; reducedReadCounts = counts; } - /** - * The number of bases corresponding the i'th base of the reduced read. 
- * - * @param i the read based coordinate inside the read - * @return the number of bases corresponding to the i'th base of the reduced read - */ - public final byte getReducedCount(final int i) { - if ( !isReducedRead() ) - throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced"); - final byte[] reducedCounts = getReducedReadCounts(); - return reducedCounts[i]; - } - /** * Sets the number of bases corresponding the i'th base of the reduced read. * - * @param i the read based coordinate inside the read - * @param count the new count - */ - public final void setReducedCount(final int i, final byte count) { - if ( count < 0 ) - throw new IllegalArgumentException("the reduced count cannot be set to a negative value"); - if ( !isReducedRead() ) - throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); - final byte[] reducedCounts = getReducedReadCounts(); - reducedCounts[i] = count; - setReducedReadCountsTag(reducedCounts); - } - - /** - * Sets the number of bases corresponding the i'th base of the reduced read. + * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion + * and push that value into the read's binary tags, use #setReducedReadCountsTag(). 
* * @param i the read based coordinate inside the read * @param adjustmentFactor how much to add/subtract to the current count @@ -447,9 +468,10 @@ public class GATKSAMRecord extends BAMRecord { public final void adjustReducedCount(final int i, final int adjustmentFactor) { if ( !isReducedRead() ) throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); - final byte[] reducedCounts = getReducedReadCounts(); - final byte newCount = (byte)(reducedCounts[i] + adjustmentFactor); - setReducedCount(i, newCount); + if ( i < 0 || i >= getReadBases().length ) + throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i); + + setReducedCount(i, getReducedReadCounts()[i] + adjustmentFactor); } /** @@ -462,21 +484,43 @@ public class GATKSAMRecord extends BAMRecord { * as one might expect. * * @param countsFromTag a non-null byte[] containing the tag encoded reduce reads counts - * @return a non-null byte[] containing the true depth values for the vector + * @return a non-null int[] containing the true depth values for the vector */ - private byte[] decodeReadReadCounts(final byte[] countsFromTag) { + protected static int[] decodeReduceReadCounts(final byte[] countsFromTag) { final int n = countsFromTag.length; - final byte[] result = new byte[n]; - final byte firstCount = countsFromTag[0]; + final int[] result = new int[n]; + final int firstCount = countsFromTag[0] & 0xff; // unsigned byte result[0] = firstCount; - for ( int i = 1; i < n; i++) { - final byte offsetCount = countsFromTag[i]; - result[i] = (byte) Math.min(firstCount + offsetCount, Byte.MAX_VALUE); + for ( int i = 1; i < n; i++ ) { + final int offsetCount = countsFromTag[i] & 0xff; // unsigned byte + result[i] = (firstCount + offsetCount) % 256; } return result; } + /** + * Converts int array from straight counts to the appropriate reduce reads representation in BAM (offset from first value) + * + * @param counts the counts array 
+ * @return non-null converted byte array + */ + protected static byte[] encodeReduceReadCounts(final int[] counts) { + if ( counts.length == 0 ) + throw new IllegalArgumentException("Trying to write a reduced read with a counts array of length 0"); + + final byte[] compressedCountsArray = new byte[counts.length]; + final int firstCount = (int) MathUtils.bound(counts[0], 0, 255); // we want an unsigned byte capped at max byte representation + compressedCountsArray[0] = (byte)firstCount; + for ( int i = 1; i < counts.length; i++ ) { + final int count = (int) MathUtils.bound(counts[i], 0, 255); + final byte offset = (byte)(count - firstCount + (count >= firstCount ? 0 : 256)); // unsigned byte + compressedCountsArray[i] = offset; + } + + return compressedCountsArray; + } + /////////////////////////////////////////////////////////////////////////////// // *** GATKSAMRecord specific methods ***// /////////////////////////////////////////////////////////////////////////////// @@ -694,7 +738,7 @@ public class GATKSAMRecord extends BAMRecord { emptyRead.setCigarString(""); emptyRead.setReadBases(new byte[0]); emptyRead.setBaseQualities(new byte[0]); - if ( read.isReducedRead() ) emptyRead.setReducedReadCounts(new byte[0]); + if ( read.isReducedRead() ) emptyRead.setReducedReadCounts(new int[0]); SAMReadGroupRecord samRG = read.getReadGroup(); emptyRead.clearAttributes(); @@ -728,7 +772,7 @@ public class GATKSAMRecord extends BAMRecord { /** * A caching version of ReadUtils.getAdaptorBoundary() * - * @see ReadUtils.getAdaptorBoundary(SAMRecord) for more information about the meaning of this function + * see #ReadUtils.getAdaptorBoundary(SAMRecord) for more information about the meaning of this function * * WARNING -- this function caches a value depending on the inferred insert size and alignment starts * and stops of this read and its mate. Changing these values in any way will invalidate the cached value. 
diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index c84e4245d..0eed80f3a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -62,7 +62,7 @@ public class ReadUtils { return 1; // compute mean representative read counts - final byte[] counts = read.getReducedReadCounts(); + final int[] counts = read.getReducedReadCounts(); return (int)Math.round((double)MathUtils.sum(counts)/counts.length); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java index 6314d4681..6e908a3bf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -150,7 +150,7 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { } if ( originalReducedCount > 0 ) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); - read.setReducedReadCountsTag(new byte[]{(byte)originalReducedCount}); + read.setReducedReadCountsTag(new int[]{originalReducedCount}); elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index e1cf82850..ae7c1e01c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -352,15 +352,15 @@ public class ReadClipperUnitTest extends BaseTest { 
@Test(enabled = true) public void testHardClipReducedRead() { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("10M"); - final byte[] counts = new byte[read.getReadLength()]; - for ( int i = 0; i < counts.length; i++ ) counts[i] = (byte)i; + final int[] counts = new int[read.getReadLength()]; + for ( int i = 0; i < counts.length; i++ ) counts[i] = i; read.setReducedReadCounts(counts); int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); int readLength = read.getReadLength(); for (int i = 0; i < readLength / 2; i++) { GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); - final byte[] expectedReducedCounts = Arrays.copyOfRange(counts, i + 1, readLength - i - 1); + final int[] expectedReducedCounts = Arrays.copyOfRange(counts, i + 1, readLength - i - 1); Assert.assertEquals(clippedRead.getReducedReadCounts(), expectedReducedCounts); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 06cdb366e..eefc92799 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -31,15 +31,18 @@ import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; +import java.util.List; + public class GATKSAMRecordUnitTest extends BaseTest { GATKSAMRecord read, reducedRead; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; - final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; - final private static byte[] REDUCED_READ_COUNTS_OFFSETS = new byte[]{10, 10, 
20, 30, -9}; // just the offsets + final private static int[] REDUCED_READ_COUNTS = new int[]{10, 20, 30, 40, 1}; @BeforeClass public void init() { @@ -52,11 +55,13 @@ public class GATKSAMRecordUnitTest extends BaseTest { reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); reducedRead.setReadBases(BASES.getBytes()); reducedRead.setBaseQualityString(QUALS); - reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS_OFFSETS); + reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); } @Test public void testReducedReads() { + reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); + Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); @@ -73,16 +78,17 @@ public class GATKSAMRecordUnitTest extends BaseTest { @Test(expectedExceptions = IllegalArgumentException.class) public void testSetReducedTagOnNormalRead() { - read.setReducedCount(0, (byte)2); + read.setReducedCount(0, 2); } @Test(expectedExceptions = IllegalArgumentException.class) - public void testSetReducedCountToNegativeNumber() { - reducedRead.setReducedCount(0, (byte)1000); + public void testAdjustReducedCountToNegativeNumber() { + reducedRead.setReducedCount(0, 1); + reducedRead.adjustReducedCount(0, -2); } @Test - public void testSetReducedTagOnReducedRead() { + public void testSetReducedCountOnReducedRead() { for (int i = 0; i < reducedRead.getReadLength(); i++) { final byte newCount = (byte)i; reducedRead.setReducedCount(i, newCount); @@ -96,8 +102,35 @@ public class GATKSAMRecordUnitTest extends BaseTest { } } + @Test + public void testReducedReadEncodeAndDecode() { + + // encode + byte[] encoded = GATKSAMRecord.encodeReduceReadCounts(REDUCED_READ_COUNTS); + + // decode + int[] decoded = GATKSAMRecord.decodeReduceReadCounts(encoded); + + // for the heck of it, let's encode and decode again! 
+ encoded = GATKSAMRecord.encodeReduceReadCounts(decoded); + decoded = GATKSAMRecord.decodeReduceReadCounts(encoded); + + for (int i = 0; i < decoded.length; i++) + Assert.assertEquals(decoded[i], REDUCED_READ_COUNTS[i]); + } + + @Test + public void testByteBoundsOnReducedTag() { + reducedRead.setReducedCount(0, 1000); + reducedRead.setReducedReadCountsTag(); + reducedRead.adjustReducedCount(0, -255); + Assert.assertEquals(reducedRead.getReducedCount(0), 0); + } + @Test public void testReducedReadPileupElement() { + reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); + PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0); PileupElement reducedreadp = LocusIteratorByState.createPileupForReadAndOffset(reducedRead, 0); @@ -167,9 +200,32 @@ public class GATKSAMRecordUnitTest extends BaseTest { @Test public void testGetReducedCountsIsCorrect() { - final byte[] counts = reducedRead.getReducedReadCounts(); + final int[] counts = reducedRead.getReducedReadCounts(); Assert.assertNotSame(counts, reducedRead.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG)); for ( int i = 0; i < counts.length; i++ ) Assert.assertEquals(counts[i], reducedRead.getReducedCount(i), "Reduced counts vector not equal to getReducedCount(i) at " + i); } + + @DataProvider(name = "ReducedReadCountConversionProvider") + public Object[][] ReducedReadCountConversionTestData() { + List tests = new ArrayList(); + + tests.add(new Object[]{new int[] {100, 100, 100, 101}, new byte[] {100, 0, 0, 1}}); + tests.add(new Object[]{new int[] {1, 100, 100, 0}, new byte[] {1, 99, 99, -1}}); + tests.add(new Object[]{new int[] {127, 100, 0, 1}, new byte[] {127, -27, -127, -126}}); + tests.add(new Object[]{new int[] {1, 127, 51, 126}, new byte[] {1, 126, 50, 125}}); + tests.add(new Object[]{new int[] {300, 127, 1, 255}, new byte[] {-1, -128, 2, 0}}); + tests.add(new Object[]{new int[] {1, 300, 51, 126}, new byte[] {1, -2, 50, 125}}); + + return tests.toArray(new Object[][]{}); + } + 
+ @Test(dataProvider = "ReducedReadCountConversionProvider", enabled = true) + public void reducedReadCountConversionTest(final int[] counts, final byte[] expectedConversion) { + + reducedRead.setReducedReadCountsTag(counts); + final byte[] actualConversion = reducedRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG); + for ( int i = 0; i < actualConversion.length; i++ ) + Assert.assertEquals(actualConversion[i], expectedConversion[i], "Conversion differs at position " + i + ": " + actualConversion[i] + " vs. " + expectedConversion[i]); + } }