diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java new file mode 100644 index 000000000..98a96fbfb --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -0,0 +1,56 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import java.util.HashMap; +import java.util.Map; + +/** + * An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base. + * + * @author Mauricio Carneiro + * @since 6/15/12 + */ +public class BaseAndQualsCounts extends BaseCounts { + private final Map sumInsertionQuals; + private final Map sumDeletionQuals; + + public BaseAndQualsCounts() { + super(); + this.sumInsertionQuals = new HashMap(); + this.sumDeletionQuals = new HashMap(); + for (BaseIndex i : BaseIndex.values()) { + sumInsertionQuals.put(i, 0L); + sumDeletionQuals.put(i, 0L); + } + } + + public void incr(byte base, byte baseQual, byte insQual, byte delQual) { + super.incr(base, baseQual); + BaseIndex i = BaseIndex.byteToBase(base); + if (i != null) { // do not allow Ns + sumInsertionQuals.put(i, sumInsertionQuals.get(i) + insQual); + sumDeletionQuals.put(i, sumDeletionQuals.get(i) + delQual); + } + } + + public void decr(byte base, byte baseQual, byte insQual, byte delQual) { + super.decr(base, baseQual); + BaseIndex i = BaseIndex.byteToBase(base); + if (i != null) { // do not allow Ns + sumInsertionQuals.put(i, sumInsertionQuals.get(i) - insQual); + sumDeletionQuals.put(i, sumDeletionQuals.get(i) - delQual); + } + } + + public byte averageInsertionQualsOfMostCommonBase() { + return getGenericAverageQualOfMostCommonBase(sumInsertionQuals); + } + + public byte averageDeletionQualsOfMostCommonBase() { + return 
getGenericAverageQualOfMostCommonBase(sumDeletionQuals); + } + + private byte getGenericAverageQualOfMostCommonBase(Map sumQuals) { + BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts()); + return (byte) (sumQuals.get(base) / getCount(base)); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java new file mode 100644 index 000000000..ed5802d38 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -0,0 +1,223 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.EnumMap; +import java.util.Map; + +/** + * An object to keep track of the number of occurences of each base and it's quality. + * + * User: depristo + * Date: 4/8/11 + * Time: 2:55 PM + */ + + public class BaseCounts { + public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N; + public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); + + private final Map counts; // keeps track of the base counts + private final Map sumQuals; // keeps track of the quals of each base + + public BaseCounts() { + counts = new EnumMap(BaseIndex.class); + sumQuals = new EnumMap(BaseIndex.class); + for (BaseIndex i : BaseIndex.values()) { + counts.put(i, 0); + sumQuals.put(i, 0L); + } + } + + public static BaseCounts createWithCounts(int[] countsACGT) { + BaseCounts baseCounts = new BaseCounts(); + baseCounts.counts.put(BaseIndex.A, countsACGT[0]); + baseCounts.counts.put(BaseIndex.C, countsACGT[1]); + baseCounts.counts.put(BaseIndex.G, countsACGT[2]); + baseCounts.counts.put(BaseIndex.T, countsACGT[3]); + return baseCounts; + } + + @Requires("other != null") + public void add(BaseCounts other) { + for (BaseIndex i : 
BaseIndex.values()) + counts.put(i, counts.get(i) + other.counts.get(i)); + } + + @Requires("other != null") + public void sub(BaseCounts other) { + for (BaseIndex i : BaseIndex.values()) + counts.put(i, counts.get(i) - other.counts.get(i)); + } + + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") + public void incr(byte base) { + BaseIndex i = BaseIndex.byteToBase(base); + if (i != null) // no Ns + counts.put(i, counts.get(i) + 1); + } + + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") + public void incr(byte base, byte qual) { + BaseIndex i = BaseIndex.byteToBase(base); + if (i != null) { // no Ns + counts.put(i, counts.get(i) + 1); + sumQuals.put(i, sumQuals.get(i) + qual); + } + } + + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") + public void decr(byte base) { + BaseIndex i = BaseIndex.byteToBase(base); + if (i != null) // no Ns + counts.put(i, counts.get(i) - 1); + } + + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") + public void decr(byte base, byte qual) { + BaseIndex i = BaseIndex.byteToBase(base); + if (i != null) { // no Ns + counts.put(i, counts.get(i) - 1); + sumQuals.put(i, sumQuals.get(i) - qual); + } + } + + + + @Ensures("result >= 0") + public int getCount(byte base) { + return getCount(BaseIndex.byteToBase(base)); + } + + @Ensures("result >= 0") + public int getCount(BaseIndex base) { + return counts.get(base); + } + + @Ensures("result >= 0") + public long getSumQuals(byte base) { + return getSumQuals(BaseIndex.byteToBase(base)); + } + + @Ensures("result >= 0") + public long getSumQuals(BaseIndex base) { + return sumQuals.get(base); + } + + @Ensures("result >= 0") + public byte averageQuals(byte base) { + return (byte) (getSumQuals(base) / getCount(base)); + } + + @Ensures("result >= 0") + public byte averageQuals(BaseIndex base) { + return (byte) (getSumQuals(base) / getCount(base)); + } + 
+ public byte baseWithMostCounts() { + return baseIndexWithMostCounts().getByte(); + } + + @Ensures("result >= 0") + public int countOfMostCommonBase() { + return counts.get(baseIndexWithMostCounts()); + } + + @Ensures("result >= 0") + public long sumQualsOfMostCommonBase() { + return sumQuals.get(baseIndexWithMostCounts()); + } + + @Ensures("result >= 0") + public byte averageQualsOfMostCommonBase() { + return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase()); + } + + + @Ensures("result >= 0") + public int totalCount() { + int sum = 0; + for (int c : counts.values()) + sum += c; + + return sum; + } + + /** + * Given a base , it returns the proportional count of this base compared to all other bases + * + * @param base + * @return the proportion of this base over all other bases + */ + @Ensures({"result >=0.0", "result<= 1.0"}) + public double baseCountProportion(byte base) { + return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount(); + } + + /** + * Given a base , it returns the proportional count of this base compared to all other bases + * + * @param baseIndex + * @return the proportion of this base over all other bases + */ + @Ensures({"result >=0.0", "result<= 1.0"}) + public double baseCountProportion(BaseIndex baseIndex) { + int total = totalCount(); + if (total == 0) + return 0.0; + return (double) counts.get(baseIndex) / totalCount(); + } + + + @Ensures("result != null") + public String toString() { + StringBuilder b = new StringBuilder(); + for (Map.Entry elt : counts.entrySet()) { + b.append(elt.toString()).append("=").append(elt.getValue()).append(","); + } + return b.toString(); + } + + @Ensures("result != null") + public BaseIndex baseIndexWithMostCounts() { + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (BaseIndex i : counts.keySet()) + if (counts.get(i) > counts.get(maxI)) + maxI = i; + return maxI; + } + + @Ensures("result != null") + public BaseIndex baseIndexWithMostCountsWithoutIndels() { + BaseIndex mostCounts 
= MAX_BASE_INDEX_WITH_NO_COUNTS; + for (BaseIndex index : counts.keySet()) + if (index.isNucleotide() && counts.get(index) > counts.get(mostCounts)) + mostCounts = index; + return mostCounts; + } + + @Ensures("result >=0") + public int totalCountWithoutIndels() { + int sum = 0; + for (BaseIndex index : counts.keySet()) + if (index.isNucleotide()) + sum += counts.get(index); + return sum; + } + + /** + * Calculates the proportional count of a base compared to all other bases except indels (I and D) + * + * @param index + * @return the proportion of this base over all other bases except indels + */ + @Requires("index.isNucleotide()") + @Ensures({"result >=0.0", "result<= 1.0"}) + public double baseCountProportionWithoutIndels(BaseIndex index) { + int total = totalCountWithoutIndels(); + if (total == 0) + return 0.0; + return (double) counts.get(index) / totalCountWithoutIndels(); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java new file mode 100644 index 000000000..a64db5874 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java @@ -0,0 +1,82 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +/** + * Simple byte / base index conversions + * + * + * @author carneiro + * @since 8/26/11 + */ +public enum BaseIndex { + A ( 'A', 0 ), + C ( 'C', 1 ), + G ( 'G', 2 ), + T ( 'T', 3 ), + D ( 'D', 4 ), + I ( 'I', 5 ), // insertion to the right of the base + N ( 'N', 6 ); + + final byte b; + final int index; + + public byte getByte() { return b; } + + private BaseIndex(char base, int index) { + this.b = (byte)base; + this.index = index; + } + + /** + * Converts a byte representation of a base to BaseIndex + * + * @param base the byte representation of the base + * @return the BaseIndex representation of the base; + */ + public 
static BaseIndex byteToBase(final byte base) { + switch (base) { + case 'A': + case 'a': + return A; + case 'C': + case 'c': + return C; + case 'G': + case 'g': + return G; + case 'T': + case 't': + return T; + case 'D': + case 'd': + case '-': + return D; + case 'I': + case 'i': + return I; + case 'N': + case 'n': + return N; + default: return null; + } + } + + /** + * Definition of a nucleotide for the BaseIndex is anything that has been read as a base + * by the machine (A,C,G,T), even if it couldn't tell which base it was, but it knows + * there is a base there (N). + * + * @return whether or not it is a nucleotide, given the definition above + */ + public boolean isNucleotide() { + return this == A || this == C || this == G || this == T || this == N; + } + + /** + * Whether or not this base is an insertion or a deletion + * + * @return true for I or D, false otherwise + */ + public boolean isIndel() { + return this == D || this == I; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAMWalker.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAMWalker.java new file mode 100644 index 000000000..3e07295e7 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompareBAMWalker.java @@ -0,0 +1,182 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; +import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; +import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import 
org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.HashMap; +import java.util.Map; + +/** + * Given two BAMs with different read groups, it compares them based on ReduceReads metrics. + *

+ * This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files. + *

+ *

Input

+ *

+ * Two BAM files (using -I) with different read group IDs + *

+ *

Output

+ *

+ * [Output description] + *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author carneiro + * @since 10/30/11 + */ + +@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) +public class CompareBAMWalker extends LocusWalker, CompareBAMWalker.TestResults> { + @Argument(required = true, shortName = "rr", fullName = "reduced_readgroup", doc = "The read group ID corresponding to the compressed BAM being tested") public String reducedReadGroupID; + @Argument(required = false, shortName = "teq", fullName = "test_equal_bases", doc = "Test if the bases marked as '=' are indeed ref bases.") public boolean TEST_EQUAL_BASES = false; + @Argument(required = false, shortName = "tbc", fullName = "test_base_counts", doc = "Test if the base counts tag in consensus reads are accurate.") public boolean TEST_BASE_COUNTS = false; + @Argument(required = false, shortName = "mbq", fullName = "min_base_qual", doc = "Minimum base quality to be considered.") public int MIN_BASE_QUAL = 20; + @Argument(required = false, shortName = "mmq", fullName = "min_mapping_qual", doc = "Minimum mapping quality to be considered.") public int MIN_MAPPING_QUAL = 20; + + + @Override + public Map map (RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + Map result = new HashMap(); + + if (TEST_EQUAL_BASES) result.put(TestName.EQUAL_BASES, testEqualBases(ref, context)); + if (TEST_BASE_COUNTS) result.put(TestName.BASE_COUNTS, testBaseCounts(ref, context)); + + return result; + } + + @Override + public TestResults reduceInit () { + TestResults sum = new TestResults(); // a fresh new TestResults object to sum up the results of every object passed by MAP. 
+ + if (TEST_EQUAL_BASES) sum.createTest(TestName.EQUAL_BASES); + if (TEST_BASE_COUNTS) sum.createTest(TestName.BASE_COUNTS); + + return sum; + } + + @Override + public TestResults reduce (Map mapResult, TestResults sum) { + for (TestName test : mapResult.keySet()) { + if (mapResult.get(test)) + sum.reportSuccess(test); + else + sum.reportFailed(test); + } + + return sum; + } + + public void onTraversalDone (TestResults finalResults) { + finalResults.report(); + } + + private boolean testEqualBases (ReferenceContext ref, AlignmentContext context) { + return true; + } + + private boolean testBaseCounts (ReferenceContext ref, AlignmentContext context) { + + return true; + } + + public enum TestName { + EQUAL_BASES ("testEqualBases"), + BASE_COUNTS ("testBaseCounts"); + + private String testName; + + TestName(String testName) { + this.testName = testName; + } + + public String getTestName() { + return testName; + } + } + + public class TestResults { + private Map testStats = new HashMap(); + + public void createTest (TestName test) { + testStats.put(test, new TestOutcome()); + } + + public void reportSuccess(TestName test) { + if (testStats.containsKey(test)) + testStats.get(test).incPassed(); + else + throw new ReviewedStingException("No such test: " + test); + } + + public void reportFailed(TestName test) { + if (testStats.containsKey(test)) + testStats.get(test).incFailed(); + else + throw new ReviewedStingException("No such test: " + test); + } + + public void report() { + System.out.println(); + System.out.println(String.format("%20s\tPASS\tFAIL", "")); + for (TestName test : testStats.keySet()) + System.out.println(String.format("%20s\t%d\t%d", test.getTestName(), testStats.get(test).getPassed(), testStats.get(test).getFailed())); + System.out.println(); + } + } + + private class TestOutcome { + private long passed; + private long failed; + + public long getPassed() { + return passed; + } + + public void incPassed() { + this.passed++; + } + + public long 
getFailed() { + return failed; + } + + public void incFailed() { + this.failed++; + } + } + + private BaseCounts getFilteredBaseCounts(AlignmentContext context) { + return getBaseCounts(context, MIN_BASE_QUAL, MIN_MAPPING_QUAL); + } + + private BaseCounts getFullBaseCounts(AlignmentContext context) { + return getBaseCounts(context, 3, 0); + } + + private BaseCounts getBaseCounts(AlignmentContext context, int mbq, int mmq) { + BaseCounts fullBaseCounts = new BaseCounts(); + for (String rg : context.getBasePileup().getReadGroups()) { + if (!rg.equals(reducedReadGroupID)) { + BaseCounts b = BaseCounts.createWithCounts(context.getBasePileup().getPileupForReadGroup(rg).getBaseAndMappingFilteredPileup(mbq, mmq).getBaseCounts()); + fullBaseCounts.add(b); + } + } + return fullBaseCounts; + } + + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java new file mode 100644 index 000000000..1d173b444 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 4/10/11 + * Time: 8:49 AM + * + * A general interface for ReadCompressors. Read compressors have the following semantics: + * + * The accept a stream of reads, in order, and after each added read returns a compressed stream + * of reads for emission. This stream of reads is a "reduced" representation of the total stream + * of reads. The actual compression approach is left up to the implementing class. + */ +public interface Compressor { + /** + * Adds the read to the compressor. The returned iteratable collection of + * reads represents the incremental compressed output. + * @param read the next uncompressed read in the input stream to the compressor + * @return an iterator over the incrementally available compressed reads + */ + @Requires("read != null") + @Ensures("result != null") + Iterable addAlignment(GATKSAMRecord read); + + /** + * Must be called after the last read has been added to finalize the compressor state + * and return the last compressed reads from the compressor. 
+ * @return an iterator over the final compressed reads of this compressor + */ + @Ensures("result != null") + Iterable close(); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java new file mode 100644 index 000000000..6b92046de --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -0,0 +1,204 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.LinkedList; + +/** + * The element that describes the header of the sliding window. + * + * Each site has a header element containing the counts of each base, it's reference based location and whether or + * not the site has insertions (to it's right). It also contains information about the bases that have been filtered + * out due to mapping or base quality. + */ +public class HeaderElement { + private BaseAndQualsCounts consensusBaseCounts; // How many A,C,G,T (and D's) are in this site. + private BaseAndQualsCounts filteredBaseCounts; // How many A,C,G,T (and D's) were filtered out in this site. 
+ private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right + private int nSoftClippedBases; // How many bases in this site came from soft clipped bases + private int location; // Genome location of this site (the sliding window knows which contig we're at + private LinkedList mappingQuality; // keeps the mapping quality of each read that contributed to this element (site) + + public int getLocation() { + return location; + } + + public BaseAndQualsCounts getFilteredBaseCounts() { + return filteredBaseCounts; + } + + public BaseAndQualsCounts getConsensusBaseCounts() { + return consensusBaseCounts; + } + + /** + * Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty + * filteredBaseCounts - 0 insertions to the right - empty mappingQuality list + * + * @param location the reference location for the new element + */ + public HeaderElement(int location) { + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new LinkedList()); + } + + /** + * Creates a new HeaderElement with all given parameters + * + * @param consensusBaseCounts the BaseCounts object for the running consensus synthetic read + * @param filteredBaseCounts the BaseCounts object for the filtered data synthetic read + * @param insertionsToTheRight number of insertions to the right of this HeaderElement + * @param location the reference location of this reference element + * @param mappingQuality the list of mapping quality values of all reads that contributed to this + * HeaderElement + */ + public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, LinkedList mappingQuality) { + this.consensusBaseCounts = consensusBaseCounts; + this.filteredBaseCounts = filteredBaseCounts; + this.insertionsToTheRight = insertionsToTheRight; + this.nSoftClippedBases = nSoftClippedBases; + this.location = 
location; + this.mappingQuality = mappingQuality; + } + + /** + * Whether or not the site represented by this HeaderElement is variant according to the definitions of variant + * by insertion, deletion and mismatches. + * + * @return true if site is variant by any definition. False otherwise. + */ + public boolean isVariant(double minVariantProportion, double minIndelProportion) { + return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); + } + + /** + * Adds a new base to the HeaderElement updating all counts accordingly + * + * @param base the base to add + * @param baseQual the base quality + * @param baseMappingQuality the mapping quality of the read this base belongs to + */ + public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { + if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual)) + consensusBaseCounts.incr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts + else + filteredBaseCounts.incr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts + + this.mappingQuality.add(baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site + nSoftClippedBases += isSoftClipped ? 
1 : 0; // if this base is softclipped, add the counter + } + + public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { + if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual)) + consensusBaseCounts.decr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts + else + filteredBaseCounts.decr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts + + this.mappingQuality.remove((Integer) baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site + nSoftClippedBases -= isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter + } + /** + * Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions + * should be added to the right of the element. + */ + public void addInsertionToTheRight() { + insertionsToTheRight++; + } + + /** + * Does this HeaderElement contain consensus data? + * + * @return whether or not this HeaderElement contains consensus data + */ + public boolean hasConsensusData() { + return consensusBaseCounts.totalCount() > 0; + } + + /** + * Does this HeaderElement contain filtered data? 
+ * + * @return whether or not this HeaderElement contains filtered data + */ + public boolean hasFilteredData() { + return filteredBaseCounts.totalCount() > 0; + } + + /** + * A HeaderElement is empty if it has no consensus or filtered data + * + * @return whether or not this HeaderElement has no data + */ + public boolean isEmpty() { + return (!hasFilteredData() && !hasConsensusData()); + } + + /** + * The RMS of the mapping qualities of all reads that contributed to this HeaderElement + * + * @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement + */ + public double getRMS() { + return MathUtils.rms(mappingQuality); + } + + /** + * removes an insertion from this element (if you removed a read that had an insertion) + */ + public void removeInsertionToTheRight() { + this.insertionsToTheRight--; + if (insertionsToTheRight < 0) + throw new ReviewedStingException("Removed too many insertions, header is now negative!"); + } + + /** + * Whether or not the HeaderElement is variant due to excess insertions + * + * @return whether or not the HeaderElement is variant due to excess insertions + */ + private boolean isVariantFromInsertions(double minIndelProportion) { + int numberOfBases = consensusBaseCounts.totalCount(); + if (numberOfBases == 0 && insertionsToTheRight > 0) + return true; // we only have insertions + else if (numberOfBases == 0) + return false; // we don't have anything + + // if we have bases and insertions, check the ratio + return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion; + } + + /** + * Whether or not the HeaderElement is variant due to excess deletions + * + * @return whether or not the HeaderElement is variant due to excess insertions + */ + private boolean isVariantFromDeletions(double minIndelProportion) { + return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion; + } + + /** + * Whether or not 
the HeaderElement is variant due to excess mismatches + * + * @return whether or not the HeaderElement is variant due to excess insertions + */ + private boolean isVariantFromMismatches(double minVariantProportion) { + BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels(); + double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon); + return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion); + } + + /** + * This handles the special case where we have more bases that came from soft clips than bases that came from + * normal bases by forcing it to become a variant region. We don't want a consensus based on too little information. + * + * @return true if we had more soft clipped bases contributing to this site than matches/mismatches. + */ + private boolean isVariantFromSoftClips() { + return nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases); + } + + private boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) { + return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual; + } + + +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java new file mode 100644 index 000000000..0ac9630c2 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -0,0 +1,81 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import net.sf.samtools.SAMFileHeader; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; +import 
org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.HashMap; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/** + * + * @author depristo + */ +public class MultiSampleCompressor implements Compressor { + protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class); + + protected Map compressorsPerSample = new HashMap(); + + public MultiSampleCompressor(SAMFileHeader header, + final int contextSize, + final int downsampleCoverage, + final int minMappingQuality, + final double minAltProportionToTriggerVariant, + final double minIndelProportionToTriggerVariant, + final int minBaseQual, + final ReduceReadsWalker.DownsampleStrategy downsampleStrategy) { + for ( String name : SampleUtils.getSAMFileSamples(header) ) { + compressorsPerSample.put(name, + new SingleSampleCompressor(name, contextSize, downsampleCoverage, + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + } + } + + @Override + public Iterable addAlignment(GATKSAMRecord read) { + String sample = read.getReadGroup().getSample(); + SingleSampleCompressor compressor = compressorsPerSample.get(sample); + if ( compressor == null ) + throw new ReviewedStingException("No compressor for sample " + sample); + return compressor.addAlignment(read); + } + + @Override + public Iterable close() { + SortedSet reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + for ( SingleSampleCompressor comp : compressorsPerSample.values() ) + for ( GATKSAMRecord read : comp.close() ) + reads.add(read); + return reads; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java new file mode 100644 index 000000000..593047504 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java @@ -0,0 +1,110 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import 
org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.LinkedList; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +/** + * This class implements a "read stash" that keeps reads always sorted in alignment order. Useful + * for read walkers that alter the alignment information of the incoming reads, but need to + * maintain the reads sorted for the reduce step. (e.g. ReduceReads) + */ + +public class ReduceReadsStash { + protected MultiSampleCompressor compressor; + SortedSet outOfOrderReads; + + /** + * Creates a stash with the default sorting order (read alignment) + * @param compressor the MultiSampleCompressor object to be used with this stash (for stash.close()) + */ + public ReduceReadsStash(MultiSampleCompressor compressor) { + this.compressor = compressor; + this.outOfOrderReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + } + + /** + * Get all reads before a given read (for processing) + * + * @param read the original read + * @return all reads that have alignment start before the original read. 
+ */ + public List getAllReadsBefore(GATKSAMRecord read) { + List result = new LinkedList(); + GATKSAMRecord newHead = null; + + for (GATKSAMRecord stashedRead : outOfOrderReads) { + if (ReadUtils.compareSAMRecords(stashedRead, read) <= 0) + result.add(stashedRead); + else { + newHead = stashedRead; + break; + } + } + + if (result.size() > 0) { + if (result.size() == outOfOrderReads.size()) + outOfOrderReads.clear(); + else + outOfOrderReads = new TreeSet(outOfOrderReads.tailSet(newHead)); + } + + return result; + } + + /** + * sends the read to the MultiSampleCompressor + * + * @param read the read to be compressed + * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) + */ + public Iterable compress(GATKSAMRecord read) { + return compressor.addAlignment(read); + } + + /** + * Add a read to the stash + * + * @param read any read + */ + public void add(GATKSAMRecord read) { + outOfOrderReads.add(read); + } + + /** + * Close the stash, processing all remaining reads in order + * + * @return a list of all the reads produced by the SlidingWindow machinery) + */ + public Iterable close() { + LinkedList result = new LinkedList(); + + // compress all the stashed reads (in order) + for (GATKSAMRecord read : outOfOrderReads) + for (GATKSAMRecord compressedRead : compressor.addAlignment(read)) + result.add(compressedRead); + + // output any remaining reads from the compressor + for (GATKSAMRecord read : compressor.close()) + result.add(read); + + return result; + } + + /** + * Useful debug functionality, outputs all elements in the stash + */ + public void print() { + int i = 1; + System.out.println("Stash Contents:"); + for (GATKSAMRecord read : outOfOrderReads) + System.out.println(String.format("%3d: %s %d %d", i++, read.getCigarString(), read.getAlignmentStart(), read.getAlignmentEnd())); + System.out.println(); + } + +} \ No newline at end of file diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsWalker.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsWalker.java new file mode 100644 index 000000000..5b4a660e8 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsWalker.java @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.util.SequenceUtil; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.PartitionBy; +import org.broadinstitute.sting.gatk.walkers.PartitionType; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocComparator; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Reduces the BAM file using read based compression that keeps only essential information for variant calling + *

+ *

+ * This walker will generated reduced versions of the BAM files that still follow the BAM spec + * and contain all the information necessary for the GSA variant calling pipeline. Some options + * allow you to tune in how much compression you want to achieve. The default values have been + * shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the + * savings in file size and performance of the downstream tools. + *

+ *

Input

+ *

+ * The BAM file to be compressed + *

+ *

+ *

Output

+ *

+ * The compressed (reduced) BAM file. + *

+ *

+ *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ReduceReads \
+ *   -I myData.bam \
+ *   -o myData.reduced.bam
+ * 
+ */ + +@PartitionBy(PartitionType.INTERVAL) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) +public class ReduceReadsWalker extends ReadWalker, ReduceReadsStash> { + + @Output + protected StingSAMFileWriter out; + + /** + * The number of bases to keep around mismatches (potential variation) + */ + @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false) + protected int contextSize = 10; + + /** + * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have + * mapping quality below this threshold will not be counted towards consensus, but are still counted + * towards variable regions. + */ + @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false) + protected int minMappingQuality = 20; + + /** + * The minimum base quality to be considered for the consensus synthetic read. Reads that have + * base quality below this threshold will not be counted towards consensus, but are still counted + * towards variable regions. + */ + @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false) + protected byte minBaseQual = 20; + + /** + * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality + * lower than this threshold will be hard clipped off before entering the reduce reads algorithm. + */ + @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false) + protected byte minTailQuality = 2; + + /** + * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals + * and read group). + */ + @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false) + protected boolean DONT_SIMPLIFY_READS = false; + + /** + * Do not hard clip adaptor sequences. 
Note: You don't have to turn this on for reads that are not mate paired. + * The program will behave correctly in those cases. + */ + @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false) + protected boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; + + /** + * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail + * quality. + */ + @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false) + protected boolean DONT_CLIP_LOW_QUAL_TAILS = false; + + /** + * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped + * base left by the aligner and use the high quality soft clipped bases in it's traversal algorithm to identify variant + * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual) + */ + @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false) + protected boolean DONT_USE_SOFTCLIPPED_BASES = false; + + /** + * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee + * uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather + * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. + */ + @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false) + protected boolean DONT_COMPRESS_READ_NAMES = false; + + /** + * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be + * considered consensus. 
+ */ + @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) + protected double minAltProportionToTriggerVariant = 0.05; + + /** + * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be + * considered consensus. + */ + @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false) + protected double minIndelProportionToTriggerVariant = 0.01; + + /** + * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this). + * A value of 0 turns downsampling off. + */ + @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false) + protected int downsampleCoverage = 0; + + @Hidden + @Argument(fullName = "", shortName = "dl", doc = "", required = false) + protected int debugLevel = 0; + + @Hidden + @Argument(fullName = "", shortName = "dr", doc = "", required = false) + protected String debugRead = ""; + + @Hidden + @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false) + protected DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; + + @Hidden + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) + private boolean NO_PG_TAG = false; + + public enum DownsampleStrategy { + Normal, + Adaptive + } + + protected int totalReads = 0; + int nCompressedReads = 0; + + HashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number). + Long nextReadNumber = 1L; // The next number to use for the compressed read name. + + SortedSet intervalList; + + private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag + + /** + * Basic generic initialization of the readNameHash and the intervalList. 
Output initialization + * is done at the reduceInit method + */ + @Override + public void initialize() { + super.initialize(); + GenomeAnalysisEngine toolkit = getToolkit(); + readNameHash = new HashMap(); // prepare the read name hash to keep track of what reads have had their read names compressed + intervalList = new TreeSet(new GenomeLocComparator()); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode + + if (toolkit.getIntervals() != null) + intervalList.addAll(toolkit.getIntervals()); + + if (!NO_PG_TAG) + Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME); + else + out.setPresorted(false); + } + + /** + * Takes in a read and prepares it for the SlidingWindow machinery by performing the + * following optional clipping operations: + * 1. Hard clip adaptor sequences + * 2. Hard clip low quality tails + * 3. Hard clip all remaining soft clipped bases + * 4. Hard clip read to the intervals in the interval list (this step may produce multiple reads) + * + * @param ref default map parameter + * @param read default map parameter + * @param metaDataTracker default map parameter + * @return a linked list with all the reads produced by the clipping operations + */ + @Override + public LinkedList map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + LinkedList mappedReads; + totalReads++; + if (!debugRead.isEmpty() && read.getReadName().contains(debugRead)) + System.out.println("Found debug read!"); + + if (debugLevel == 1) + System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd()); + + // we write the actual alignment starts to their respectiv alignment shift tags in the temporary + // attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file + read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, 
read.getAlignmentStart()); + read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd()); + + if (!DONT_SIMPLIFY_READS) + read.simplify(); // Clear all unnecessary attributes + if (!DONT_CLIP_ADAPTOR_SEQUENCES) + read = ReadClipper.hardClipAdaptorSequence(read); // Strip away adaptor sequences, if any. + if (!DONT_CLIP_LOW_QUAL_TAILS) + read = ReadClipper.hardClipLowQualEnds(read, minTailQuality); // Clip low quality tails + if (!isWholeGenome()) + mappedReads = hardClipReadToInterval(read); // Hard clip the remainder of the read to the desired interval + else { + mappedReads = new LinkedList(); + if (!read.isEmpty()) + mappedReads.add(read); + } + + if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) { + LinkedList tempList = new LinkedList(); + for (GATKSAMRecord mRead : mappedReads) { + GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual); + if (!clippedRead.isEmpty()) + tempList.add(clippedRead); + } + mappedReads = tempList; + } + + if (debugLevel == 1) + for (GATKSAMRecord mappedRead : mappedReads) + System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd()); + + return mappedReads; + + } + + /** + * Initializes the ReduceReadsStash that keeps track of all reads that are waiting to + * enter the SlidingWindow machinery. The stash makes sure reads are served in order + * even though map() may generate reads that are only supposed to enter the machinery + * in the future. 
+ * + * @return the empty stash + */ + @Override + public ReduceReadsStash reduceInit() { + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + } + + /** + * Takes the list of reads produced by map(), adds them to the stash (which keeps them sorted) and process + * all reads that come before the original read (the read that was passed to map) including the original + * read. This is where we send reads, in order, to the SlidingWindow machinery. + * + * @param mappedReads the list of reads sent by map + * @param stash the stash that keeps the reads in order for processing + * @return the stash with all reads that have not been processed yet + */ + public ReduceReadsStash reduce(LinkedList mappedReads, ReduceReadsStash stash) { + if (debugLevel == 1) + stash.print(); + + boolean firstRead = true; + for (GATKSAMRecord read : mappedReads) { + boolean originalRead = firstRead && isOriginalRead(mappedReads, read); + + if (read.getReadLength() == 0) + throw new ReviewedStingException("Empty read sent to reduce, this should never happen! 
" + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd()); + + if (originalRead) { + List readsReady = new LinkedList(); + readsReady.addAll(stash.getAllReadsBefore(read)); + readsReady.add(read); + + for (GATKSAMRecord readReady : readsReady) { + if (debugLevel == 1) + System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd()); + + for (GATKSAMRecord compressedRead : stash.compress(readReady)) + outputRead(compressedRead); + + } + } else + stash.add(read); + + firstRead = false; + } + + return stash; + } + + /** + * Now that now more reads will come, we process all the remaining reads in the stash, in order. + * + * @param stash the ReduceReadsStash with all unprocessed reads (from reduce) + */ + @Override + public void onTraversalDone(ReduceReadsStash stash) { + + // output any remaining reads in the compressor + for (GATKSAMRecord read : stash.close()) + outputRead(read); + } + + /** + * Hard clips away all parts of the read that doesn't agree with the intervals selected. + * + * Note: If read overlaps more than one interval, it will be hard clipped to all + * the intervals it overlaps with + * + * @param read the read to be hard clipped to the interval. 
+ * @return a shallow copy of the read hard clipped to the interval + */ + private LinkedList hardClipReadToInterval(GATKSAMRecord read) { + LinkedList clippedReads = new LinkedList(); + + GenomeLoc intervalOverlapped = null; // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list) + + boolean originalRead = true; // false if this is the right tail of the original read + boolean overlap; // keeps track of the interval that overlapped the original read + boolean doneClipping; // triggers an early exit if we are done clipping this read + + if (isWholeGenome()) + clippedReads.add(read); // if we don't have intervals (wgs) the read goes in unchanged + + for (GenomeLoc interval : intervalList) { + + if (read.isEmpty()) // nothing to do with an empty read (could have been fully clipped before) + break; + + GATKSAMRecord clippedRead = null; // this will hold the read clipped to the interval to be added in the end of the switch + + switch (ReadUtils.getReadAndIntervalOverlapType(read, interval)) { + case NO_OVERLAP_RIGHT: // no reads on this interval, check the next interval if this is the original read + if (!originalRead) // something went wrong if this is the tail of the read + throw new ReviewedStingException("tail of the read should never NO_OVERLAP_RIGHT the following interval. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); + overlap = false; + doneClipping = false; + break; + + + case NO_OVERLAP_HARDCLIPPED_RIGHT: // read used to overlap but got hard clipped and doesn't overlap anymore + if (originalRead) { + overlap = true; // effectively, we have found the read's location and now we are going to try and match it's tail (which happens to be the entire read). 
+ clippedRead = GATKSAMRecord.emptyRead(read); + } else + overlap = false; + + doneClipping = false; + break; + + case NO_OVERLAP_CONTIG: // read is in a different contig + if (originalRead) { // the original read can be in a bigger contig, but not on a smaller one. + if (read.getReferenceIndex() < interval.getContigIndex()) + throw new ReviewedStingException("read is behind interval list. (contig) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); + else { + overlap = false; + doneClipping = false; + } + } // tail read CANNOT be in a different contig. + else { + if (read.getReferenceIndex() < interval.getContigIndex()) { + overlap = false; + doneClipping = true; + } else + throw new ReviewedStingException("Tail read is in bigger contig than interval traversal. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); + + } + break; + + case NO_OVERLAP_LEFT: + if (originalRead) // if this is the first read this should never happen. + throw new ReviewedStingException("original read cannot be behind the first interval. (position) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); + + overlap = false; + doneClipping = true; + break; + + case NO_OVERLAP_HARDCLIPPED_LEFT: // read used to overlap but got hard clipped and doesn't overlap anymore + overlap = originalRead; // if this is the original read, we should not advance the interval list, the original overlap was here. 
+ doneClipping = true; + break; + + case OVERLAP_LEFT: // clip the left tail of the read + clippedRead = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStart() - 1); + + overlap = true; + doneClipping = true; + break; + + case OVERLAP_RIGHT: // clip the right tail of the read and try to match it to the next interval + clippedRead = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, interval.getStop() + 1); + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop()); + + overlap = true; + doneClipping = false; + break; + + case OVERLAP_LEFT_AND_RIGHT: // clip both left and right ends of the read + clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, interval.getStart() - 1, interval.getStop() + 1); + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop()); + + overlap = true; + doneClipping = false; + break; + + case OVERLAP_CONTAINED: // don't do anything to the read + clippedRead = read; + + overlap = true; + doneClipping = true; + break; + + default: + throw new ReviewedStingException("interval overlap returned an unknown / unhandled state. 
If new state was added to intervalOverlap, it should be handled by hardClipReadToInterval."); + } + + if (overlap && originalRead) + intervalOverlapped = interval; + + if (clippedRead != null) { + originalRead = false; + + if (!clippedRead.isEmpty()) + clippedReads.add(clippedRead); // if the read overlaps the interval entirely within a deletion, it will be entirely clipped off + } + + if (doneClipping) + break; + } + + if (intervalOverlapped != null) + intervalList = intervalList.tailSet(intervalOverlapped); + + return clippedReads; + } + + /** + * Compresses the read name and adds it to output BAM file (reduced BAM) + * after performing some quality control + * + * @param read any read + */ + private void outputRead(GATKSAMRecord read) { + if (debugLevel == 2) { + checkForHighMismatch(read); + checkCigar(read); + } + + if (read.isReducedRead()) + nCompressedReads++; + else { + int originalAlignmentStart = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT); + int originalAlignmentEnd = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT); + + int startShift = originalAlignmentStart - read.getUnclippedStart(); // we annotate the shifts for better compression + int endShift = read.getUnclippedEnd() - originalAlignmentEnd; // we annotate the shifts for better compression + + if (startShift > 0) + read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start) + if (endShift > 0) + read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end) + + totalReads++; + } + + if (debugLevel == 1) + System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd()); + +// if 
(!DONT_USE_SOFTCLIPPED_BASES) +// reSoftClipBases(read); + + if (!DONT_COMPRESS_READ_NAMES) + compressReadName(read); + + out.addAlignment(read); + } + + private void reSoftClipBases(GATKSAMRecord read) { + Integer left = (Integer) read.getTemporaryAttribute("SL"); + Integer right = (Integer) read.getTemporaryAttribute("SR"); + if (left != null || right != null) { + Cigar newCigar = new Cigar(); + for (CigarElement element : read.getCigar().getCigarElements()) { + newCigar.add(new CigarElement(element.getLength(), element.getOperator())); + } + + if (left != null) { + newCigar = updateFirstSoftClipCigarElement(left, newCigar); + read.setAlignmentStart(read.getAlignmentStart() + left); + } + + if (right != null) { + Cigar invertedCigar = invertCigar(newCigar); + newCigar = invertCigar(updateFirstSoftClipCigarElement(right, invertedCigar)); + } + read.setCigar(newCigar); + } + } + + /** + * Facility routine to revert the first element of a Cigar string (skipping hard clips) into a soft-clip. + * To be used on both ends if provided a flipped Cigar + * + * @param softClipSize the length of the soft clipped element to add + * @param originalCigar the original Cigar string + * @return a new Cigar object with the soft clips added + */ + private Cigar updateFirstSoftClipCigarElement (int softClipSize, Cigar originalCigar) { + Cigar result = new Cigar(); + CigarElement leftElement = new CigarElement(softClipSize, CigarOperator.S); + boolean updated = false; + for (CigarElement element : originalCigar.getCigarElements()) { + if (!updated && element.getOperator() == CigarOperator.M) { + result.add(leftElement); + int newLength = element.getLength() - softClipSize; + if (newLength > 0) + result.add(new CigarElement(newLength, CigarOperator.M)); + updated = true; + } + else + result.add(element); + } + return result; + } + + /** + * Given a cigar string, returns the inverted cigar string. 
+ * + * @param cigar the original cigar + * @return the inverted cigar + */ + private Cigar invertCigar(Cigar cigar) { + Stack stack = new Stack(); + for (CigarElement e : cigar.getCigarElements()) + stack.push(e); + Cigar inverted = new Cigar(); + while (!stack.empty()) { + inverted.add(stack.pop()); + } + return inverted; + } + + + /** + * Quality control procedure that checks if the consensus reads contains too many + * mismatches with the reference. This should never happen and is a good trigger for + * errors with the algorithm. + * + * @param read any read + */ + private void checkForHighMismatch(GATKSAMRecord read) { + final int start = read.getAlignmentStart(); + final int stop = read.getAlignmentEnd(); + final byte[] ref = getToolkit().getReferenceDataSource().getReference().getSubsequenceAt(read.getReferenceName(), start, stop).getBases(); + final int nm = SequenceUtil.countMismatches(read, ref, start - 1); + final int readLen = read.getReadLength(); + final double nmFraction = nm / (1.0 * readLen); + if (nmFraction > 0.4 && readLen > 20 && read.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG) != null && read.getReadName().startsWith("Consensus")) + throw new ReviewedStingException("BUG: High mismatch fraction found in read " + read.getReadName() + " position: " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd()); + } + + private void checkCigar (GATKSAMRecord read) { + if (read.getCigar().isValid(null, -1) != null) { + throw new ReviewedStingException("BUG: cigar string is not valid: " + read.getCigarString()); + } + + } + + + /** + * Compresses the read name using the readNameHash if we have already compressed + * this read name before. + * + * @param read any read + */ + private void compressReadName(GATKSAMRecord read) { + String name = read.getReadName(); + String compressedName = read.isReducedRead() ? 
"C" : ""; + if (readNameHash.containsKey(name)) + compressedName += readNameHash.get(name).toString(); + else { + readNameHash.put(name, nextReadNumber); + compressedName += nextReadNumber.toString(); + nextReadNumber++; + } + + read.setReadName(compressedName); + } + + /** + * Returns true if the read is the original read that went through map(). + * + * This is important to know so we can decide what reads to pull from the stash. Only reads that came before the original read should be pulled. + * + * @param list the list + * @param read the read + * @return Returns true if the read is the original read that went through map(). + */ + private boolean isOriginalRead(LinkedList list, GATKSAMRecord read) { + return isWholeGenome() || (list.getFirst().equals(read) && ReadUtils.getReadAndIntervalOverlapType(read, intervalList.first()) == ReadUtils.ReadAndIntervalOverlap.OVERLAP_CONTAINED); + } + + /** + * Checks whether or not the intervalList is empty, meaning we're running in WGS mode. + * + * @return whether or not we're running in WGS mode. 
+ */ + private boolean isWholeGenome() { + return intervalList.isEmpty(); + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java new file mode 100644 index 000000000..fd0dfa1ff --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -0,0 +1,83 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.TreeSet; + +/** + * + * @author depristo + * @version 0.1 + */ +public class SingleSampleCompressor implements Compressor { + protected static final Logger logger = Logger.getLogger(SingleSampleCompressor.class); + + protected final int contextSize; + protected final int downsampleCoverage; + protected int minMappingQuality; + protected int slidingWindowCounter; + + protected final String sampleName; + + protected SlidingWindow slidingWindow; + protected double minAltProportionToTriggerVariant; + protected double minIndelProportionToTriggerVariant; + protected int minBaseQual; + + protected ReduceReadsWalker.DownsampleStrategy downsampleStrategy; + + public SingleSampleCompressor(final String sampleName, + final int contextSize, + final int downsampleCoverage, + final int minMappingQuality, + final double minAltProportionToTriggerVariant, + final double minIndelProportionToTriggerVariant, + final int minBaseQual, + final ReduceReadsWalker.DownsampleStrategy downsampleStrategy) { + this.sampleName = sampleName; + this.contextSize = contextSize; + this.downsampleCoverage = downsampleCoverage; + this.minMappingQuality = minMappingQuality; + this.slidingWindowCounter = 0; + 
this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant; + this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; + this.minBaseQual = minBaseQual; + this.downsampleStrategy = downsampleStrategy; + } + + /** + * {@inheritDoc} + */ + @Override + public Iterable addAlignment( GATKSAMRecord read ) { + TreeSet result = new TreeSet(new AlignmentStartWithNoTiesComparator()); + int readOriginalStart = read.getUnclippedStart(); + + // create a new window if: + if ((slidingWindow != null) && + ( ( read.getReferenceIndex() != slidingWindow.getContigIndex() ) || // this is a brand new contig + (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window + + // close the current sliding window + result.addAll(slidingWindow.close()); + slidingWindow = null; // so we create a new one on the next if + } + + if ( slidingWindow == null) { // this is the first read + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); + slidingWindowCounter++; + } + + result.addAll(slidingWindow.addRead(read)); + return result; + } + + @Override + public Iterable close() { + return (slidingWindow != null) ?
slidingWindow.close() : new TreeSet(); + } + +} + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java new file mode 100644 index 000000000..68dfd041b --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -0,0 +1,713 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.gatk.downsampling.FractionalDownsampler; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; + +/** + * Created by IntelliJ IDEA. 
+ * User: roger + * Date: 8/3/11 + * Time: 2:24 PM + */ +public class SlidingWindow { + + // Sliding Window data + final private LinkedList readsInWindow; + final private LinkedList windowHeader; + protected int contextSize; // the largest context size (between mismatches and indels) + protected int stopLocation; + protected String contig; + protected int contigIndex; + protected SAMFileHeader header; + protected GATKSAMReadGroupRecord readGroupAttribute; + protected int downsampleCoverage; + + // Running consensus data + protected SyntheticRead runningConsensus; + protected int consensusCounter; + protected String consensusReadName; + + // Filtered Data Consensus data + protected SyntheticRead filteredDataConsensus; + protected int filteredDataConsensusCounter; + protected String filteredDataReadName; + + + // Additional parameters + protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches + protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions + protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value + protected int MIN_MAPPING_QUALITY; + + protected ReduceReadsWalker.DownsampleStrategy downsampleStrategy; + private boolean hasIndelQualities; + + /** + * The types of synthetic reads to use in the finalizeAndAdd method + */ + private enum ConsensusType { + CONSENSUS, + FILTERED, + BOTH + } + + public int getStopLocation() { + return stopLocation; + } + + public String getContig() { + return contig; + } + + public int getContigIndex() { + return contigIndex; + } + + public int getStartLocation() { + return windowHeader.isEmpty() ? 
-1 : windowHeader.peek().getLocation(); + } + + + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReadsWalker.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) { + this.stopLocation = -1; + this.contextSize = contextSize; + this.downsampleCoverage = downsampleCoverage; + + this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant; + this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant; + this.MIN_BASE_QUAL_TO_COUNT = minBaseQual; + this.MIN_MAPPING_QUALITY = minMappingQuality; + + this.windowHeader = new LinkedList(); + this.readsInWindow = new LinkedList(); + + this.contig = contig; + this.contigIndex = contigIndex; + this.header = header; + this.readGroupAttribute = readGroupAttribute; + + this.consensusCounter = 0; + this.consensusReadName = "Consensus-" + windowNumber + "-"; + + this.filteredDataConsensusCounter = 0; + this.filteredDataReadName = "Filtered-" + windowNumber + "-"; + + this.runningConsensus = null; + this.filteredDataConsensus = null; + + this.downsampleStrategy = downsampleStrategy; + this.hasIndelQualities = hasIndelQualities; + } + + /** + * Add a read to the sliding window and slides the window accordingly. + * + * Reads are assumed to be in order, therefore, when a read is added the sliding window can + * assume that no more reads will affect read.getUnclippedStart() - contextSizeMismatches. The window + * slides forward to that position and returns all reads that may have been finalized in the + * sliding process. + * + * @param read the read + * @return a list of reads that have been finished by sliding the window. 
+ */ + public List addRead(GATKSAMRecord read) { + updateHeaderCounts(read, false); // update the window header counts + readsInWindow.add(read); // add read to sliding reads + return slideWindow(read.getUnclippedStart()); + } + + /** + * returns the next complete or incomplete variant region between 'from' (inclusive) and 'to' (exclusive) + * + * @param from beginning window header index of the search window (inclusive) + * @param to end window header index of the search window (exclusive) + * @param variantSite boolean array with true marking variant regions + * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. + */ + private Pair getNextVariantRegion(int from, int to, boolean[] variantSite) { + boolean foundStart = false; + int variantRegionStartIndex = 0; + for (int i=from; i(variantRegionStartIndex, i-1)); + } + } + return (foundStart) ? new Pair(variantRegionStartIndex, -1) : null; + } + + /** + * Creates a list with all the complete and incomplete variant regions within 'from' (inclusive) and 'to' (exclusive) + * + * @param from beginning window header index of the search window (inclusive) + * @param to end window header index of the search window (exclusive) + * @param variantSite boolean array with true marking variant regions + * @return a list with start/stops of variant regions following getNextVariantRegion description + */ + private List> getAllVariantRegions(int from, int to, boolean[] variantSite) { + List> regions = new LinkedList>(); + int index = from; + while(index < to) { + Pair result = getNextVariantRegion(index, to, variantSite); + if (result == null) + break; + + regions.add(result); + if (result.getSecond() < 0) + break; + index = result.getSecond() + 1; + } + return regions; + } + + + /** + * Determines if the window can be slid given the new incoming read. 
+ * + * We check from the start of the window to the (unclipped) start of the new incoming read if there + * is any variant. + * If there are variant sites, we check if it's time to close the variant region. + * + * @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start! + * @return all reads that have fallen to the left of the sliding window after the slide + */ + protected List slideWindow(int incomingReadUnclippedStart) { + List finalizedReads = new LinkedList(); + + if (incomingReadUnclippedStart - contextSize > getStartLocation()) { + int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation(); + boolean[] variantSite = markSites(getStartLocation() + readStartHeaderIndex); + int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) + + List> regions = getAllVariantRegions(0, breakpoint, variantSite); + finalizedReads = closeVariantRegions(regions, false); + + List readsToRemove = new LinkedList(); + for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!) 
+ if (read.getAlignmentEnd() < getStartLocation()) { + readsToRemove.add(read); + } + } + for (GATKSAMRecord read : readsToRemove) { + readsInWindow.remove(read); + } + } + + return finalizedReads; + } + + /** + * returns an array marked with variant and non-variant regions (it uses + * markVariantRegions to make the marks) + * + * @param stop check the window from start to stop (not-inclusive) + * @return a boolean array with 'true' marking variant regions and false marking consensus sites + */ + protected boolean[] markSites(int stop) { + + boolean[] markedSites = new boolean[stop - getStartLocation() + contextSize + 1]; + + Iterator headerElementIterator = windowHeader.iterator(); + for (int i = getStartLocation(); i < stop; i++) { + if (headerElementIterator.hasNext()) { + HeaderElement headerElement = headerElementIterator.next(); + + if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) + markVariantRegion(markedSites, i - getStartLocation()); + + } else + break; + } + return markedSites; + } + + /** + * Marks the sites around the variant site (as true) + * + * @param markedSites the boolean array to bear the marks + * @param variantSiteLocation the location where a variant site was found + */ + protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) { + int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize; + int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? 
markedSites.length : variantSiteLocation + contextSize + 1; + for (int i = from; i < to; i++) + markedSites[i] = true; + } + + /** + * Adds bases to the running consensus or filtered data accordingly + * + * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus + * + * @param start the first header index to add to consensus + * @param end the first header index NOT TO add to consensus + * @return a list of consensus reads generated by this call. Empty list if no consensus was generated. + */ + protected List addToSyntheticReads(int start, int end) { + LinkedList reads = new LinkedList(); + if (start < end) { + + ListIterator headerElementIterator = windowHeader.listIterator(start); + + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end)); + + HeaderElement headerElement = headerElementIterator.next(); + + if (headerElement.hasConsensusData()) { + reads.addAll(finalizeAndAdd(ConsensusType.FILTERED)); + + int endOfConsensus = findNextNonConsensusElement(start, end); + addToRunningConsensus(start, endOfConsensus); + + if (endOfConsensus <= start) + throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start)); + + reads.addAll(addToSyntheticReads(endOfConsensus, end)); + } else if (headerElement.hasFilteredData()) { + reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS)); + + int endOfFilteredData = findNextNonFilteredDataElement(start, end); + addToFilteredData(start, endOfFilteredData); + + if (endOfFilteredData <= start) + throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start)); + + reads.addAll(addToSyntheticReads(endOfFilteredData, end)); + } else if (headerElement.isEmpty()) { + 
reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); + + int endOfEmptyData = findNextNonEmptyElement(start, end); + + if (endOfEmptyData <= start) + throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start)); + + reads.addAll(addToSyntheticReads(endOfEmptyData, end)); + } else + throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start)); + + } + + return reads; + } + + /** + * Finalizes one or more synthetic reads. + * + * @param type the synthetic reads you want to close + * @return the GATKSAMRecords generated by finalizing the synthetic reads + */ + private List finalizeAndAdd(ConsensusType type) { + GATKSAMRecord read = null; + List list = new LinkedList(); + + switch (type) { + case CONSENSUS: + read = finalizeRunningConsensus(); + break; + case FILTERED: + read = finalizeFilteredDataConsensus(); + break; + case BOTH: + read = finalizeRunningConsensus(); + if (read != null) list.add(read); + read = finalizeFilteredDataConsensus(); + } + if (read != null) + list.add(read); + + return list; + } + + /** + * Looks for the next position without consensus data + * + * @param start beginning of the filtered region + * @param upTo limit to search for another consensus element + * @return next position with consensus data or empty + */ + private int findNextNonConsensusElement(int start, int upTo) { + Iterator headerElementIterator = windowHeader.listIterator(start); + int index = start; + while (index < upTo) { + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException("There are no more header elements in this window"); + + HeaderElement headerElement = headerElementIterator.next(); + if (!headerElement.hasConsensusData()) + break; + index++; + } + return index; + } + + /** + * Looks for the next position without filtered data + * + * @param start beginning of the region + * @param upTo limit to search for + * @return next 
position with no filtered data + */ + private int findNextNonFilteredDataElement(int start, int upTo) { + Iterator headerElementIterator = windowHeader.listIterator(start); + int index = start; + while (index < upTo) { + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException("There are no more header elements in this window"); + + HeaderElement headerElement = headerElementIterator.next(); + if (!headerElement.hasFilteredData() || headerElement.hasConsensusData()) + break; + index++; + } + return index; + } + + /** + * Looks for the next non-empty header element + * + * @param start beginning of the region + * @param upTo limit to search for + * @return next position with non-empty element + */ + private int findNextNonEmptyElement(int start, int upTo) { + ListIterator headerElementIterator = windowHeader.listIterator(start); + int index = start; + while (index < upTo) { + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException("There are no more header elements in this window"); + + HeaderElement headerElement = headerElementIterator.next(); + if (!headerElement.isEmpty()) + break; + index++; + } + return index; + } + + + /** + * Adds bases to the filtered data synthetic read. + * + * Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData + * bases. 
+ * + * @param start the first header index to add to consensus + * @param end the first header index NOT TO add to consensus + */ + private void addToFilteredData(int start, int end) { + if (filteredDataConsensus == null) + filteredDataConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities); + + ListIterator headerElementIterator = windowHeader.listIterator(start); + for (int index = start; index < end; index++) { + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist"); + + HeaderElement headerElement = headerElementIterator.next(); + if (headerElement.hasConsensusData()) + throw new ReviewedStingException("Found consensus data inside region to add to filtered data."); + + if (!headerElement.hasFilteredData()) + throw new ReviewedStingException("No filtered data in " + index); + + genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS()); + } + } + + /** + * Adds bases to the running consensus synthetic read. + * + * Different from the addToSyntheticReads method, this method assumes a contiguous sequence of consensus + * bases.
+ * + * @param start the first header index to add to consensus + * @param end the first header index NOT TO add to consensus + */ + private void addToRunningConsensus(int start, int end) { + if (runningConsensus == null) + runningConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities); + + Iterator headerElementIterator = windowHeader.listIterator(start); + for (int index = start; index < end; index++) { + if (!headerElementIterator.hasNext()) + throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist"); + + HeaderElement headerElement = headerElementIterator.next(); + if (!headerElement.hasConsensusData()) + throw new ReviewedStingException("No CONSENSUS data in " + index); + + genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS()); + } + } + + /** + * Generic accessor to add base and qualities to a synthetic read + * + * @param syntheticRead the synthetic read to add to + * @param baseCounts the base counts object in the header element + * @param rms the rms mapping quality in the header element + */ + private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) { + BaseIndex base = baseCounts.baseIndexWithMostCounts(); + byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE); + byte qual = baseCounts.averageQualsOfMostCommonBase(); + byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase(); + byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase(); + syntheticRead.add(base, count, qual, insQual, delQual, rms); + } + + /** + * Finalizes a variant region, any adjacent synthetic reads. 
+ * + * @param start the first window header index in the variant region (inclusive) + * @param stop the last window header index of the variant region (inclusive) + * @return all reads contained in the variant region plus any adjacent synthetic reads + */ + @Requires("start <= stop") + protected List closeVariantRegion(int start, int stop) { + List allReads = new LinkedList(); + + int refStart = windowHeader.get(start).getLocation(); // All operations are reference based, not read based + int refStop = windowHeader.get(stop).getLocation(); + + for (GATKSAMRecord read : readsInWindow) { // Keep all reads that overlap the variant region + if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) { + allReads.add(read); + updateHeaderCounts(read, true); // Remove this read from the window header entirely + } + } + + List result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads; + result.addAll(addToSyntheticReads(0, start)); + result.addAll(finalizeAndAdd(ConsensusType.BOTH)); + + for (GATKSAMRecord read : result) { + readsInWindow.remove(read); // todo -- not optimal, but needs to be done so the next region doesn't try to remove the same reads from the header counts. + } + + return result; // finalized reads will be downsampled if necessary + } + + + private List closeVariantRegions(List> regions, boolean forceClose) { + List allReads = new LinkedList(); + if (!regions.isEmpty()) { + int lastStop = -1; + for (Pair region : regions) { + int start = region.getFirst(); + int stop = region.getSecond(); + if (stop < 0 && forceClose) + stop = windowHeader.size() - 1; + if (stop >= 0) { + allReads.addAll(closeVariantRegion(start, stop)); + lastStop = stop; + } + } + for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. 
(we keep the last element in case the following element had a read that started with insertion) + windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! + } + return allReads; + } + + /** + * Downsamples a variant region to the downsample coverage of the sliding window. + * + * It will use the downsampling strategy defined by the SlidingWindow + * + * @param allReads the reads to select from (all reads that cover the window) + * @return a list of reads selected by the downsampler to cover the window to at least the desired coverage + */ + protected List downsampleVariantRegion(final List allReads) { + double fraction = (double) downsampleCoverage / allReads.size(); // target coverage over observed coverage; cast avoids integer division (was the hard-coded int expression 100 / size, which truncates to 0 or an int >= 1) + if (fraction >= 1) + return allReads; + + FractionalDownsampler downsampler = new FractionalDownsampler(fraction); + downsampler.submit(allReads); + return downsampler.consumeDownsampledItems(); + } + + /** + * Properly closes a Sliding Window, finalizing all consensus and variant + * regions that still exist regardless of being able to fulfill the + * context size requirement in the end.
+ * + * @return All reads generated + */ + public List close() { + // mark variant regions + List finalizedReads = new LinkedList(); + + if (!windowHeader.isEmpty()) { + boolean[] variantSite = markSites(stopLocation + 1); + List> regions = getAllVariantRegions(0, windowHeader.size(), variantSite); + finalizedReads = closeVariantRegions(regions, true); + + if (!windowHeader.isEmpty()) { + finalizedReads.addAll(addToSyntheticReads(0, windowHeader.size() - 1)); + finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up + } + + } + return finalizedReads; + } + + /** + * generates the SAM record for the running consensus read and resets it (to null) + * + * @return the read contained in the running consensus + */ + protected GATKSAMRecord finalizeRunningConsensus() { + GATKSAMRecord finalizedRead = null; + if (runningConsensus != null) { + if (runningConsensus.size() > 0) + finalizedRead = runningConsensus.close(); + else + consensusCounter--; + + runningConsensus = null; + } + return finalizedRead; + } + + /** + * generates the SAM record for the filtered data consensus and resets it (to null) + * + * @return the read contained in the running consensus + */ + protected GATKSAMRecord finalizeFilteredDataConsensus() { + GATKSAMRecord finalizedRead = null; + if (filteredDataConsensus != null) { + if (filteredDataConsensus.size() > 0) + finalizedRead = filteredDataConsensus.close(); + else + filteredDataConsensusCounter--; + + filteredDataConsensus = null; + } + return finalizedRead; + } + + + /** + * Updates the sliding window's header counts with the incoming read bases, insertions + * and deletions. 
+ * + * @param read the incoming read to be added to the sliding window + */ + protected void updateHeaderCounts(GATKSAMRecord read, boolean removeRead) { + byte[] bases = read.getReadBases(); + byte[] quals = read.getBaseQualities(); + byte[] insQuals = read.getExistingBaseInsertionQualities(); + byte[] delQuals = read.getExistingBaseDeletionQualities(); + int readStart = read.getSoftStart(); + int readEnd = read.getSoftEnd(); + Cigar cigar = read.getCigar(); + + int readBaseIndex = 0; + int startLocation = getStartLocation(); + int locationIndex = startLocation < 0 ? 0 : readStart - startLocation; + + if (removeRead && locationIndex < 0) + throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation); + + if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it + if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window + for (int i = 1; i <= -locationIndex; i++) + windowHeader.addFirst(new HeaderElement(startLocation - i)); + + startLocation = readStart; // update start location accordingly + locationIndex = 0; + } + + if (stopLocation < readEnd) { // Do we need to add extra elements to the header? + int elementsToAdd = (stopLocation < 0) ? 
readEnd - readStart + 1 : readEnd - stopLocation; + while (elementsToAdd-- > 0) + windowHeader.addLast(new HeaderElement(readEnd - elementsToAdd)); + + stopLocation = readEnd; // update stopLocation accordingly + } + + // Special case for leading insertions before the beginning of the sliding read + if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) { + windowHeader.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added + locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing. + } + } + + Iterator headerElementIterator = windowHeader.listIterator(locationIndex); + HeaderElement headerElement; + for (CigarElement cigarElement : cigar.getCigarElements()) { + switch (cigarElement.getOperator()) { + case H: + break; + case I: + if (removeRead && locationIndex == 0) { // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it. 
+ break; + } + + headerElement = windowHeader.get(locationIndex - 1); // insertions are added to the base to the left (previous element) + + if (removeRead) { + headerElement.removeInsertionToTheRight(); + } + else { + headerElement.addInsertionToTheRight(); + } + readBaseIndex += cigarElement.getLength(); + break; // just ignore the insertions at the beginning of the read + case D: + int nDeletions = cigarElement.getLength(); + while (nDeletions-- > 0) { // deletions are added to the baseCounts with the read mapping quality as it's quality score + headerElement = headerElementIterator.next(); + byte mq = (byte) read.getMappingQuality(); + if (removeRead) + headerElement.removeBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); + else + headerElement.addBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); + + locationIndex++; + } + break; + case S: + case M: + case P: + case EQ: + case X: + int nBasesToAdd = cigarElement.getLength(); + while (nBasesToAdd-- > 0) { + headerElement = headerElementIterator.next(); + byte insertionQuality = insQuals == null ? -1 : insQuals[readBaseIndex]; // if the read doesn't have indel qualities, use -1 (doesn't matter the value because it won't be used for anything) + byte deletionQuality = delQuals == null ? 
-1 : delQuals[readBaseIndex]; + if (removeRead) + headerElement.removeBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S); + else + headerElement.addBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S); + + readBaseIndex++; + locationIndex++; + } + break; + } + } + } +} + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java new file mode 100644 index 000000000..9ee1a4634 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -0,0 +1,285 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Running Consensus is a read that is compressed as a sliding window travels over the reads + * and keeps track of all the bases that are outside of variant regions. + * + * Consensus reads have qual fields that correspond to the number of reads that had the base + * and passed the minimum quality threshold. 
/* NOTE(review): SyntheticRead.java (new file in this patch). A SyntheticRead is the consensus read built while the ReduceReads sliding window compresses non-variant regions: parallel per-position lists of base, observation count, base qual, insertion qual and deletion qual, plus an accumulated mapping quality that close() averages into the output read's MAPQ. Generic type parameters were stripped by text extraction: the list fields are presumably List<BaseIndex> bases and List<Byte> counts/quals/insertionQuals/deletionQuals — TODO confirm against the repository before compiling. */
+ * + * The mapping quality of a consensus read is the average RMS of the mapping qualities of all reads + * that compose the consensus + * + * @author Mauricio Carneiro + * @since 8/26/11 + */ +public class SyntheticRead { + private List bases; + private List counts; + private List quals; + private List insertionQuals; + private List deletionQuals; + private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus + private String readTag; + + // Information to produce a GATKSAMRecord + private SAMFileHeader header; + private GATKSAMReadGroupRecord readGroupRecord; + private String contig; + private int contigIndex; + private String readName; + private Integer refStart; + private boolean hasIndelQualities = false; + + /** + * Full initialization of the running consensus if you have all the information and are ready to + * start adding to the running consensus. + * + * @param header GATKSAMRecord file header + * @param readGroupRecord Read Group for the GATKSAMRecord + * @param contig the read's contig name + * @param contigIndex the read's contig index + * @param readName the read's name + * @param refStart the alignment start (reference based) + * @param readTag the reduce reads tag for the synthetic read + */ + public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities) { + final int initialCapacity = 10000; + bases = new ArrayList(initialCapacity); + counts = new ArrayList(initialCapacity); + quals = new ArrayList(initialCapacity); + insertionQuals = new ArrayList(initialCapacity); + deletionQuals = new ArrayList(initialCapacity); + mappingQuality = 0.0; + + this.readTag = readTag; + this.header = header; + this.readGroupRecord = readGroupRecord; + this.contig = contig; + this.contigIndex = contigIndex; + this.readName = readName; + this.refStart = refStart; +
/* NOTE(review): below — (1) a second constructor taking pre-built lists (used directly by SyntheticReadUnitTest); (2) add(), which appends one consensus position and accumulates mappingQuality so close() can divide by bases.size() — that divisor includes deletion positions, which do not appear in the emitted read bases; presumably intended, but verify. In close(), buildCigar() is deliberately called BEFORE setAlignmentStart(refStart) because leading deletions advance refStart inside buildCigar() (see its Warning javadoc). convertBaseCounts() delta-encodes counts relative to the first position and clamps each delta to [Byte.MIN_VALUE, Byte.MAX_VALUE] via MathUtils.bound — so deltas beyond +/-127 are saturated (lossy); the unit test's expectedCounts arrays encode exactly this scheme. buildCigar() throws on BaseIndex.I (insertions unsupported in synthetic reads) and silently drops a trailing deletion run via the final 'cigarOperator != CigarOperator.D' guard. */
this.hasIndelQualities = hasIndelQualities; + } + + public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities) { + this.bases = bases; + this.counts = counts; + this.quals = quals; + this.insertionQuals = insertionQuals; + this.deletionQuals = deletionQuals; + this.mappingQuality = mappingQuality; + this.readTag = readTag; + this.header = header; + this.readGroupRecord = readGroupRecord; + this.contig = contig; + this.contigIndex = contigIndex; + this.readName = readName; + this.refStart = refStart; + this.hasIndelQualities = hasIndelQualities; + } + + /** + * Easy access to keep adding to a running consensus that has already been + * initialized with the correct read name and refStart + * + * @param base the base to add + * @param count number of reads with this base + */ + @Requires("count < Byte.MAX_VALUE") + public void add(BaseIndex base, byte count, byte qual, byte insQual, byte delQual, double mappingQuality) { + counts.add(count); + bases.add(base); + quals.add(qual); + insertionQuals.add(insQual); + deletionQuals.add(delQual); + this.mappingQuality += mappingQuality; + } + + public BaseIndex getBase(int readCoordinate) { + return bases.get(readCoordinate); + } + + /** + * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
+ * + * Invalid reads are : + * - exclusively composed of deletions + * + * @return a GATKSAMRecord or null + */ + public GATKSAMRecord close () { + if (isAllDeletions()) + return null; + + GATKSAMRecord read = new GATKSAMRecord(header); + read.setReferenceName(contig); + read.setReferenceIndex(contigIndex); + read.setReadPairedFlag(false); + read.setReadUnmappedFlag(false); + read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions) + read.setAlignmentStart(refStart); + read.setReadName(readName); + read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION); + read.setReadBases(convertReadBases()); + read.setMappingQuality((int) Math.ceil(mappingQuality / bases.size())); + read.setReadGroup(readGroupRecord); + read.setAttribute(readTag, convertBaseCounts()); + + if (hasIndelQualities) { + read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION); + read.setBaseQualities(convertDeletionQualities(), EventType.BASE_DELETION); + } + + return read; + } + + /** + * Checks if the synthetic read is composed exclusively of deletions + * + * @return true if it is, false if it isn't.
+ */ + private boolean isAllDeletions() { + for (BaseIndex b : bases) + if (b != BaseIndex.D) + return false; + return true; + } + + public int size () { + return bases.size(); + } + + private byte [] convertBaseQualities() { + return convertVariableGivenBases(bases, quals); + } + + private byte [] convertInsertionQualities() { + return convertVariableGivenBases(bases, insertionQuals); + } + + private byte [] convertDeletionQualities() { + return convertVariableGivenBases(bases, deletionQuals); + } + + protected byte [] convertBaseCounts() { + byte[] countsArray = convertVariableGivenBases(bases, counts); + + if (countsArray.length == 0) + throw new ReviewedStingException("Reduced read has counts array of length 0"); + + byte[] compressedCountsArray = new byte [countsArray.length]; + compressedCountsArray[0] = countsArray[0]; + for (int i = 1; i < countsArray.length; i++) + compressedCountsArray[i] = (byte) MathUtils.bound(countsArray[i] - compressedCountsArray[0], Byte.MIN_VALUE, Byte.MAX_VALUE); + + return compressedCountsArray; + } + + private byte [] convertReadBases() { + byte [] readArray = new byte[getReadLengthWithNoDeletions(bases)]; + int i = 0; + for (BaseIndex baseIndex : bases) + if (baseIndex != BaseIndex.D) + readArray[i++] = baseIndex.getByte(); + + return readArray; + } + + /** + * Builds the cigar string for the synthetic read + * + * Warning: if the synthetic read has leading deletions, it will shift the refStart (alignment start) of the read. + * + * @return the cigar string for the synthetic read + */ + private Cigar buildCigar() { + LinkedList cigarElements = new LinkedList(); + CigarOperator cigarOperator = null; + int length = 0; + for (BaseIndex b : bases) { + CigarOperator op; + switch (b) { + case D: + op = CigarOperator.DELETION; + break; + case I: + throw new ReviewedStingException("Trying to create an insertion in a synthetic read.
This operation is currently unsupported."); + default: + op = CigarOperator.MATCH_OR_MISMATCH; + break; + } + if (cigarOperator == null) { + if (op == CigarOperator.D) // read cannot start with a deletion + refStart++; // if it does, we need to move the reference start forward + else + cigarOperator = op; + } + else if (cigarOperator != op) { // if this is a new operator, we need to close the previous one + cigarElements.add(new CigarElement(length, cigarOperator)); // close previous operator + cigarOperator = op; + length = 0; + } + + if (cigarOperator != null) // only increment the length of the cigar element if we really added it to the read (no leading deletions) + length++; + } + if (length > 0 && cigarOperator != CigarOperator.D) // read cannot end with a deletion + cigarElements.add(new CigarElement(length, cigarOperator)); // add the last cigar element + + return new Cigar(cigarElements); + } + + /** + * Shared functionality for all conversion utilities + * + * @param bases the read bases + * @param variable the list to convert + * @return a converted variable given the bases and skipping deletions + */ + + private static byte [] convertVariableGivenBases (List bases, List variable) { + byte [] variableArray = new byte[getReadLengthWithNoDeletions(bases)]; + int i = 0; + Iterator variableIterator = variable.iterator(); + for (BaseIndex baseIndex : bases) { + byte count = variableIterator.next(); + if (baseIndex != BaseIndex.D) + variableArray[i++] = count; + } + return variableArray; + + } + + /** + * Shared functionality for all conversion utilities + * + * @param bases the read bases + * @return the length of the read with no deletions + */ + private static int getReadLengthWithNoDeletions(List bases) { + int readLength = bases.size(); + for (BaseIndex baseIndex : bases) + if (baseIndex == BaseIndex.D) + readLength--; + return readLength; + } + + +} diff --git
/* NOTE(review): three new test files in this patch hunk. (1) BaseCountsUnitTest — TestNG data-provider test for BaseCounts.incr/totalCount/countOfMostCommonBase/baseWithMostCounts; the empty-string case expects BaseCounts.MAX_BASE_WITH_NO_COUNTS ('N') as the tie-break/no-data answer, and generics were stripped by extraction (List params is presumably List<SingleTest>). The diff records that this file lacks a trailing newline ("\ No newline at end of file"). (2) ReduceReadsIntegrationTest — WalkerTest-based MD5-pinned integration tests; any behavior change in ReduceReads requires regenerating these hashes. (3) SyntheticReadUnitTest — exercises SyntheticRead.convertBaseCounts delta encoding. */
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java new file mode 100644 index 000000000..a8707641a --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java @@ -0,0 +1,69 @@ +// our package +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + * Basic unit test for BaseCounts in reduced reads + */ +public class BaseCountsUnitTest extends BaseTest { + private class SingleTest { + public String bases; + public byte mostCountBase; + public int mostCommonCount; + + private SingleTest(String bases, char mostCountBase, int mostCommonCount) { + this.mostCommonCount = mostCommonCount; + this.mostCountBase = (byte)mostCountBase; + this.bases = bases; + } + } + + + @DataProvider(name = "data") + public Object[][] createData1() { + List params = new ArrayList(); + + params.add(new SingleTest("A", 'A', 1 )); + params.add(new SingleTest("AA", 'A', 2 )); + params.add(new SingleTest("AC", 'A', 1 )); + params.add(new SingleTest("AAC", 'A', 2 )); + params.add(new SingleTest("AAA", 'A', 3 )); + params.add(new SingleTest("AAAN", 'A', 3 )); + params.add(new SingleTest("AAANNNN", 'N', 4 )); + params.add(new SingleTest("AACTG", 'A', 2 )); + params.add(new SingleTest("D", 'D', 1 )); + params.add(new SingleTest("DDAAD", 'D', 3)); + params.add(new SingleTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 )); + params.add(new SingleTest("AAIIIAI", 'I', 4 )); + + List params2 = new ArrayList(); + for ( SingleTest x : params ) params2.add(new Object[]{x}); + return params2.toArray(new
Object[][]{}); + } + + + + @Test(dataProvider = "data", enabled = true) + public void testCounting(SingleTest params) { + BaseCounts counts = new BaseCounts(); + + for ( byte base : params.bases.getBytes() ) + counts.incr(base); + + String name = String.format("Test-%s", params.bases); + Assert.assertEquals(counts.totalCount(), params.bases.length(), name); + Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name); + Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name); + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java new file mode 100755 index 000000000..f65d383e3 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -0,0 +1,68 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class ReduceReadsIntegrationTest extends WalkerTest { + final static String REF = b37KGReference; + final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; + final String DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam"; + final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam"; + final String STASH_L = " -L 14:73718184-73718284 -L 14:73718294-73718330 -L 14:73718360-73718556"; + final String L = " -L 20:10,100,000-10,120,000 "; + + private void RRTest(String testName, String args, String md5) { + String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s "; + WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5)); + executeTest(testName, spec); + } + +
/* NOTE(review): each test below runs the ReduceReads walker on a fixed BAM/interval set and asserts the MD5 of the output; -cs is compression window size, -minvar/-mindel are variant/deletion fraction thresholds, -minmap/-minqual are mapping/base quality cutoffs — TODO confirm flag meanings against the walker's argument declarations. testAddingReadAfterTailingTheStash is a regression test for a TreeSet "key out of range" stash-ordering bug described in the javadoc preceding it. */
@Test(enabled = true) + public void testDefaultCompression() { + RRTest("testDefaultCompression ", L, "4c92d59d4a5292af1f968dc922c2c63e"); + } + + @Test(enabled = true) + public void testMultipleIntervals() { + String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; + RRTest("testMultipleIntervals ", intervals, "97d5c3fda5551741676793ba325ec7ed"); + } + + @Test(enabled = true) + public void testHighCompression() { + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e6bc1cd0e9de961cf0fb1789bf6ab108"); + } + + @Test(enabled = true) + public void testLowCompression() { + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "f33ec7cd0b98eebd73d1025ca656cd7e"); + } + + @Test(enabled = true) + public void testIndelCompression() { + RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "e6e2bc889e4f342a7fedc5d38b391d20"); + } + + @Test(enabled = true) + public void testFilteredDeletionCompression() { + String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb"))); + } + + /** + * Bug reported by Adam where a read that got clipped before actually belongs 2 intervals ahead + * and a subsequent tail leaves only this read in the stash. The next read to come in is in fact + * before (alignment start) than this read, so the TreeSet breaks with a Key out of Range error + * that was freaking hard to catch. + * + * This bam is simplified to replicate the exact bug with the three provided intervals.
+ */ + @Test(enabled = true) + public void testAddingReadAfterTailingTheStash() { + String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("022931f032a4122cfe41e58e74d0aede"))); + } +} + diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java new file mode 100644 index 000000000..e651c018c --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java @@ -0,0 +1,85 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Random; + +public class SyntheticReadUnitTest extends BaseTest { + final SAMFileHeader artificialSAMHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1); + final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic"); + final String artificialContig = "1"; + final int artificialContigIndex = 0; + final String artificialReadName = "synth"; + final int artificialRefStart = 1; + final double artificialMappingQuality = 60; + + final Random random = new Random(8854875); + + +@Test +public void testBaseCounts() { + BaseIndex [] bases = new BaseIndex[] {BaseIndex.A,BaseIndex.A,BaseIndex.A,BaseIndex.A}; + Byte[] quals = new Byte[] {20, 20, 20, 20 }; + + TestRead [] testReads = new TestRead [] { + new TestRead(bases, quals, new Byte[] {100, 100, 100,
101}, new byte [] {100, 0, 0, 1}), + new TestRead(bases, quals, new Byte[] {1, 100, 100, 0}, new byte [] {1, 99, 99, -1}), + new TestRead(bases, quals, new Byte[] {127, 100, 0, 1}, new byte [] {127, -27, -127, -126}), + new TestRead(bases, quals, new Byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})}; + + for (TestRead testRead : testReads) { + SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false); + Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts()); + } +} + +private class TestRead { + BaseIndex[] bases; + Byte[] quals; + Byte[] insQuals; + Byte[] delQuals; + Byte[] counts; + byte [] expectedCounts; + + private TestRead(BaseIndex[] bases, Byte[] quals, Byte[] counts, byte[] expectedCounts) { + this.bases = bases; + this.quals = quals; + this.insQuals = quals; + this.delQuals = quals; + this.counts = counts; + this.expectedCounts = expectedCounts; + } + + public BaseIndex[] getBases() { + return bases; + } + + public Byte[] getQuals() { + return quals; + } + + public Byte[] getInsQuals() { + return insQuals; + } + + public Byte[] getDelQuals() { + return delQuals; + } + + public Byte[] getCounts() { + return counts; + } + + public byte[] getExpectedCounts() { + return expectedCounts; + } +} + +}
/* NOTE(review): SyntheticReadUnitTest's expectedCounts arrays match the delta encoding in SyntheticRead.convertBaseCounts: element 0 is the raw first count; every later element is counts[i] - counts[0], clamped to the byte range by MathUtils.bound (e.g. {127,100,0,1} -> {127,-27,-127,-126}). The 'random' field and 'quals' are declared but the visible test only checks counts; insQuals/delQuals are aliased to quals inside TestRead. */