Moving ReduceReads into protected
This commit is contained in:
parent
2d6511b9ac
commit
12d1c594df
|
|
@ -0,0 +1,56 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;
|
||||
|
||||
/**
|
||||
* An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base.
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 6/15/12
|
||||
*/
|
||||
public class BaseAndQualsCounts extends BaseCounts {
|
||||
private final Map<BaseIndex, Long> sumInsertionQuals;
|
||||
private final Map<BaseIndex, Long> sumDeletionQuals;
|
||||
|
||||
public BaseAndQualsCounts() {
|
||||
super();
|
||||
this.sumInsertionQuals = new HashMap<BaseIndex, Long>();
|
||||
this.sumDeletionQuals = new HashMap<BaseIndex, Long>();
|
||||
for (BaseIndex i : BaseIndex.values()) {
|
||||
sumInsertionQuals.put(i, 0L);
|
||||
sumDeletionQuals.put(i, 0L);
|
||||
}
|
||||
}
|
||||
|
||||
public void incr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
super.incr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
sumInsertionQuals.put(i, sumInsertionQuals.get(i) + insQual);
|
||||
sumDeletionQuals.put(i, sumDeletionQuals.get(i) + delQual);
|
||||
}
|
||||
}
|
||||
|
||||
public void decr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
super.decr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
sumInsertionQuals.put(i, sumInsertionQuals.get(i) - insQual);
|
||||
sumDeletionQuals.put(i, sumDeletionQuals.get(i) - delQual);
|
||||
}
|
||||
}
|
||||
|
||||
public byte averageInsertionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumInsertionQuals);
|
||||
}
|
||||
|
||||
public byte averageDeletionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumDeletionQuals);
|
||||
}
|
||||
|
||||
private byte getGenericAverageQualOfMostCommonBase(Map<BaseIndex, Long> sumQuals) {
|
||||
BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts());
|
||||
return (byte) (sumQuals.get(base) / getCount(base));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,223 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* An object to keep track of the number of occurences of each base and it's quality.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 4/8/11
|
||||
* Time: 2:55 PM
|
||||
*/
|
||||
|
||||
public class BaseCounts {
|
||||
public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N;
|
||||
public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte();
|
||||
|
||||
private final Map<BaseIndex, Integer> counts; // keeps track of the base counts
|
||||
private final Map<BaseIndex, Long> sumQuals; // keeps track of the quals of each base
|
||||
|
||||
public BaseCounts() {
|
||||
counts = new EnumMap<BaseIndex, Integer>(BaseIndex.class);
|
||||
sumQuals = new EnumMap<BaseIndex, Long>(BaseIndex.class);
|
||||
for (BaseIndex i : BaseIndex.values()) {
|
||||
counts.put(i, 0);
|
||||
sumQuals.put(i, 0L);
|
||||
}
|
||||
}
|
||||
|
||||
public static BaseCounts createWithCounts(int[] countsACGT) {
|
||||
BaseCounts baseCounts = new BaseCounts();
|
||||
baseCounts.counts.put(BaseIndex.A, countsACGT[0]);
|
||||
baseCounts.counts.put(BaseIndex.C, countsACGT[1]);
|
||||
baseCounts.counts.put(BaseIndex.G, countsACGT[2]);
|
||||
baseCounts.counts.put(BaseIndex.T, countsACGT[3]);
|
||||
return baseCounts;
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void add(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) + other.counts.get(i));
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void sub(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) - other.counts.get(i));
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
sumQuals.put(i, sumQuals.get(i) + qual);
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
sumQuals.put(i, sumQuals.get(i) - qual);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(byte base) {
|
||||
return getCount(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(BaseIndex base) {
|
||||
return counts.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(byte base) {
|
||||
return getSumQuals(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(byte base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(BaseIndex base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return baseIndexWithMostCounts().getByte();
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfMostCommonBase() {
|
||||
return counts.get(baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long sumQualsOfMostCommonBase() {
|
||||
return sumQuals.get(baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQualsOfMostCommonBase() {
|
||||
return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase());
|
||||
}
|
||||
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int totalCount() {
|
||||
int sum = 0;
|
||||
for (int c : counts.values())
|
||||
sum += c;
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base , it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
* @param base
|
||||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(byte base) {
|
||||
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base , it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
* @param baseIndex
|
||||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(BaseIndex baseIndex) {
|
||||
int total = totalCount();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
return (double) counts.get(baseIndex) / totalCount();
|
||||
}
|
||||
|
||||
|
||||
@Ensures("result != null")
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
for (Map.Entry<BaseIndex, Integer> elt : counts.entrySet()) {
|
||||
b.append(elt.toString()).append("=").append(elt.getValue()).append(",");
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCounts() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex i : counts.keySet())
|
||||
if (counts.get(i) > counts.get(maxI))
|
||||
maxI = i;
|
||||
return maxI;
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
|
||||
BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex index : counts.keySet())
|
||||
if (index.isNucleotide() && counts.get(index) > counts.get(mostCounts))
|
||||
mostCounts = index;
|
||||
return mostCounts;
|
||||
}
|
||||
|
||||
@Ensures("result >=0")
|
||||
public int totalCountWithoutIndels() {
|
||||
int sum = 0;
|
||||
for (BaseIndex index : counts.keySet())
|
||||
if (index.isNucleotide())
|
||||
sum += counts.get(index);
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the proportional count of a base compared to all other bases except indels (I and D)
|
||||
*
|
||||
* @param index
|
||||
* @return the proportion of this base over all other bases except indels
|
||||
*/
|
||||
@Requires("index.isNucleotide()")
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportionWithoutIndels(BaseIndex index) {
|
||||
int total = totalCountWithoutIndels();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
return (double) counts.get(index) / totalCountWithoutIndels();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
/**
|
||||
* Simple byte / base index conversions
|
||||
*
|
||||
*
|
||||
* @author carneiro
|
||||
* @since 8/26/11
|
||||
*/
|
||||
/**
 * Simple byte / base index conversions.
 *
 * @author carneiro
 * @since 8/26/11
 */
public enum BaseIndex {
    A('A', 0),
    C('C', 1),
    G('G', 2),
    T('T', 3),
    D('D', 4),
    I('I', 5), // insertion to the right of the base
    N('N', 6);

    final byte b;    // ASCII byte representing this base
    final int index; // numeric index of this base

    BaseIndex(final char base, final int baseIndex) {
        b = (byte) base;
        index = baseIndex;
    }

    /** @return the ASCII byte representation of this base */
    public byte getByte() {
        return b;
    }

    /**
     * Converts a byte representation of a base to BaseIndex.
     * Matching is case-insensitive, and '-' is accepted as an alternate spelling of a deletion.
     *
     * @param base the byte representation of the base
     * @return the BaseIndex representation of the base, or null for an unrecognized byte
     */
    public static BaseIndex byteToBase(final byte base) {
        final char upper = Character.toUpperCase((char) base);
        if (upper == '-')
            return D; // dash is the conventional deletion character
        for (final BaseIndex candidate : values())
            if ((char) candidate.b == upper)
                return candidate;
        return null; // not a base we know about
    }

    /**
     * Definition of a nucleotide for the BaseIndex is anything that has been read as a base
     * by the machine (A,C,G,T), even if it couldn't tell which base it was, but it knows
     * there is a base there (N).
     *
     * @return whether or not it is a nucleotide, given the definition above
     */
    public boolean isNucleotide() {
        return !isIndel(); // everything except the indel markers (D, I) counts, including N
    }

    /**
     * Whether or not this base is an insertion or a deletion.
     *
     * @return true for I or D, false otherwise
     */
    public boolean isIndel() {
        return this == D || this == I;
    }
}
|
||||
|
|
@ -0,0 +1,182 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Given two BAMs with different read groups, it compares them based on ReduceReads metrics.
|
||||
* <p>
|
||||
* This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files.
|
||||
* </p>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Two BAM files (using -I) with different read group IDs
|
||||
* </p>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Output description]
|
||||
* </p>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T $WalkerName
|
||||
* </pre>
|
||||
*
|
||||
* @author carneiro
|
||||
* @since 10/30/11
|
||||
*/
|
||||
|
||||
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
public class CompareBAMWalker extends LocusWalker<Map<CompareBAMWalker.TestName, Boolean>, CompareBAMWalker.TestResults> {
    // Read group ID identifying the reduced (compressed) BAM; all other read groups are treated as original data.
    @Argument(required = true, shortName = "rr", fullName = "reduced_readgroup", doc = "The read group ID corresponding to the compressed BAM being tested") public String reducedReadGroupID;
    @Argument(required = false, shortName = "teq", fullName = "test_equal_bases", doc = "Test if the bases marked as '=' are indeed ref bases.") public boolean TEST_EQUAL_BASES = false;
    @Argument(required = false, shortName = "tbc", fullName = "test_base_counts", doc = "Test if the base counts tag in consensus reads are accurate.") public boolean TEST_BASE_COUNTS = false;
    @Argument(required = false, shortName = "mbq", fullName = "min_base_qual", doc = "Minimum base quality to be considered.") public int MIN_BASE_QUAL = 20;
    @Argument(required = false, shortName = "mmq", fullName = "min_mapping_qual", doc = "Minimum mapping quality to be considered.") public int MIN_MAPPING_QUAL = 20;

    /**
     * Runs every enabled test at this locus.
     *
     * @return a map from test name to whether that test passed at this locus
     */
    @Override
    public Map<TestName, Boolean> map (RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        Map<TestName, Boolean> result = new HashMap<TestName, Boolean>();

        if (TEST_EQUAL_BASES) result.put(TestName.EQUAL_BASES, testEqualBases(ref, context));
        if (TEST_BASE_COUNTS) result.put(TestName.BASE_COUNTS, testBaseCounts(ref, context));

        return result;
    }

    /**
     * Creates the accumulator with one (empty) outcome entry per enabled test.
     */
    @Override
    public TestResults reduceInit () {
        TestResults sum = new TestResults(); // a fresh new TestResults object to sum up the results of every object passed by MAP.

        if (TEST_EQUAL_BASES) sum.createTest(TestName.EQUAL_BASES);
        if (TEST_BASE_COUNTS) sum.createTest(TestName.BASE_COUNTS);

        return sum;
    }

    /**
     * Folds one locus's per-test results into the running pass/fail tallies.
     */
    @Override
    public TestResults reduce (Map<TestName,Boolean> mapResult, TestResults sum) {
        for (TestName test : mapResult.keySet()) {
            if (mapResult.get(test))
                sum.reportSuccess(test);
            else
                sum.reportFailed(test);
        }

        return sum;
    }

    // Prints the final pass/fail table once the traversal finishes.
    public void onTraversalDone (TestResults finalResults) {
        finalResults.report();
    }

    // NOTE(review): stub — always reports success; the '=' base check is not implemented yet.
    private boolean testEqualBases (ReferenceContext ref, AlignmentContext context) {
        return true;
    }

    // NOTE(review): stub — always reports success; the base-count check is not implemented yet.
    private boolean testBaseCounts (ReferenceContext ref, AlignmentContext context) {

        return true;
    }

    // Names of the available consistency tests, with the label used in the final report.
    public enum TestName {
        EQUAL_BASES ("testEqualBases"),
        BASE_COUNTS ("testBaseCounts");

        private String testName; // human-readable label printed in the report

        TestName(String testName) {
            this.testName = testName;
        }

        public String getTestName() {
            return testName;
        }
    }

    // Accumulates pass/fail tallies for every registered test and prints the final report.
    public class TestResults {
        private Map<TestName, TestOutcome> testStats = new HashMap<TestName, TestOutcome>();

        // Registers a test so results can be reported against it later.
        public void createTest (TestName test) {
            testStats.put(test, new TestOutcome());
        }

        // Records one success; the test must have been registered via createTest first.
        public void reportSuccess(TestName test) {
            if (testStats.containsKey(test))
                testStats.get(test).incPassed();
            else
                throw new ReviewedStingException("No such test: " + test);
        }

        // Records one failure; the test must have been registered via createTest first.
        public void reportFailed(TestName test) {
            if (testStats.containsKey(test))
                testStats.get(test).incFailed();
            else
                throw new ReviewedStingException("No such test: " + test);
        }

        // Prints a tab-separated PASS/FAIL table with one row per registered test.
        public void report() {
            System.out.println();
            System.out.println(String.format("%20s\tPASS\tFAIL", ""));
            for (TestName test : testStats.keySet())
                System.out.println(String.format("%20s\t%d\t%d", test.getTestName(), testStats.get(test).getPassed(), testStats.get(test).getFailed()));
            System.out.println();
        }
    }

    // Simple mutable pair of pass/fail counters.
    private class TestOutcome {
        private long passed;
        private long failed;

        public long getPassed() {
            return passed;
        }

        public void incPassed() {
            this.passed++;
        }

        public long getFailed() {
            return failed;
        }

        public void incFailed() {
            this.failed++;
        }
    }

    // Base counts over non-reduced read groups, honoring the quality thresholds. NOTE(review): currently unused.
    private BaseCounts getFilteredBaseCounts(AlignmentContext context) {
        return getBaseCounts(context, MIN_BASE_QUAL, MIN_MAPPING_QUAL);
    }

    // Base counts over non-reduced read groups with minimal filtering (mbq=3, mmq=0). NOTE(review): currently unused.
    private BaseCounts getFullBaseCounts(AlignmentContext context) {
        return getBaseCounts(context, 3, 0);
    }

    // Sums base counts across every read group EXCEPT the reduced one, filtering each
    // pileup by the given base and mapping quality thresholds.
    private BaseCounts getBaseCounts(AlignmentContext context, int mbq, int mmq) {
        BaseCounts fullBaseCounts = new BaseCounts();
        for (String rg : context.getBasePileup().getReadGroups()) {
            if (!rg.equals(reducedReadGroupID)) {
                BaseCounts b = BaseCounts.createWithCounts(context.getBasePileup().getPileupForReadGroup(rg).getBaseAndMappingFilteredPileup(mbq, mmq).getBaseCounts());
                fullBaseCounts.add(b);
            }
        }
        return fullBaseCounts;
    }

}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 4/10/11
|
||||
* Time: 8:49 AM
|
||||
*
|
||||
* A general interface for ReadCompressors. Read compressors have the following semantics:
|
||||
*
|
||||
* The accept a stream of reads, in order, and after each added read returns a compressed stream
|
||||
* of reads for emission. This stream of reads is a "reduced" representation of the total stream
|
||||
* of reads. The actual compression approach is left up to the implementing class.
|
||||
*/
|
||||
public interface Compressor {
    /**
     * Adds the read to the compressor. The returned iterable collection of
     * reads represents the incremental compressed output.
     *
     * @param read the next uncompressed read in the input stream to the compressor
     * @return an iterator over the incrementally available compressed reads
     */
    @Requires("read != null")
    @Ensures("result != null")
    Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read);

    /**
     * Must be called after the last read has been added to finalize the compressor state
     * and return the last compressed reads from the compressor.
     *
     * @return an iterator over the final compressed reads of this compressor
     */
    @Ensures("result != null")
    Iterable<GATKSAMRecord> close();
}
|
||||
|
|
@ -0,0 +1,204 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
* The element that describes the header of the sliding window.
|
||||
*
|
||||
* Each site has a header element containing the counts of each base, it's reference based location and whether or
|
||||
* not the site has insertions (to it's right). It also contains information about the bases that have been filtered
|
||||
* out due to mapping or base quality.
|
||||
*/
|
||||
public class HeaderElement {
    private BaseAndQualsCounts consensusBaseCounts; // How many A,C,G,T (and D's) are in this site.
    private BaseAndQualsCounts filteredBaseCounts;  // How many A,C,G,T (and D's) were filtered out in this site.
    private int insertionsToTheRight;               // How many reads in this site had insertions to the immediate right
    private int nSoftClippedBases;                  // How many bases in this site came from soft clipped bases
    private int location;                           // Genome location of this site (the sliding window knows which contig we're at)
    private LinkedList<Integer> mappingQuality;     // keeps the mapping quality of each read that contributed to this element (site)

    /** @return the reference location of this site */
    public int getLocation() {
        return location;
    }

    /** @return the base/quality counts of the bases that failed the quality filters at this site */
    public BaseAndQualsCounts getFilteredBaseCounts() {
        return filteredBaseCounts;
    }

    /** @return the base/quality counts of the bases that passed the quality filters at this site */
    public BaseAndQualsCounts getConsensusBaseCounts() {
        return consensusBaseCounts;
    }

    /**
     * Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty
     * filteredBaseCounts - 0 insertions to the right - 0 soft clipped bases - empty mappingQuality list
     *
     * @param location the reference location for the new element
     */
    public HeaderElement(int location) {
        this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new LinkedList<Integer>());
    }

    /**
     * Creates a new HeaderElement with all given parameters
     *
     * @param consensusBaseCounts  the BaseCounts object for the running consensus synthetic read
     * @param filteredBaseCounts   the BaseCounts object for the filtered data synthetic read
     * @param insertionsToTheRight number of insertions to the right of this HeaderElement
     * @param nSoftClippedBases    number of bases at this site that came from soft clipped bases
     * @param location             the reference location of this reference element
     * @param mappingQuality       the list of mapping quality values of all reads that contributed to this
     *                             HeaderElement
     */
    public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, LinkedList<Integer> mappingQuality) {
        this.consensusBaseCounts = consensusBaseCounts;
        this.filteredBaseCounts = filteredBaseCounts;
        this.insertionsToTheRight = insertionsToTheRight;
        this.nSoftClippedBases = nSoftClippedBases;
        this.location = location;
        this.mappingQuality = mappingQuality;
    }

    /**
     * Whether or not the site represented by this HeaderElement is variant according to the definitions of variant
     * by insertion, deletion and mismatches.
     *
     * @return true if site is variant by any definition. False otherwise.
     */
    public boolean isVariant(double minVariantProportion, double minIndelProportion) {
        return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
    }

    /**
     * Adds a new base to the HeaderElement updating all counts accordingly
     *
     * @param base               the base to add
     * @param baseQual           the base quality
     * @param insQual            the insertion quality associated with the base
     * @param delQual            the deletion quality associated with the base
     * @param baseMappingQuality the mapping quality of the read this base belongs to
     * @param minBaseQual        minimum base quality for the base to count towards the consensus
     * @param minMappingQual     minimum mapping quality for the base to count towards the consensus
     * @param isSoftClipped      whether this base came from a soft clipped region of its read
     */
    public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
        if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
            consensusBaseCounts.incr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
        else
            filteredBaseCounts.incr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts

        this.mappingQuality.add(baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
        nSoftClippedBases += isSoftClipped ? 1 : 0; // if this base is softclipped, increment the counter
    }

    /**
     * Removes a base from the HeaderElement, reversing the effect of addBase. The same quality
     * thresholds must be supplied so the base is removed from the same (consensus or filtered)
     * counter it was originally added to.
     *
     * @param base               the base to remove
     * @param baseQual           the base quality
     * @param insQual            the insertion quality associated with the base
     * @param delQual            the deletion quality associated with the base
     * @param baseMappingQuality the mapping quality of the read this base belongs to
     * @param minBaseQual        minimum base quality used when the base was added
     * @param minMappingQual     minimum mapping quality used when the base was added
     * @param isSoftClipped      whether this base came from a soft clipped region of its read
     */
    public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
        if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
            consensusBaseCounts.decr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
        else
            filteredBaseCounts.decr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts

        this.mappingQuality.remove((Integer) baseMappingQuality); // cast to Integer so List.remove(Object) is called, not remove(int index)
        nSoftClippedBases -= isSoftClipped ? 1 : 0; // if this base is softclipped, decrement the counter
    }
    /**
     * Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions
     * should be added to the right of the element.
     */
    public void addInsertionToTheRight() {
        insertionsToTheRight++;
    }

    /**
     * Does this HeaderElement contain consensus data?
     *
     * @return whether or not this HeaderElement contains consensus data
     */
    public boolean hasConsensusData() {
        return consensusBaseCounts.totalCount() > 0;
    }

    /**
     * Does this HeaderElement contain filtered data?
     *
     * @return whether or not this HeaderElement contains filtered data
     */
    public boolean hasFilteredData() {
        return filteredBaseCounts.totalCount() > 0;
    }

    /**
     * A HeaderElement is empty if it has no consensus or filtered data
     *
     * @return whether or not this HeaderElement has no data
     */
    public boolean isEmpty() {
        return (!hasFilteredData() && !hasConsensusData());
    }

    /**
     * The RMS of the mapping qualities of all reads that contributed to this HeaderElement
     *
     * @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement
     */
    public double getRMS() {
        return MathUtils.rms(mappingQuality);
    }

    /**
     * removes an insertion from this element (if you removed a read that had an insertion)
     */
    public void removeInsertionToTheRight() {
        this.insertionsToTheRight--;
        if (insertionsToTheRight < 0)
            throw new ReviewedStingException("Removed too many insertions, header is now negative!");
    }

    /**
     * Whether or not the HeaderElement is variant due to excess insertions
     *
     * @return whether or not the HeaderElement is variant due to excess insertions
     */
    private boolean isVariantFromInsertions(double minIndelProportion) {
        int numberOfBases = consensusBaseCounts.totalCount();
        if (numberOfBases == 0 && insertionsToTheRight > 0)
            return true; // we only have insertions
        else if (numberOfBases == 0)
            return false; // we don't have anything

        // if we have bases and insertions, check the ratio
        return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion;
    }

    /**
     * Whether or not the HeaderElement is variant due to excess deletions
     *
     * @return whether or not the HeaderElement is variant due to excess deletions
     */
    private boolean isVariantFromDeletions(double minIndelProportion) {
        return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion;
    }

    /**
     * Whether or not the HeaderElement is variant due to excess mismatches
     *
     * @return whether or not the HeaderElement is variant due to excess mismatches
     */
    private boolean isVariantFromMismatches(double minVariantProportion) {
        BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels();
        double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
        // variant if the dominant base is not dominant enough (and there is any data at all)
        return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
    }

    /**
     * This handles the special case where we have more bases that came from soft clips than bases that came from
     * normal bases by forcing it to become a variant region. We don't want a consensus based on too little information.
     *
     * @return true if we had more soft clipped bases contributing to this site than matches/mismatches.
     */
    private boolean isVariantFromSoftClips() {
        return nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases);
    }

    // A base counts towards the consensus only if both its base quality and its read's mapping quality meet the thresholds.
    private boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) {
        return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual;
    }

}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
*
|
||||
* @author depristo
|
||||
*/
|
||||
public class MultiSampleCompressor implements Compressor {
|
||||
protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
|
||||
|
||||
protected Map<String, SingleSampleCompressor> compressorsPerSample = new HashMap<String, SingleSampleCompressor>();
|
||||
|
||||
public MultiSampleCompressor(SAMFileHeader header,
|
||||
final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReadsWalker.DownsampleStrategy downsampleStrategy) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleCompressor(name, contextSize, downsampleCoverage,
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
|
||||
String sample = read.getReadGroup().getSample();
|
||||
SingleSampleCompressor compressor = compressorsPerSample.get(sample);
|
||||
if ( compressor == null )
|
||||
throw new ReviewedStingException("No compressor for sample " + sample);
|
||||
return compressor.addAlignment(read);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> close() {
|
||||
SortedSet<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
for ( SingleSampleCompressor comp : compressorsPerSample.values() )
|
||||
for ( GATKSAMRecord read : comp.close() )
|
||||
reads.add(read);
|
||||
return reads;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,110 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* This class implements a "read stash" that keeps reads always sorted in alignment order. Useful
|
||||
* for read walkers that alter the alignment information of the incoming reads, but need to
|
||||
* maintain the reads sorted for the reduce step. (e.g. ReduceReads)
|
||||
*/
|
||||
|
||||
public class ReduceReadsStash {
|
||||
protected MultiSampleCompressor compressor;
|
||||
SortedSet<GATKSAMRecord> outOfOrderReads;
|
||||
|
||||
/**
|
||||
* Creates a stash with the default sorting order (read alignment)
|
||||
* @param compressor the MultiSampleCompressor object to be used with this stash (for stash.close())
|
||||
*/
|
||||
public ReduceReadsStash(MultiSampleCompressor compressor) {
|
||||
this.compressor = compressor;
|
||||
this.outOfOrderReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all reads before a given read (for processing)
|
||||
*
|
||||
* @param read the original read
|
||||
* @return all reads that have alignment start before the original read.
|
||||
*/
|
||||
public List<GATKSAMRecord> getAllReadsBefore(GATKSAMRecord read) {
|
||||
List<GATKSAMRecord> result = new LinkedList<GATKSAMRecord>();
|
||||
GATKSAMRecord newHead = null;
|
||||
|
||||
for (GATKSAMRecord stashedRead : outOfOrderReads) {
|
||||
if (ReadUtils.compareSAMRecords(stashedRead, read) <= 0)
|
||||
result.add(stashedRead);
|
||||
else {
|
||||
newHead = stashedRead;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (result.size() > 0) {
|
||||
if (result.size() == outOfOrderReads.size())
|
||||
outOfOrderReads.clear();
|
||||
else
|
||||
outOfOrderReads = new TreeSet<GATKSAMRecord>(outOfOrderReads.tailSet(newHead));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* sends the read to the MultiSampleCompressor
|
||||
*
|
||||
* @param read the read to be compressed
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> compress(GATKSAMRecord read) {
|
||||
return compressor.addAlignment(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a read to the stash
|
||||
*
|
||||
* @param read any read
|
||||
*/
|
||||
public void add(GATKSAMRecord read) {
|
||||
outOfOrderReads.add(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the stash, processing all remaining reads in order
|
||||
*
|
||||
* @return a list of all the reads produced by the SlidingWindow machinery)
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> close() {
|
||||
LinkedList<GATKSAMRecord> result = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
// compress all the stashed reads (in order)
|
||||
for (GATKSAMRecord read : outOfOrderReads)
|
||||
for (GATKSAMRecord compressedRead : compressor.addAlignment(read))
|
||||
result.add(compressedRead);
|
||||
|
||||
// output any remaining reads from the compressor
|
||||
for (GATKSAMRecord read : compressor.close())
|
||||
result.add(read);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Useful debug functionality, outputs all elements in the stash
|
||||
*/
|
||||
public void print() {
|
||||
int i = 1;
|
||||
System.out.println("Stash Contents:");
|
||||
for (GATKSAMRecord read : outOfOrderReads)
|
||||
System.out.println(String.format("%3d: %s %d %d", i++, read.getCigarString(), read.getAlignmentStart(), read.getAlignmentEnd()));
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,665 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionType;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocComparator;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Reduces the BAM file using read based compression that keeps only essential information for variant calling
|
||||
* <p/>
|
||||
* <p>
|
||||
* This walker will generated reduced versions of the BAM files that still follow the BAM spec
|
||||
* and contain all the information necessary for the GSA variant calling pipeline. Some options
|
||||
* allow you to tune in how much compression you want to achieve. The default values have been
|
||||
* shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
|
||||
* savings in file size and performance of the downstream tools.
|
||||
* <p/>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The BAM file to be compressed
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* The compressed (reduced) BAM file.
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T ReduceReads \
|
||||
* -I myData.bam \
|
||||
* -o myData.reduced.bam
|
||||
* </pre>
|
||||
*/
|
||||
|
||||
@PartitionBy(PartitionType.INTERVAL)
|
||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
|
||||
public class ReduceReadsWalker extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
|
||||
|
||||
@Output
protected StingSAMFileWriter out;   // the reduced (compressed) BAM file writer

/**
 * The number of bases to keep around mismatches (potential variation)
 */
@Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
protected int contextSize = 10;

/**
 * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
 * mapping quality below this threshold will not be counted towards consensus, but are still counted
 * towards variable regions.
 */
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
protected int minMappingQuality = 20;

/**
 * The minimum base quality to be considered for the consensus synthetic read. Reads that have
 * base quality below this threshold will not be counted towards consensus, but are still counted
 * towards variable regions.
 */
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
protected byte minBaseQual = 20;

/**
 * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
 * lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
 */
@Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
protected byte minTailQuality = 2;

/**
 * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
 * and read group).
 */
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
protected boolean DONT_SIMPLIFY_READS = false;

/**
 * Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
 * The program will behave correctly in those cases.
 */
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
protected boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;

/**
 * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
 * quality.
 */
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
protected boolean DONT_CLIP_LOW_QUAL_TAILS = false;

/**
 * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
 * base left by the aligner and use the high quality soft clipped bases in it's traversal algorithm to identify variant
 * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
 */
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
protected boolean DONT_USE_SOFTCLIPPED_BASES = false;

/**
 * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
 * uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather
 * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
 */
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
protected boolean DONT_COMPRESS_READ_NAMES = false;

/**
 * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
 * considered consensus.
 */
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
protected double minAltProportionToTriggerVariant = 0.05;

/**
 * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
 * considered consensus.
 */
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
protected double minIndelProportionToTriggerVariant = 0.01;

/**
 * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
 * A value of 0 turns downsampling off.
 */
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
protected int downsampleCoverage = 0;

// Debug verbosity: 1 traces reads through map/reduce/output, 2 runs extra per-read QC in outputRead().
// NOTE(review): fullName is empty, so presumably only the short form -dl works -- confirm intentional.
@Hidden
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
protected int debugLevel = 0;

// If non-empty, map() prints a marker when a read whose name contains this string comes through.
// NOTE(review): fullName is empty here as well -- confirm intentional.
@Hidden
@Argument(fullName = "", shortName = "dr", doc = "", required = false)
protected String debugRead = "";

@Hidden
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
protected DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;

// Suppress writing the @PG program record to the output BAM header (see initialize()).
@Hidden
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
private boolean NO_PG_TAG = false;

/** Downsampling flavors accepted by the -dm argument. */
public enum DownsampleStrategy {
    Normal,
    Adaptive
}

// NOTE(review): totalReads is incremented once per read in map() AND once per non-reduced read in
// outputRead() -- verify this double counting is intended.
protected int totalReads = 0;
int nCompressedReads = 0;   // number of reduced (consensus) reads sent to the output BAM

HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
Long nextReadNumber = 1L; // The next number to use for the compressed read name.

SortedSet<GenomeLoc> intervalList;  // sorted traversal intervals; consumed (tail-set) as reads advance in hardClipReadToInterval()

private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
|
||||
|
||||
/**
|
||||
* Basic generic initialization of the readNameHash and the intervalList. Output initialization
|
||||
* is done at the reduceInit method
|
||||
*/
|
||||
@Override
public void initialize() {
    super.initialize();
    GenomeAnalysisEngine toolkit = getToolkit();
    readNameHash = new HashMap<String, Long>(); // prepare the read name hash to keep track of what reads have had their read names compressed
    intervalList = new TreeSet<GenomeLoc>(new GenomeLocComparator()); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode

    if (toolkit.getIntervals() != null)
        intervalList.addAll(toolkit.getIntervals());

    // write the @PG program record unless explicitly suppressed
    if (!NO_PG_TAG)
        Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME);
    else
        // NOTE(review): presumably setupWriter also disables the presorted check internally,
        // since that is done explicitly here when it is skipped -- confirm.
        out.setPresorted(false);
}
|
||||
|
||||
/**
|
||||
* Takes in a read and prepares it for the SlidingWindow machinery by performing the
|
||||
* following optional clipping operations:
|
||||
* 1. Hard clip adaptor sequences
|
||||
* 2. Hard clip low quality tails
|
||||
* 3. Hard clip all remaining soft clipped bases
|
||||
* 4. Hard clip read to the intervals in the interval list (this step may produce multiple reads)
|
||||
*
|
||||
* @param ref default map parameter
|
||||
* @param read default map parameter
|
||||
* @param metaDataTracker default map parameter
|
||||
* @return a linked list with all the reads produced by the clipping operations
|
||||
*/
|
||||
@Override
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
    LinkedList<GATKSAMRecord> mappedReads;
    totalReads++;
    if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
        System.out.println("Found debug read!");

    if (debugLevel == 1)
        System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd());

    // we write the actual alignment starts to their respective alignment shift tags in the temporary
    // attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file
    read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart());
    read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd());

    // the order of these clipping operations matters: simplify, then adaptors, then low quality tails,
    // then clip to the traversal intervals (which may split the read into several pieces)
    if (!DONT_SIMPLIFY_READS)
        read.simplify();                                                // Clear all unnecessary attributes
    if (!DONT_CLIP_ADAPTOR_SEQUENCES)
        read = ReadClipper.hardClipAdaptorSequence(read);               // Strip away adaptor sequences, if any.
    if (!DONT_CLIP_LOW_QUAL_TAILS)
        read = ReadClipper.hardClipLowQualEnds(read, minTailQuality);   // Clip low quality tails
    if (!isWholeGenome())
        mappedReads = hardClipReadToInterval(read);                     // Hard clip the remainder of the read to the desired interval
    else {
        mappedReads = new LinkedList<GATKSAMRecord>();
        if (!read.isEmpty())
            mappedReads.add(read);
    }

    // optionally hard clip the low quality soft clips off every produced read, discarding reads left empty
    if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) {
        LinkedList<GATKSAMRecord> tempList = new LinkedList<GATKSAMRecord>();
        for (GATKSAMRecord mRead : mappedReads) {
            GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual);
            if (!clippedRead.isEmpty())
                tempList.add(clippedRead);
        }
        mappedReads = tempList;
    }

    if (debugLevel == 1)
        for (GATKSAMRecord mappedRead : mappedReads)
            System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd());

    return mappedReads;

}
|
||||
|
||||
/**
|
||||
* Initializes the ReduceReadsStash that keeps track of all reads that are waiting to
|
||||
* enter the SlidingWindow machinery. The stash makes sure reads are served in order
|
||||
* even though map() may generate reads that are only supposed to enter the machinery
|
||||
* in the future.
|
||||
*
|
||||
* @return the empty stash
|
||||
*/
|
||||
@Override
|
||||
public ReduceReadsStash reduceInit() {
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes the list of reads produced by map(), adds them to the stash (which keeps them sorted) and process
|
||||
* all reads that come before the original read (the read that was passed to map) including the original
|
||||
* read. This is where we send reads, in order, to the SlidingWindow machinery.
|
||||
*
|
||||
* @param mappedReads the list of reads sent by map
|
||||
* @param stash the stash that keeps the reads in order for processing
|
||||
* @return the stash with all reads that have not been processed yet
|
||||
*/
|
||||
public ReduceReadsStash reduce(LinkedList<GATKSAMRecord> mappedReads, ReduceReadsStash stash) {
|
||||
if (debugLevel == 1)
|
||||
stash.print();
|
||||
|
||||
boolean firstRead = true;
|
||||
for (GATKSAMRecord read : mappedReads) {
|
||||
boolean originalRead = firstRead && isOriginalRead(mappedReads, read);
|
||||
|
||||
if (read.getReadLength() == 0)
|
||||
throw new ReviewedStingException("Empty read sent to reduce, this should never happen! " + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
|
||||
|
||||
if (originalRead) {
|
||||
List<GATKSAMRecord> readsReady = new LinkedList<GATKSAMRecord>();
|
||||
readsReady.addAll(stash.getAllReadsBefore(read));
|
||||
readsReady.add(read);
|
||||
|
||||
for (GATKSAMRecord readReady : readsReady) {
|
||||
if (debugLevel == 1)
|
||||
System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd());
|
||||
|
||||
for (GATKSAMRecord compressedRead : stash.compress(readReady))
|
||||
outputRead(compressedRead);
|
||||
|
||||
}
|
||||
} else
|
||||
stash.add(read);
|
||||
|
||||
firstRead = false;
|
||||
}
|
||||
|
||||
return stash;
|
||||
}
|
||||
|
||||
/**
|
||||
* Now that no more reads will come, we process all the remaining reads in the stash, in order.
|
||||
*
|
||||
* @param stash the ReduceReadsStash with all unprocessed reads (from reduce)
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(ReduceReadsStash stash) {
|
||||
|
||||
// output any remaining reads in the compressor
|
||||
for (GATKSAMRecord read : stash.close())
|
||||
outputRead(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hard clips away all parts of the read that doesn't agree with the intervals selected.
|
||||
*
|
||||
* Note: If read overlaps more than one interval, it will be hard clipped to all
|
||||
* the intervals it overlaps with
|
||||
*
|
||||
* @param read the read to be hard clipped to the interval.
|
||||
* @return a shallow copy of the read hard clipped to the interval
|
||||
*/
|
||||
private LinkedList<GATKSAMRecord> hardClipReadToInterval(GATKSAMRecord read) {
    LinkedList<GATKSAMRecord> clippedReads = new LinkedList<GATKSAMRecord>();

    GenomeLoc intervalOverlapped = null; // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list)

    boolean originalRead = true;  // false if this is the right tail of the original read
    boolean overlap;              // keeps track of the interval that overlapped the original read
    boolean doneClipping;         // triggers an early exit if we are done clipping this read

    if (isWholeGenome())
        clippedReads.add(read);   // if we don't have intervals (wgs) the read goes in unchanged

    for (GenomeLoc interval : intervalList) {

        if (read.isEmpty())       // nothing to do with an empty read (could have been fully clipped before)
            break;

        GATKSAMRecord clippedRead = null; // this will hold the read clipped to the interval to be added in the end of the switch

        // each case must set both `overlap` and `doneClipping`; `clippedRead` stays null when nothing
        // is to be emitted for this interval
        switch (ReadUtils.getReadAndIntervalOverlapType(read, interval)) {
            case NO_OVERLAP_RIGHT: // no reads on this interval, check the next interval if this is the original read
                if (!originalRead) // something went wrong if this is the tail of the read
                    throw new ReviewedStingException("tail of the read should never NO_OVERLAP_RIGHT the following interval. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
                overlap = false;
                doneClipping = false;
                break;


            case NO_OVERLAP_HARDCLIPPED_RIGHT: // read used to overlap but got hard clipped and doesn't overlap anymore
                if (originalRead) {
                    overlap = true; // effectively, we have found the read's location and now we are going to try and match it's tail (which happens to be the entire read).
                    clippedRead = GATKSAMRecord.emptyRead(read);
                } else
                    overlap = false;

                doneClipping = false;
                break;

            case NO_OVERLAP_CONTIG: // read is in a different contig
                if (originalRead) { // the original read can be in a bigger contig, but not on a smaller one.
                    if (read.getReferenceIndex() < interval.getContigIndex())
                        throw new ReviewedStingException("read is behind interval list. (contig) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
                    else {
                        overlap = false;
                        doneClipping = false;
                    }
                } // tail read CANNOT be in a different contig.
                else {
                    if (read.getReferenceIndex() < interval.getContigIndex()) {
                        overlap = false;
                        doneClipping = true;
                    } else
                        throw new ReviewedStingException("Tail read is in bigger contig than interval traversal. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());

                }
                break;

            case NO_OVERLAP_LEFT:
                if (originalRead) // if this is the first read this should never happen.
                    throw new ReviewedStingException("original read cannot be behind the first interval. (position) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());

                overlap = false;
                doneClipping = true;
                break;

            case NO_OVERLAP_HARDCLIPPED_LEFT: // read used to overlap but got hard clipped and doesn't overlap anymore
                overlap = originalRead; // if this is the original read, we should not advance the interval list, the original overlap was here.
                doneClipping = true;
                break;

            case OVERLAP_LEFT: // clip the left tail of the read
                clippedRead = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStart() - 1);

                overlap = true;
                doneClipping = true;
                break;

            case OVERLAP_RIGHT: // clip the right tail of the read and try to match it to the next interval
                clippedRead = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, interval.getStop() + 1);
                read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());

                overlap = true;
                doneClipping = false;
                break;

            case OVERLAP_LEFT_AND_RIGHT: // clip both left and right ends of the read
                clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, interval.getStart() - 1, interval.getStop() + 1);
                read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());

                overlap = true;
                doneClipping = false;
                break;

            case OVERLAP_CONTAINED: // don't do anything to the read
                clippedRead = read;

                overlap = true;
                doneClipping = true;
                break;

            default:
                throw new ReviewedStingException("interval overlap returned an unknown / unhandled state. If new state was added to intervalOverlap, it should be handled by hardClipReadToInterval.");
        }

        if (overlap && originalRead)
            intervalOverlapped = interval;

        if (clippedRead != null) {
            originalRead = false;

            if (!clippedRead.isEmpty())
                clippedReads.add(clippedRead); // if the read overlaps the interval entirely within a deletion, it will be entirely clipped off
        }

        if (doneClipping)
            break;
    }

    // drop the intervals the original read already moved past, so later reads start their scan here
    if (intervalOverlapped != null)
        intervalList = intervalList.tailSet(intervalOverlapped);

    return clippedReads;
}
|
||||
|
||||
/**
|
||||
* Compresses the read name and adds it to output BAM file (reduced BAM)
|
||||
* after performing some quality control
|
||||
*
|
||||
* @param read any read
|
||||
*/
|
||||
private void outputRead(GATKSAMRecord read) {
    // extra per-read QC only at the highest debug verbosity
    if (debugLevel == 2) {
        checkForHighMismatch(read);
        checkCigar(read);
    }

    if (read.isReducedRead())
        nCompressedReads++;
    else {
        // the original (pre-clipping) alignment boundaries were stored in map() as temporary attributes
        int originalAlignmentStart = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT);
        int originalAlignmentEnd = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT);

        int startShift = originalAlignmentStart - read.getUnclippedStart(); // we annotate the shifts for better compression
        int endShift = read.getUnclippedEnd() - originalAlignmentEnd; // we annotate the shifts for better compression

        if (startShift > 0)
            read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start)
        if (endShift > 0)
            read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end)

        // NOTE(review): map() already increments totalReads for every incoming read; incrementing it
        // again here for non-reduced output reads looks like double counting -- confirm intended.
        totalReads++;
    }

    if (debugLevel == 1)
        System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd());

    // NOTE(review): deliberately disabled call left in place -- see reSoftClipBases below.
    //        if (!DONT_USE_SOFTCLIPPED_BASES)
    //            reSoftClipBases(read);

    if (!DONT_COMPRESS_READ_NAMES)
        compressReadName(read);

    out.addAlignment(read);
}
|
||||
|
||||
private void reSoftClipBases(GATKSAMRecord read) {
|
||||
Integer left = (Integer) read.getTemporaryAttribute("SL");
|
||||
Integer right = (Integer) read.getTemporaryAttribute("SR");
|
||||
if (left != null || right != null) {
|
||||
Cigar newCigar = new Cigar();
|
||||
for (CigarElement element : read.getCigar().getCigarElements()) {
|
||||
newCigar.add(new CigarElement(element.getLength(), element.getOperator()));
|
||||
}
|
||||
|
||||
if (left != null) {
|
||||
newCigar = updateFirstSoftClipCigarElement(left, newCigar);
|
||||
read.setAlignmentStart(read.getAlignmentStart() + left);
|
||||
}
|
||||
|
||||
if (right != null) {
|
||||
Cigar invertedCigar = invertCigar(newCigar);
|
||||
newCigar = invertCigar(updateFirstSoftClipCigarElement(right, invertedCigar));
|
||||
}
|
||||
read.setCigar(newCigar);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Facility routine to revert the first element of a Cigar string (skipping hard clips) into a soft-clip.
|
||||
* To be used on both ends if provided a flipped Cigar
|
||||
*
|
||||
* @param softClipSize the length of the soft clipped element to add
|
||||
* @param originalCigar the original Cigar string
|
||||
* @return a new Cigar object with the soft clips added
|
||||
*/
|
||||
private Cigar updateFirstSoftClipCigarElement (int softClipSize, Cigar originalCigar) {
|
||||
Cigar result = new Cigar();
|
||||
CigarElement leftElement = new CigarElement(softClipSize, CigarOperator.S);
|
||||
boolean updated = false;
|
||||
for (CigarElement element : originalCigar.getCigarElements()) {
|
||||
if (!updated && element.getOperator() == CigarOperator.M) {
|
||||
result.add(leftElement);
|
||||
int newLength = element.getLength() - softClipSize;
|
||||
if (newLength > 0)
|
||||
result.add(new CigarElement(newLength, CigarOperator.M));
|
||||
updated = true;
|
||||
}
|
||||
else
|
||||
result.add(element);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a cigar string, returns the inverted cigar string.
|
||||
*
|
||||
* @param cigar the original cigar
|
||||
* @return the inverted cigar
|
||||
*/
|
||||
private Cigar invertCigar(Cigar cigar) {
|
||||
Stack<CigarElement> stack = new Stack<CigarElement>();
|
||||
for (CigarElement e : cigar.getCigarElements())
|
||||
stack.push(e);
|
||||
Cigar inverted = new Cigar();
|
||||
while (!stack.empty()) {
|
||||
inverted.add(stack.pop());
|
||||
}
|
||||
return inverted;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Quality control procedure that checks if the consensus reads contains too many
|
||||
* mismatches with the reference. This should never happen and is a good trigger for
|
||||
* errors with the algorithm.
|
||||
*
|
||||
* @param read any read
|
||||
*/
|
||||
private void checkForHighMismatch(GATKSAMRecord read) {
|
||||
final int start = read.getAlignmentStart();
|
||||
final int stop = read.getAlignmentEnd();
|
||||
final byte[] ref = getToolkit().getReferenceDataSource().getReference().getSubsequenceAt(read.getReferenceName(), start, stop).getBases();
|
||||
final int nm = SequenceUtil.countMismatches(read, ref, start - 1);
|
||||
final int readLen = read.getReadLength();
|
||||
final double nmFraction = nm / (1.0 * readLen);
|
||||
if (nmFraction > 0.4 && readLen > 20 && read.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG) != null && read.getReadName().startsWith("Consensus"))
|
||||
throw new ReviewedStingException("BUG: High mismatch fraction found in read " + read.getReadName() + " position: " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
|
||||
}
|
||||
|
||||
private void checkCigar (GATKSAMRecord read) {
|
||||
if (read.getCigar().isValid(null, -1) != null) {
|
||||
throw new ReviewedStingException("BUG: cigar string is not valid: " + read.getCigarString());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compresses the read name using the readNameHash if we have already compressed
|
||||
* this read name before.
|
||||
*
|
||||
* @param read any read
|
||||
*/
|
||||
private void compressReadName(GATKSAMRecord read) {
|
||||
String name = read.getReadName();
|
||||
String compressedName = read.isReducedRead() ? "C" : "";
|
||||
if (readNameHash.containsKey(name))
|
||||
compressedName += readNameHash.get(name).toString();
|
||||
else {
|
||||
readNameHash.put(name, nextReadNumber);
|
||||
compressedName += nextReadNumber.toString();
|
||||
nextReadNumber++;
|
||||
}
|
||||
|
||||
read.setReadName(compressedName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the read is the original read that went through map().
|
||||
*
|
||||
* This is important to know so we can decide what reads to pull from the stash. Only reads that came before the original read should be pulled.
|
||||
*
|
||||
* @param list the list
|
||||
* @param read the read
|
||||
* @return Returns true if the read is the original read that went through map().
|
||||
*/
|
||||
private boolean isOriginalRead(LinkedList<GATKSAMRecord> list, GATKSAMRecord read) {
|
||||
return isWholeGenome() || (list.getFirst().equals(read) && ReadUtils.getReadAndIntervalOverlapType(read, intervalList.first()) == ReadUtils.ReadAndIntervalOverlap.OVERLAP_CONTAINED);
|
||||
}
|
||||
|
||||
/**
 * Checks whether or not the intervalList is empty, meaning we're running in WGS mode.
 *
 * @return whether or not we're running in WGS mode.
 */
private boolean isWholeGenome() {
    return intervalList.isEmpty();  // no target intervals were provided => whole-genome run
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,83 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author depristo
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SingleSampleCompressor implements Compressor {
|
||||
protected static final Logger logger = Logger.getLogger(SingleSampleCompressor.class);
|
||||
|
||||
protected final int contextSize;
|
||||
protected final int downsampleCoverage;
|
||||
protected int minMappingQuality;
|
||||
protected int slidingWindowCounter;
|
||||
|
||||
protected final String sampleName;
|
||||
|
||||
protected SlidingWindow slidingWindow;
|
||||
protected double minAltProportionToTriggerVariant;
|
||||
protected double minIndelProportionToTriggerVariant;
|
||||
protected int minBaseQual;
|
||||
|
||||
protected ReduceReadsWalker.DownsampleStrategy downsampleStrategy;
|
||||
|
||||
public SingleSampleCompressor(final String sampleName,
|
||||
final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReadsWalker.DownsampleStrategy downsampleStrategy) {
|
||||
this.sampleName = sampleName;
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
this.minMappingQuality = minMappingQuality;
|
||||
this.slidingWindowCounter = 0;
|
||||
this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant;
|
||||
this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
|
||||
this.minBaseQual = minBaseQual;
|
||||
this.downsampleStrategy = downsampleStrategy;
|
||||
}
|
||||
|
||||
/**
|
||||
* @{inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> addAlignment( GATKSAMRecord read ) {
|
||||
TreeSet<GATKSAMRecord> result = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
int readOriginalStart = read.getUnclippedStart();
|
||||
|
||||
// create a new window if:
|
||||
if ((slidingWindow != null) &&
|
||||
( ( read.getReferenceIndex() != slidingWindow.getContigIndex() ) || // this is a brand new contig
|
||||
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
|
||||
|
||||
// close the current sliding window
|
||||
result.addAll(slidingWindow.close());
|
||||
slidingWindow = null; // so we create a new one on the next if
|
||||
}
|
||||
|
||||
if ( slidingWindow == null) { // this is the first read
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
|
||||
slidingWindowCounter++;
|
||||
}
|
||||
|
||||
result.addAll(slidingWindow.addRead(read));
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> close() {
|
||||
return (slidingWindow != null) ? slidingWindow.close() : new TreeSet<GATKSAMRecord>();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,713 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.gatk.downsampling.FractionalDownsampler;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: roger
|
||||
* Date: 8/3/11
|
||||
* Time: 2:24 PM
|
||||
*/
|
||||
public class SlidingWindow {
|
||||
|
||||
// Sliding Window data
final private LinkedList<GATKSAMRecord> readsInWindow;      // reads currently overlapping the window
final private LinkedList<HeaderElement> windowHeader;       // per-reference-position accounting (base/qual counts), head == window start
protected int contextSize; // the largest context size (between mismatches and indels)
protected int stopLocation;                                 // reference coordinate of the window's right edge (-1 until set)
protected String contig;
protected int contigIndex;
protected SAMFileHeader header;
protected GATKSAMReadGroupRecord readGroupAttribute;        // read group assigned to the synthetic reads we emit
protected int downsampleCoverage;                           // 0 disables downsampling of variant regions

// Running consensus data
protected SyntheticRead runningConsensus;                   // consensus under construction, null when none is open
protected int consensusCounter;                             // numbers consensus reads within this window
protected String consensusReadName;                         // name prefix, e.g. "Consensus-<window>-"

// Filtered Data Consensus data
protected SyntheticRead filteredDataConsensus;              // consensus built from filtered (low qual/mapping) data
protected int filteredDataConsensusCounter;
protected String filteredDataReadName;                      // name prefix, e.g. "Filtered-<window>-"


// Additional parameters
protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches
protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions
protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value
protected int MIN_MAPPING_QUALITY;

protected ReduceReadsWalker.DownsampleStrategy downsampleStrategy;
private boolean hasIndelQualities;                          // whether input reads carry base insertion/deletion qualities

/**
 * The types of synthetic reads to use in the finalizeAndAdd method
 */
private enum ConsensusType {
    CONSENSUS,
    FILTERED,
    BOTH
}
|
||||
|
||||
// Reference coordinate of the last position covered by the window (-1 if unset).
public int getStopLocation() {
    return stopLocation;
}

// Name of the contig this window is on.
public String getContig() {
    return contig;
}

// Index of the contig this window is on.
public int getContigIndex() {
    return contigIndex;
}

// Reference coordinate of the first header element, or -1 when the window is empty.
public int getStartLocation() {
    return windowHeader.isEmpty() ? -1 : windowHeader.peek().getLocation();
}
|
||||
|
||||
|
||||
/**
 * Creates an empty sliding window anchored on a contig.
 *
 * @param contig                            contig name the window operates on
 * @param contigIndex                       contig index the window operates on
 * @param contextSize                       context to keep around variant sites (mismatches and indels)
 * @param header                            SAM file header used for the synthetic reads
 * @param readGroupAttribute                read group assigned to the synthetic reads
 * @param windowNumber                      sequential window id, used to name consensus reads
 * @param minAltProportionToTriggerVariant  mismatch proportion that triggers a variant region
 * @param minIndelProportionToTriggerVariant indel proportion that triggers a variant region
 * @param minBaseQual                       minimum base quality to count a base
 * @param minMappingQuality                 minimum mapping quality to count a read
 * @param downsampleCoverage                target coverage for variant-region downsampling (0 disables)
 * @param downsampleStrategy                how to downsample variant regions
 * @param hasIndelQualities                 whether input reads carry base indel qualities
 */
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReadsWalker.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) {
    this.stopLocation = -1;   // no position covered yet
    this.contextSize = contextSize;
    this.downsampleCoverage = downsampleCoverage;

    this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant;
    this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant;
    this.MIN_BASE_QUAL_TO_COUNT = minBaseQual;
    this.MIN_MAPPING_QUALITY = minMappingQuality;

    this.windowHeader = new LinkedList<HeaderElement>();
    this.readsInWindow = new LinkedList<GATKSAMRecord>();

    this.contig = contig;
    this.contigIndex = contigIndex;
    this.header = header;
    this.readGroupAttribute = readGroupAttribute;

    // consensus reads are named "<prefix><windowNumber>-<counter>"
    this.consensusCounter = 0;
    this.consensusReadName = "Consensus-" + windowNumber + "-";

    this.filteredDataConsensusCounter = 0;
    this.filteredDataReadName = "Filtered-" + windowNumber + "-";

    this.runningConsensus = null;
    this.filteredDataConsensus = null;

    this.downsampleStrategy = downsampleStrategy;
    this.hasIndelQualities = hasIndelQualities;
}
|
||||
|
||||
/**
 * Add a read to the sliding window and slides the window accordingly.
 *
 * Reads are assumed to be in order, therefore, when a read is added the sliding window can
 * assume that no more reads will affect read.getUnclippedStart() - contextSizeMismatches. The window
 * slides forward to that position and returns all reads that may have been finalized in the
 * sliding process.
 *
 * @param read the read
 * @return a list of reads that have been finished by sliding the window.
 */
public List<GATKSAMRecord> addRead(GATKSAMRecord read) {
    // note: counts must be updated BEFORE the slide, so the new read's span is accounted for
    updateHeaderCounts(read, false); // update the window header counts
    readsInWindow.add(read); // add read to sliding reads
    return slideWindow(read.getUnclippedStart());
}
|
||||
|
||||
/**
|
||||
* returns the next complete or incomplete variant region between 'from' (inclusive) and 'to' (exclusive)
|
||||
*
|
||||
* @param from beginning window header index of the search window (inclusive)
|
||||
* @param to end window header index of the search window (exclusive)
|
||||
* @param variantSite boolean array with true marking variant regions
|
||||
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
|
||||
*/
|
||||
private Pair<Integer, Integer> getNextVariantRegion(int from, int to, boolean[] variantSite) {
|
||||
boolean foundStart = false;
|
||||
int variantRegionStartIndex = 0;
|
||||
for (int i=from; i<to; i++) {
|
||||
if (variantSite[i] && !foundStart) {
|
||||
variantRegionStartIndex = i;
|
||||
foundStart = true;
|
||||
}
|
||||
else if(!variantSite[i] && foundStart) {
|
||||
return(new Pair<Integer, Integer>(variantRegionStartIndex, i-1));
|
||||
}
|
||||
}
|
||||
return (foundStart) ? new Pair<Integer, Integer>(variantRegionStartIndex, -1) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a list with all the complete and incomplete variant regions within 'from' (inclusive) and 'to' (exclusive)
|
||||
*
|
||||
* @param from beginning window header index of the search window (inclusive)
|
||||
* @param to end window header index of the search window (exclusive)
|
||||
* @param variantSite boolean array with true marking variant regions
|
||||
* @return a list with start/stops of variant regions following getNextVariantRegion description
|
||||
*/
|
||||
private List<Pair<Integer, Integer>> getAllVariantRegions(int from, int to, boolean[] variantSite) {
|
||||
List<Pair<Integer,Integer>> regions = new LinkedList<Pair<Integer, Integer>>();
|
||||
int index = from;
|
||||
while(index < to) {
|
||||
Pair<Integer,Integer> result = getNextVariantRegion(index, to, variantSite);
|
||||
if (result == null)
|
||||
break;
|
||||
|
||||
regions.add(result);
|
||||
if (result.getSecond() < 0)
|
||||
break;
|
||||
index = result.getSecond() + 1;
|
||||
}
|
||||
return regions;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the window can be slid given the new incoming read.
|
||||
*
|
||||
* We check from the start of the window to the (unclipped) start of the new incoming read if there
|
||||
* is any variant.
|
||||
* If there are variant sites, we check if it's time to close the variant region.
|
||||
*
|
||||
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
|
||||
* @return all reads that have fallen to the left of the sliding window after the slide
|
||||
*/
|
||||
protected List<GATKSAMRecord> slideWindow(int incomingReadUnclippedStart) {
|
||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
if (incomingReadUnclippedStart - contextSize > getStartLocation()) {
|
||||
int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation();
|
||||
boolean[] variantSite = markSites(getStartLocation() + readStartHeaderIndex);
|
||||
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
||||
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, variantSite);
|
||||
finalizedReads = closeVariantRegions(regions, false);
|
||||
|
||||
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
||||
if (read.getAlignmentEnd() < getStartLocation()) {
|
||||
readsToRemove.add(read);
|
||||
}
|
||||
}
|
||||
for (GATKSAMRecord read : readsToRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
}
|
||||
|
||||
return finalizedReads;
|
||||
}
|
||||
|
||||
/**
 * returns an array marked with variant and non-variant regions (it uses
 * markVariantRegions to make the marks)
 *
 * @param stop check the window from start to stop (not-inclusive)
 * @return a boolean array with 'true' marking variant regions and false marking consensus sites
 */
protected boolean[] markSites(int stop) {

    // the array has contextSize + 1 extra slots past 'stop' so markVariantRegion can
    // spill its context marks beyond the last inspected site without bounds issues
    boolean[] markedSites = new boolean[stop - getStartLocation() + contextSize + 1];

    Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
    // i walks reference coordinates; the iterator walks header elements in lockstep
    for (int i = getStartLocation(); i < stop; i++) {
        if (headerElementIterator.hasNext()) {
            HeaderElement headerElement = headerElementIterator.next();

            // a site is variant when its mismatch or indel proportion exceeds the thresholds
            if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
                markVariantRegion(markedSites, i - getStartLocation());

        } else
            break;  // header ran out before 'stop'; the remainder stays unmarked (non-variant)
    }
    return markedSites;
}
|
||||
|
||||
/**
|
||||
* Marks the sites around the variant site (as true)
|
||||
*
|
||||
* @param markedSites the boolean array to bear the marks
|
||||
* @param variantSiteLocation the location where a variant site was found
|
||||
*/
|
||||
protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) {
|
||||
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
|
||||
int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? markedSites.length : variantSiteLocation + contextSize + 1;
|
||||
for (int i = from; i < to; i++)
|
||||
markedSites[i] = true;
|
||||
}
|
||||
|
||||
/**
 * Adds bases to the running consensus or filtered data accordingly
 *
 * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus
 *
 * @param start the first header index to add to consensus
 * @param end the first header index NOT TO add to consensus
 * @return a list of consensus reads generated by this call. Empty list if no consensus was generated.
 */
protected List<GATKSAMRecord> addToSyntheticReads(int start, int end) {
    LinkedList<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
    if (start < end) {

        ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);

        if (!headerElementIterator.hasNext())
            throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end));

        HeaderElement headerElement = headerElementIterator.next();

        // The region [start, end) is processed as a sequence of maximal homogeneous runs:
        // consensus data, filtered data, or empty. Each run closes the other kind of
        // synthetic read (so reads never mix kinds), extends its own kind, then recurses
        // on the remainder of the region.
        if (headerElement.hasConsensusData()) {
            // entering a consensus run: close any open filtered-data read first
            reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));

            int endOfConsensus = findNextNonConsensusElement(start, end);
            addToRunningConsensus(start, endOfConsensus);

            // guards against an infinite recursion should the finder fail to advance
            if (endOfConsensus <= start)
                throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));

            reads.addAll(addToSyntheticReads(endOfConsensus, end));
        } else if (headerElement.hasFilteredData()) {
            // entering a filtered-data run: close any open running consensus first
            reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));

            int endOfFilteredData = findNextNonFilteredDataElement(start, end);
            addToFilteredData(start, endOfFilteredData);

            if (endOfFilteredData <= start)
                throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));

            reads.addAll(addToSyntheticReads(endOfFilteredData, end));
        } else if (headerElement.isEmpty()) {
            // a gap: both synthetic reads (if open) must be finalized across it
            reads.addAll(finalizeAndAdd(ConsensusType.BOTH));

            int endOfEmptyData = findNextNonEmptyElement(start, end);

            if (endOfEmptyData <= start)
                throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));

            reads.addAll(addToSyntheticReads(endOfEmptyData, end));
        } else
            throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));

    }

    return reads;
}
|
||||
|
||||
/**
|
||||
* Finalizes one or more synthetic reads.
|
||||
*
|
||||
* @param type the synthetic reads you want to close
|
||||
* @return the GATKSAMRecords generated by finalizing the synthetic reads
|
||||
*/
|
||||
private List<GATKSAMRecord> finalizeAndAdd(ConsensusType type) {
|
||||
GATKSAMRecord read = null;
|
||||
List<GATKSAMRecord> list = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
switch (type) {
|
||||
case CONSENSUS:
|
||||
read = finalizeRunningConsensus();
|
||||
break;
|
||||
case FILTERED:
|
||||
read = finalizeFilteredDataConsensus();
|
||||
break;
|
||||
case BOTH:
|
||||
read = finalizeRunningConsensus();
|
||||
if (read != null) list.add(read);
|
||||
read = finalizeFilteredDataConsensus();
|
||||
}
|
||||
if (read != null)
|
||||
list.add(read);
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks for the next position without consensus data
|
||||
*
|
||||
* @param start beginning of the filtered region
|
||||
* @param upTo limit to search for another consensus element
|
||||
* @return next position with consensus data or empty
|
||||
*/
|
||||
private int findNextNonConsensusElement(int start, int upTo) {
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
int index = start;
|
||||
while (index < upTo) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("There are no more header elements in this window");
|
||||
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
if (!headerElement.hasConsensusData())
|
||||
break;
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks for the next position without filtered data
|
||||
*
|
||||
* @param start beginning of the region
|
||||
* @param upTo limit to search for
|
||||
* @return next position with no filtered data
|
||||
*/
|
||||
private int findNextNonFilteredDataElement(int start, int upTo) {
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
int index = start;
|
||||
while (index < upTo) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("There are no more header elements in this window");
|
||||
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
if (!headerElement.hasFilteredData() || headerElement.hasConsensusData())
|
||||
break;
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks for the next non-empty header element
|
||||
*
|
||||
* @param start beginning of the region
|
||||
* @param upTo limit to search for
|
||||
* @return next position with non-empty element
|
||||
*/
|
||||
private int findNextNonEmptyElement(int start, int upTo) {
|
||||
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
int index = start;
|
||||
while (index < upTo) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("There are no more header elements in this window");
|
||||
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
if (!headerElement.isEmpty())
|
||||
break;
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Adds bases to the filtered data synthetic read.
 *
 * Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
 * bases.
 *
 * NOTE: near-duplicate of addToRunningConsensus; they differ only in which synthetic
 * read, counter and per-element counts they use.
 *
 * @param start the first header index to add to consensus
 * @param end the first header index NOT TO add to consensus
 */
private void addToFilteredData(int start, int end) {
    // lazily open a new filtered-data synthetic read anchored at the first element's location
    // (note the counter post-increment embedded in the name)
    if (filteredDataConsensus == null)
        filteredDataConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);

    ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
    for (int index = start; index < end; index++) {
        if (!headerElementIterator.hasNext())
            throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist");

        HeaderElement headerElement = headerElementIterator.next();
        // sanity checks: the [start, end) run must be pure filtered data (see addToSyntheticReads)
        if (headerElement.hasConsensusData())
            throw new ReviewedStingException("Found consensus data inside region to add to filtered data.");

        if (!headerElement.hasFilteredData())
            throw new ReviewedStingException("No filtered data in " + index);

        genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
    }
}
|
||||
|
||||
/**
 * Adds bases to the running consensus synthetic read.
 *
 * Like addToFilteredData, this method assumes a contiguous sequence of consensus
 * bases. (The previous javadoc was a copy-paste of addToFilteredData's.)
 *
 * @param start the first header index to add to consensus
 * @param end the first header index NOT TO add to consensus
 */
private void addToRunningConsensus(int start, int end) {
    // lazily open a new consensus synthetic read anchored at the first element's location
    // (note the counter post-increment embedded in the name)
    if (runningConsensus == null)
        runningConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);

    Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
    for (int index = start; index < end; index++) {
        if (!headerElementIterator.hasNext())
            throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");

        HeaderElement headerElement = headerElementIterator.next();
        // sanity check: the [start, end) run must be pure consensus data (see addToSyntheticReads)
        if (!headerElement.hasConsensusData())
            throw new ReviewedStingException("No CONSENSUS data in " + index);

        genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS());
    }
}
|
||||
|
||||
/**
|
||||
* Generic accessor to add base and qualities to a synthetic read
|
||||
*
|
||||
* @param syntheticRead the synthetic read to add to
|
||||
* @param baseCounts the base counts object in the header element
|
||||
* @param rms the rms mapping quality in the header element
|
||||
*/
|
||||
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
|
||||
BaseIndex base = baseCounts.baseIndexWithMostCounts();
|
||||
byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfMostCommonBase();
|
||||
byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
|
||||
byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
|
||||
syntheticRead.add(base, count, qual, insQual, delQual, rms);
|
||||
}
|
||||
|
||||
/**
 * Finalizes a variant region, any adjacent synthetic reads.
 *
 * @param start the first window header index in the variant region (inclusive)
 * @param stop the last window header index of the variant region (inclusive)
 * @return all reads contained in the variant region plus any adjacent synthetic reads
 */
@Requires("start <= stop")
protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) {
    List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();

    int refStart = windowHeader.get(start).getLocation(); // All operations are reference based, not read based
    int refStop = windowHeader.get(stop).getLocation();

    for (GATKSAMRecord read : readsInWindow) { // Keep all reads that overlap the variant region
        if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) {
            allReads.add(read);
            updateHeaderCounts(read, true); // Remove this read from the window header entirely
        }
    }

    // downsample the raw reads when requested, then flush everything to the LEFT of
    // the region as synthetic reads and close any still-open synthetic read
    List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
    result.addAll(addToSyntheticReads(0, start));
    result.addAll(finalizeAndAdd(ConsensusType.BOTH));

    for (GATKSAMRecord read : result) {
        readsInWindow.remove(read); // todo -- not optimal, but needs to be done so the next region doesn't try to remove the same reads from the header counts.
    }

    return result; // finalized reads will be downsampled if necessary
}
|
||||
|
||||
|
||||
/**
 * Closes every variant region in the given list and trims the window header
 * up to the last region closed.
 *
 * @param regions    list of (start, stop) window-header index pairs; a negative stop
 *                   marks a region that is still open at the window's edge
 * @param forceClose if true, open regions (stop < 0) are closed at the current end of the header
 * @return all reads produced by closing the regions (empty list if there were none)
 */
private List<GATKSAMRecord> closeVariantRegions(List<Pair<Integer, Integer>> regions, boolean forceClose) {
    List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
    if (!regions.isEmpty()) {
        int lastStop = -1; // header index of the last region actually closed; -1 means none were
        for (Pair<Integer, Integer> region : regions) {
            int start = region.getFirst();
            int stop = region.getSecond();
            if (stop < 0 && forceClose)
                stop = windowHeader.size() - 1; // close the open-ended region at the edge of the window
            if (stop >= 0) {
                allReads.addAll(closeVariantRegion(start, stop));
                lastStop = stop;
            }
        }
        for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
            windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
    }
    return allReads;
}
|
||||
|
||||
/**
|
||||
* Downsamples a variant region to the downsample coverage of the sliding window.
|
||||
*
|
||||
* It will use the downsampling strategy defined by the SlidingWindow
|
||||
*
|
||||
* @param allReads the reads to select from (all reads that cover the window)
|
||||
* @return a list of reads selected by the downsampler to cover the window to at least the desired coverage
|
||||
*/
|
||||
protected List<GATKSAMRecord> downsampleVariantRegion(final List<GATKSAMRecord> allReads) {
|
||||
double fraction = 100 / allReads.size();
|
||||
if (fraction >= 1)
|
||||
return allReads;
|
||||
|
||||
FractionalDownsampler <GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(fraction);
|
||||
downsampler.submit(allReads);
|
||||
return downsampler.consumeDownsampledItems();
|
||||
}
|
||||
|
||||
/**
|
||||
* Properly closes a Sliding Window, finalizing all consensus and variant
|
||||
* regions that still exist regardless of being able to fulfill the
|
||||
* context size requirement in the end.
|
||||
*
|
||||
* @return All reads generated
|
||||
*/
|
||||
public List<GATKSAMRecord> close() {
|
||||
// mark variant regions
|
||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
boolean[] variantSite = markSites(stopLocation + 1);
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), variantSite);
|
||||
finalizedReads = closeVariantRegions(regions, true);
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
finalizedReads.addAll(addToSyntheticReads(0, windowHeader.size() - 1));
|
||||
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
|
||||
}
|
||||
|
||||
}
|
||||
return finalizedReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* generates the SAM record for the running consensus read and resets it (to null)
|
||||
*
|
||||
* @return the read contained in the running consensus
|
||||
*/
|
||||
protected GATKSAMRecord finalizeRunningConsensus() {
|
||||
GATKSAMRecord finalizedRead = null;
|
||||
if (runningConsensus != null) {
|
||||
if (runningConsensus.size() > 0)
|
||||
finalizedRead = runningConsensus.close();
|
||||
else
|
||||
consensusCounter--;
|
||||
|
||||
runningConsensus = null;
|
||||
}
|
||||
return finalizedRead;
|
||||
}
|
||||
|
||||
/**
|
||||
* generates the SAM record for the filtered data consensus and resets it (to null)
|
||||
*
|
||||
* @return the read contained in the running consensus
|
||||
*/
|
||||
protected GATKSAMRecord finalizeFilteredDataConsensus() {
|
||||
GATKSAMRecord finalizedRead = null;
|
||||
if (filteredDataConsensus != null) {
|
||||
if (filteredDataConsensus.size() > 0)
|
||||
finalizedRead = filteredDataConsensus.close();
|
||||
else
|
||||
filteredDataConsensusCounter--;
|
||||
|
||||
filteredDataConsensus = null;
|
||||
}
|
||||
return finalizedRead;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Updates the sliding window's header counts with the incoming read bases, insertions
 * and deletions.
 *
 * Walks the read's CIGAR and, per reference position, adds (or removes, when
 * removeRead is true) the read's base/quality contribution to the corresponding
 * HeaderElement. When adding, the window header is grown as needed to span the read.
 *
 * @param read       the incoming read to be added to (or removed from) the sliding window
 * @param removeRead if true, subtract this read's contribution instead of adding it
 */
protected void updateHeaderCounts(GATKSAMRecord read, boolean removeRead) {
    byte[] bases = read.getReadBases();
    byte[] quals = read.getBaseQualities();
    byte[] insQuals = read.getExistingBaseInsertionQualities(); // may be null if the read carries no indel qualities
    byte[] delQuals = read.getExistingBaseDeletionQualities();  // may be null if the read carries no indel qualities
    int readStart = read.getSoftStart();
    int readEnd = read.getSoftEnd();
    Cigar cigar = read.getCigar();

    int readBaseIndex = 0;                  // index into the read's base/qual arrays
    int startLocation = getStartLocation();
    // header index of the read's first reference position (0 if the window is still empty)
    int locationIndex = startLocation < 0 ? 0 : readStart - startLocation;

    // a read being removed must still be inside the window; otherwise the bookkeeping is broken
    if (removeRead && locationIndex < 0)
        throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation);

    if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it
        if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window
            for (int i = 1; i <= -locationIndex; i++)
                windowHeader.addFirst(new HeaderElement(startLocation - i));

            startLocation = readStart; // update start location accordingly
            locationIndex = 0;
        }

        if (stopLocation < readEnd) { // Do we need to add extra elements to the header?
            int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation;
            while (elementsToAdd-- > 0)
                windowHeader.addLast(new HeaderElement(readEnd - elementsToAdd));

            stopLocation = readEnd; // update stopLocation accordingly
        }

        // Special case for leading insertions before the beginning of the sliding read
        if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) {
            windowHeader.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added
            locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing.
        }
    }

    Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(locationIndex);
    HeaderElement headerElement;
    for (CigarElement cigarElement : cigar.getCigarElements()) {
        switch (cigarElement.getOperator()) {
            case H: // hard clips consume neither read bases nor header positions
                break;
            case I:
                if (removeRead && locationIndex == 0) { // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it.
                    break;
                }

                headerElement = windowHeader.get(locationIndex - 1); // insertions are added to the base to the left (previous element)

                if (removeRead) {
                    headerElement.removeInsertionToTheRight();
                }
                else {
                    headerElement.addInsertionToTheRight();
                }
                readBaseIndex += cigarElement.getLength(); // inserted bases consume read bases but no header positions
                break; // just ignore the insertions at the beginning of the read
            case D:
                int nDeletions = cigarElement.getLength();
                while (nDeletions-- > 0) { // deletions are added to the baseCounts with the read mapping quality as it's quality score
                    headerElement = headerElementIterator.next();
                    byte mq = (byte) read.getMappingQuality();
                    if (removeRead)
                        headerElement.removeBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
                    else
                        headerElement.addBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);

                    locationIndex++; // deletions consume header positions but no read bases
                }
                break;
            case S:
            case M:
            case P:
            case EQ:
            case X:
                int nBasesToAdd = cigarElement.getLength();
                while (nBasesToAdd-- > 0) {
                    headerElement = headerElementIterator.next();
                    byte insertionQuality = insQuals == null ? -1 : insQuals[readBaseIndex]; // if the read doesn't have indel qualities, use -1 (doesn't matter the value because it won't be used for anything)
                    byte deletionQuality = delQuals == null ? -1 : delQuals[readBaseIndex];
                    // the last argument flags whether this base is soft-clipped
                    if (removeRead)
                        headerElement.removeBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
                    else
                        headerElement.addBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);

                    readBaseIndex++;   // these operators consume both a read base
                    locationIndex++;   // ... and a header position
                }
                break;
        }
    }
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,285 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Running Consensus is a read that is compressed as a sliding window travels over the reads
 * and keeps track of all the bases that are outside of variant regions.
 *
 * Consensus reads have qual fields that correspond to the number of reads that had the base
 * and passed the minimum quality threshold.
 *
 * The mapping quality of a consensus read is the average RMS of the mapping qualities of all reads
 * that compose the consensus
 *
 * @author Mauricio Carneiro
 * @since 8/26/11
 */
public class SyntheticRead {
    // per-position consensus data, one entry per reference position covered (deletions included)
    private List<BaseIndex> bases;
    private List<Byte> counts;          // number of reads contributing each base
    private List<Byte> quals;           // average base quality at each position
    private List<Byte> insertionQuals;  // average base-insertion quality at each position
    private List<Byte> deletionQuals;   // average base-deletion quality at each position
    private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus
    private String readTag;        // SAM tag name under which the per-base counts are stored

    // Information to produce a GATKSAMRecord
    private SAMFileHeader header;
    private GATKSAMReadGroupRecord readGroupRecord;
    private String contig;
    private int contigIndex;
    private String readName;
    private Integer refStart;      // alignment start; may be shifted forward by buildCigar() if there are leading deletions
    private boolean hasIndelQualities = false;

    /**
     * Full initialization of the running consensus if you have all the information and are ready to
     * start adding to the running consensus.
     *
     * @param header GATKSAMRecord file header
     * @param readGroupRecord Read Group for the GATKSAMRecord
     * @param contig the read's contig name
     * @param contigIndex the read's contig index
     * @param readName the read's name
     * @param refStart the alignment start (reference based)
     * @param readTag the reduce reads tag for the synthetic read
     * @param hasIndelQualities whether insertion/deletion qualities should be emitted on the final read
     */
    public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities) {
        final int initialCapacity = 10000; // pre-size the lists to avoid repeated growth while the window slides
        bases = new ArrayList<BaseIndex>(initialCapacity);
        counts = new ArrayList<Byte>(initialCapacity);
        quals = new ArrayList<Byte>(initialCapacity);
        insertionQuals = new ArrayList<Byte>(initialCapacity);
        deletionQuals = new ArrayList<Byte>(initialCapacity);
        mappingQuality = 0.0;

        this.readTag = readTag;
        this.header = header;
        this.readGroupRecord = readGroupRecord;
        this.contig = contig;
        this.contigIndex = contigIndex;
        this.readName = readName;
        this.refStart = refStart;
        this.hasIndelQualities = hasIndelQualities;
    }

    /**
     * Initialization with pre-built per-position data (used mainly for testing and splitting).
     */
    public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities) {
        this.bases = bases;
        this.counts = counts;
        this.quals = quals;
        this.insertionQuals = insertionQuals;
        this.deletionQuals = deletionQuals;
        this.mappingQuality = mappingQuality;
        this.readTag = readTag;
        this.header = header;
        this.readGroupRecord = readGroupRecord;
        this.contig = contig;
        this.contigIndex = contigIndex;
        this.readName = readName;
        this.refStart = refStart;
        this.hasIndelQualities = hasIndelQualities;
    }

    /**
     * Easy access to keep adding to a running consensus that has already been
     * initialized with the correct read name and refStart
     *
     * @param base the base to add
     * @param count number of reads with this base
     * @param qual average base quality
     * @param insQual average insertion quality
     * @param delQual average deletion quality
     * @param mappingQuality rms mapping quality contribution (accumulated; averaged in close())
     */
    @Requires("count < Byte.MAX_VALUE")
    public void add(BaseIndex base, byte count, byte qual, byte insQual, byte delQual, double mappingQuality) {
        counts.add(count);
        bases.add(base);
        quals.add(qual);
        insertionQuals.add(insQual);
        deletionQuals.add(delQual);
        this.mappingQuality += mappingQuality;
    }

    /** Returns the consensus base at the given (deletion-inclusive) read coordinate. */
    public BaseIndex getBase(int readCoordinate) {
        return bases.get(readCoordinate);
    }

    /**
     * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
     *
     * Invalid reads are :
     * - exclusively composed of deletions
     *
     * @return a GATKSAMRecord or null
     */
    public GATKSAMRecord close () {
        if (isAllDeletions())
            return null;

        GATKSAMRecord read = new GATKSAMRecord(header);
        read.setReferenceName(contig);
        read.setReferenceIndex(contigIndex);
        read.setReadPairedFlag(false);
        read.setReadUnmappedFlag(false);
        read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
        read.setAlignmentStart(refStart);
        read.setReadName(readName);
        read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION);
        read.setReadBases(convertReadBases());
        read.setMappingQuality((int) Math.ceil(mappingQuality / bases.size())); // average of the accumulated rms mapping qualities
        read.setReadGroup(readGroupRecord);
        read.setAttribute(readTag, convertBaseCounts());

        if (hasIndelQualities) {
            read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION);
            read.setBaseQualities(convertDeletionQualities(), EventType.BASE_DELETION);
        }

        return read;
    }

    /**
     * Checks if the synthetic read is composed exclusively of deletions
     *
     * @return true if it is, false if it isn't.
     */
    private boolean isAllDeletions() {
        for (BaseIndex b : bases)
            if (b != BaseIndex.D)
                return false;
        return true;
    }

    /** Number of reference positions covered by this synthetic read (deletions included). */
    public int size () {
        return bases.size();
    }

    /** Base qualities with deletion positions skipped. */
    private byte [] convertBaseQualities() {
        return convertVariableGivenBases(bases, quals);
    }

    /** Insertion qualities with deletion positions skipped. */
    private byte [] convertInsertionQualities() {
        return convertVariableGivenBases(bases, insertionQuals);
    }

    /** Deletion qualities with deletion positions skipped. */
    private byte [] convertDeletionQualities() {
        return convertVariableGivenBases(bases, deletionQuals);
    }

    /**
     * Builds the counts array stored in the reduce-reads tag.
     *
     * Counts are offset-encoded: element 0 holds the first count verbatim; every
     * subsequent element holds (count - firstCount), bounded to the byte range.
     *
     * @return the offset-encoded counts array (deletion positions skipped)
     */
    protected byte [] convertBaseCounts() {
        byte[] countsArray = convertVariableGivenBases(bases, counts);

        if (countsArray.length == 0)
            throw new ReviewedStingException("Reduced read has counts array of length 0");

        byte[] compressedCountsArray = new byte [countsArray.length];
        compressedCountsArray[0] = countsArray[0];
        for (int i = 1; i < countsArray.length; i++)
            compressedCountsArray[i] = (byte) MathUtils.bound(countsArray[i] - compressedCountsArray[0], Byte.MIN_VALUE, Byte.MAX_VALUE);

        return compressedCountsArray;
    }

    /** Read bases with deletion positions skipped. */
    private byte [] convertReadBases() {
        byte [] readArray = new byte[getReadLengthWithNoDeletions(bases)];
        int i = 0;
        for (BaseIndex baseIndex : bases)
            if (baseIndex != BaseIndex.D)
                readArray[i++] = baseIndex.getByte();

        return readArray;
    }

    /**
     * Builds the cigar string for the synthetic read
     *
     * Warning: if the synthetic read has leading deletions, it will shift the refStart (alignment start) of the read.
     *
     * @return the cigar string for the synthetic read
     */
    private Cigar buildCigar() {
        LinkedList<CigarElement> cigarElements = new LinkedList<CigarElement>();
        CigarOperator cigarOperator = null; // stays null while we are still skipping leading deletions
        int length = 0; // run length of the current cigar operator
        for (BaseIndex b : bases) {
            CigarOperator op;
            switch (b) {
                case D:
                    op = CigarOperator.DELETION;
                    break;
                case I:
                    throw new ReviewedStingException("Trying to create an insertion in a synthetic read. This operation is currently unsupported.");
                default:
                    op = CigarOperator.MATCH_OR_MISMATCH;
                    break;
            }
            if (cigarOperator == null) {
                if (op == CigarOperator.D) // read cannot start with a deletion
                    refStart++; // if it does, we need to move the reference start forward
                else
                    cigarOperator = op;
            }
            else if (cigarOperator != op) { // if this is a new operator, we need to close the previous one
                cigarElements.add(new CigarElement(length, cigarOperator)); // close previous operator
                cigarOperator = op;
                length = 0;
            }

            if (cigarOperator != null) // only increment the length of the cigar element if we really added it to the read (no leading deletions)
                length++;
        }
        if (length > 0 && cigarOperator != CigarOperator.D) // read cannot end with a deletion
            cigarElements.add(new CigarElement(length, cigarOperator)); // add the last cigar element

        return new Cigar(cigarElements);
    }

    /**
     * Shared functionality for all conversion utilities
     *
     * @param bases the read bases
     * @param variable the list to convert
     * @return a converted variable given the bases and skipping deletions
     */

    private static byte [] convertVariableGivenBases (List<BaseIndex> bases, List<Byte> variable) {
        byte [] variableArray = new byte[getReadLengthWithNoDeletions(bases)];
        int i = 0;
        Iterator<Byte> variableIterator = variable.iterator();
        for (BaseIndex baseIndex : bases) {
            byte count = variableIterator.next();
            if (baseIndex != BaseIndex.D)
                variableArray[i++] = count;
        }
        return variableArray;

    }

    /**
     * Shared functionality for all conversion utilities
     *
     * @param bases the read bases
     * @return the length of the read with no deletions
     */
    private static int getReadLengthWithNoDeletions(List<BaseIndex> bases) {
        int readLength = bases.size();
        for (BaseIndex baseIndex : bases)
            if (baseIndex == BaseIndex.D)
                readLength--;
        return readLength;
    }

}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Basic unit test for BaseCounts in reduced reads
|
||||
*/
|
||||
public class BaseCountsUnitTest extends BaseTest {
|
||||
private class SingleTest {
|
||||
public String bases;
|
||||
public byte mostCountBase;
|
||||
public int mostCommonCount;
|
||||
|
||||
private SingleTest(String bases, char mostCountBase, int mostCommonCount) {
|
||||
this.mostCommonCount = mostCommonCount;
|
||||
this.mostCountBase = (byte)mostCountBase;
|
||||
this.bases = bases;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@DataProvider(name = "data")
|
||||
public Object[][] createData1() {
|
||||
List<SingleTest> params = new ArrayList<SingleTest>();
|
||||
|
||||
params.add(new SingleTest("A", 'A', 1 ));
|
||||
params.add(new SingleTest("AA", 'A', 2 ));
|
||||
params.add(new SingleTest("AC", 'A', 1 ));
|
||||
params.add(new SingleTest("AAC", 'A', 2 ));
|
||||
params.add(new SingleTest("AAA", 'A', 3 ));
|
||||
params.add(new SingleTest("AAAN", 'A', 3 ));
|
||||
params.add(new SingleTest("AAANNNN", 'N', 4 ));
|
||||
params.add(new SingleTest("AACTG", 'A', 2 ));
|
||||
params.add(new SingleTest("D", 'D', 1 ));
|
||||
params.add(new SingleTest("DDAAD", 'D', 3));
|
||||
params.add(new SingleTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 ));
|
||||
params.add(new SingleTest("AAIIIAI", 'I', 4 ));
|
||||
|
||||
List<Object[]> params2 = new ArrayList<Object[]>();
|
||||
for ( SingleTest x : params ) params2.add(new Object[]{x});
|
||||
return params2.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test(dataProvider = "data", enabled = true)
|
||||
public void testCounting(SingleTest params) {
|
||||
BaseCounts counts = new BaseCounts();
|
||||
|
||||
for ( byte base : params.bases.getBytes() )
|
||||
counts.incr(base);
|
||||
|
||||
String name = String.format("Test-%s", params.bases);
|
||||
Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
|
||||
Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name);
|
||||
Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class ReduceReadsIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
|
||||
final String DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam";
|
||||
final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam";
|
||||
final String STASH_L = " -L 14:73718184-73718284 -L 14:73718294-73718330 -L 14:73718360-73718556";
|
||||
final String L = " -L 20:10,100,000-10,120,000 ";
|
||||
|
||||
private void RRTest(String testName, String args, String md5) {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s ";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5));
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testDefaultCompression() {
|
||||
RRTest("testDefaultCompression ", L, "4c92d59d4a5292af1f968dc922c2c63e");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultipleIntervals() {
|
||||
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
||||
RRTest("testMultipleIntervals ", intervals, "97d5c3fda5551741676793ba325ec7ed");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testHighCompression() {
|
||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e6bc1cd0e9de961cf0fb1789bf6ab108");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLowCompression() {
|
||||
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "f33ec7cd0b98eebd73d1025ca656cd7e");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testIndelCompression() {
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "e6e2bc889e4f342a7fedc5d38b391d20");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testFilteredDeletionCompression() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Bug reported by Adam where a read that got clipped before actually belongs 2 intervals ahead
|
||||
* and a subsequent tail leaves only this read in the stash. The next read to come in is in fact
|
||||
* before (alignment start) than this read, so the TreeSet breaks with a Key out of Range error
|
||||
* that was freaking hard to catch.
|
||||
*
|
||||
* This bam is simplified to replicate the exact bug with the three provided intervals.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testAddingReadAfterTailingTheStash() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("022931f032a4122cfe41e58e74d0aede")));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
|
||||
public class SyntheticReadUnitTest extends BaseTest {
|
||||
final SAMFileHeader artificialSAMHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1);
|
||||
final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic");
|
||||
final String artificialContig = "1";
|
||||
final int artificialContigIndex = 0;
|
||||
final String artificialReadName = "synth";
|
||||
final int artificialRefStart = 1;
|
||||
final double artificialMappingQuality = 60;
|
||||
|
||||
final Random random = new Random(8854875);
|
||||
|
||||
|
||||
@Test
|
||||
public void testBaseCounts() {
|
||||
BaseIndex [] bases = new BaseIndex[] {BaseIndex.A,BaseIndex.A,BaseIndex.A,BaseIndex.A};
|
||||
Byte[] quals = new Byte[] {20, 20, 20, 20 };
|
||||
|
||||
TestRead [] testReads = new TestRead [] {
|
||||
new TestRead(bases, quals, new Byte[] {100, 100, 100, 101}, new byte [] {100, 0, 0, 1}),
|
||||
new TestRead(bases, quals, new Byte[] {1, 100, 100, 0}, new byte [] {1, 99, 99, -1}),
|
||||
new TestRead(bases, quals, new Byte[] {127, 100, 0, 1}, new byte [] {127, -27, -127, -126}),
|
||||
new TestRead(bases, quals, new Byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})};
|
||||
|
||||
for (TestRead testRead : testReads) {
|
||||
SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false);
|
||||
Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts());
|
||||
}
|
||||
}
|
||||
|
||||
private class TestRead {
|
||||
BaseIndex[] bases;
|
||||
Byte[] quals;
|
||||
Byte[] insQuals;
|
||||
Byte[] delQuals;
|
||||
Byte[] counts;
|
||||
byte [] expectedCounts;
|
||||
|
||||
private TestRead(BaseIndex[] bases, Byte[] quals, Byte[] counts, byte[] expectedCounts) {
|
||||
this.bases = bases;
|
||||
this.quals = quals;
|
||||
this.insQuals = quals;
|
||||
this.delQuals = quals;
|
||||
this.counts = counts;
|
||||
this.expectedCounts = expectedCounts;
|
||||
}
|
||||
|
||||
public BaseIndex[] getBases() {
|
||||
return bases;
|
||||
}
|
||||
|
||||
public Byte[] getQuals() {
|
||||
return quals;
|
||||
}
|
||||
|
||||
public Byte[] getInsQuals() {
|
||||
return insQuals;
|
||||
}
|
||||
|
||||
public Byte[] getDelQuals() {
|
||||
return delQuals;
|
||||
}
|
||||
|
||||
public Byte[] getCounts() {
|
||||
return counts;
|
||||
}
|
||||
|
||||
public byte[] getExpectedCounts() {
|
||||
return expectedCounts;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue