Moving ReduceReads into protected

This commit is contained in:
Mauricio Carneiro 2012-06-25 17:00:56 -04:00
parent 2d6511b9ac
commit 12d1c594df
15 changed files with 2968 additions and 0 deletions

View File

@ -0,0 +1,56 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import java.util.HashMap;
import java.util.Map;
/**
* An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base.
*
* @author Mauricio Carneiro
* @since 6/15/12
*/
public class BaseAndQualsCounts extends BaseCounts {
    private final Map<BaseIndex, Long> sumInsertionQuals; // running sum of insertion qualities per base
    private final Map<BaseIndex, Long> sumDeletionQuals;  // running sum of deletion qualities per base

    /** Creates an empty counter with every per-base insertion/deletion quality sum set to zero. */
    public BaseAndQualsCounts() {
        super();
        this.sumInsertionQuals = new HashMap<BaseIndex, Long>();
        this.sumDeletionQuals = new HashMap<BaseIndex, Long>();
        for (BaseIndex i : BaseIndex.values()) {
            sumInsertionQuals.put(i, 0L);
            sumDeletionQuals.put(i, 0L);
        }
    }

    /**
     * Records one observation of the given base, accumulating its base, insertion and deletion qualities.
     *
     * @param base     the observed base (byte representation)
     * @param baseQual the base quality of the observation
     * @param insQual  the insertion quality of the observation
     * @param delQual  the deletion quality of the observation
     */
    public void incr(byte base, byte baseQual, byte insQual, byte delQual) {
        super.incr(base, baseQual);
        BaseIndex i = BaseIndex.byteToBase(base);
        if (i != null) { // bytes with no BaseIndex are skipped (NOTE(review): byteToBase maps 'N' to BaseIndex.N, so Ns ARE counted -- confirm intent)
            sumInsertionQuals.put(i, sumInsertionQuals.get(i) + insQual);
            sumDeletionQuals.put(i, sumDeletionQuals.get(i) + delQual);
        }
    }

    /**
     * Removes one observation of the given base, subtracting the qualities that incr() added for it.
     *
     * @param base     the observed base (byte representation)
     * @param baseQual the base quality originally recorded
     * @param insQual  the insertion quality originally recorded
     * @param delQual  the deletion quality originally recorded
     */
    public void decr(byte base, byte baseQual, byte insQual, byte delQual) {
        super.decr(base, baseQual);
        BaseIndex i = BaseIndex.byteToBase(base);
        if (i != null) { // bytes with no BaseIndex are skipped
            sumInsertionQuals.put(i, sumInsertionQuals.get(i) - insQual);
            sumDeletionQuals.put(i, sumDeletionQuals.get(i) - delQual);
        }
    }

    /**
     * Truncated (integer) average insertion quality of the base with the most counts.
     *
     * @return the average insertion quality, or 0 if that base has no counts
     */
    public byte averageInsertionQualsOfMostCommonBase() {
        return getGenericAverageQualOfMostCommonBase(sumInsertionQuals);
    }

    /**
     * Truncated (integer) average deletion quality of the base with the most counts.
     *
     * @return the average deletion quality, or 0 if that base has no counts
     */
    public byte averageDeletionQualsOfMostCommonBase() {
        return getGenericAverageQualOfMostCommonBase(sumDeletionQuals);
    }

    /** Shared implementation for the two average-quality queries above. */
    private byte getGenericAverageQualOfMostCommonBase(Map<BaseIndex, Long> sumQuals) {
        BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts());
        final int count = getCount(base);
        return count == 0 ? 0 : (byte) (sumQuals.get(base) / count); // guard: an empty counter would otherwise divide by zero
    }
}

View File

@ -0,0 +1,223 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import java.util.EnumMap;
import java.util.Map;
/**
* An object to keep track of the number of occurrences of each base and its quality.
*
* User: depristo
* Date: 4/8/11
* Time: 2:55 PM
*/
public class BaseCounts {
    public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N;
    public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte();

    private final Map<BaseIndex, Integer> counts;  // keeps track of the base counts
    private final Map<BaseIndex, Long> sumQuals;   // keeps track of the quals of each base

    /** Creates an empty counter: every base count and quality sum starts at zero. */
    public BaseCounts() {
        counts = new EnumMap<BaseIndex, Integer>(BaseIndex.class);
        sumQuals = new EnumMap<BaseIndex, Long>(BaseIndex.class);
        for (BaseIndex i : BaseIndex.values()) {
            counts.put(i, 0);
            sumQuals.put(i, 0L);
        }
    }

    /**
     * Static factory building a counter from an array of nucleotide counts; quality sums stay at zero.
     *
     * @param countsACGT counts for A, C, G and T, in that order (indices 0-3)
     * @return a new BaseCounts holding the given nucleotide counts
     */
    public static BaseCounts createWithCounts(int[] countsACGT) {
        BaseCounts baseCounts = new BaseCounts();
        baseCounts.counts.put(BaseIndex.A, countsACGT[0]);
        baseCounts.counts.put(BaseIndex.C, countsACGT[1]);
        baseCounts.counts.put(BaseIndex.G, countsACGT[2]);
        baseCounts.counts.put(BaseIndex.T, countsACGT[3]);
        return baseCounts;
    }

    /**
     * Merges another counter's base counts into this one.
     * NOTE(review): only counts are merged, not the quality sums -- confirm this asymmetry is intended.
     *
     * @param other the counter whose counts are added
     */
    @Requires("other != null")
    public void add(BaseCounts other) {
        for (BaseIndex i : BaseIndex.values())
            counts.put(i, counts.get(i) + other.counts.get(i));
    }

    /**
     * Subtracts another counter's base counts from this one.
     * NOTE(review): only counts are updated, not the quality sums -- confirm this asymmetry is intended.
     *
     * @param other the counter whose counts are subtracted
     */
    @Requires("other != null")
    public void sub(BaseCounts other) {
        for (BaseIndex i : BaseIndex.values())
            counts.put(i, counts.get(i) - other.counts.get(i));
    }

    /**
     * Increments the count of the given base by one; bytes with no BaseIndex are ignored.
     * NOTE(review): 'N' maps to BaseIndex.N and IS counted, despite the original "no Ns" comment.
     *
     * @param base the byte representation of the base
     */
    @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
    public void incr(byte base) {
        BaseIndex i = BaseIndex.byteToBase(base);
        if (i != null) // byteToBase returns null only for unrecognized bytes
            counts.put(i, counts.get(i) + 1);
    }

    /**
     * Increments the count of the given base and accumulates its quality score.
     *
     * @param base the byte representation of the base
     * @param qual the quality score of this observation
     */
    @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
    public void incr(byte base, byte qual) {
        BaseIndex i = BaseIndex.byteToBase(base);
        if (i != null) { // unrecognized bytes are skipped
            counts.put(i, counts.get(i) + 1);
            sumQuals.put(i, sumQuals.get(i) + qual);
        }
    }

    /**
     * Decrements the count of the given base by one; bytes with no BaseIndex are ignored.
     *
     * @param base the byte representation of the base
     */
    @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
    public void decr(byte base) {
        BaseIndex i = BaseIndex.byteToBase(base);
        if (i != null) // unrecognized bytes are skipped
            counts.put(i, counts.get(i) - 1);
    }

    /**
     * Decrements the count of the given base and removes its quality score from the running sum.
     *
     * @param base the byte representation of the base
     * @param qual the quality score originally added for this observation
     */
    @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
    public void decr(byte base, byte qual) {
        BaseIndex i = BaseIndex.byteToBase(base);
        if (i != null) { // unrecognized bytes are skipped
            counts.put(i, counts.get(i) - 1);
            sumQuals.put(i, sumQuals.get(i) - qual);
        }
    }

    /** @return the number of times the given base has been counted */
    @Ensures("result >= 0")
    public int getCount(byte base) {
        return getCount(BaseIndex.byteToBase(base));
    }

    /** @return the number of times the given base has been counted */
    @Ensures("result >= 0")
    public int getCount(BaseIndex base) {
        return counts.get(base);
    }

    /** @return the sum of all quality scores recorded for the given base */
    @Ensures("result >= 0")
    public long getSumQuals(byte base) {
        return getSumQuals(BaseIndex.byteToBase(base));
    }

    /** @return the sum of all quality scores recorded for the given base */
    @Ensures("result >= 0")
    public long getSumQuals(BaseIndex base) {
        return sumQuals.get(base);
    }

    /**
     * Truncated (integer) average quality of the given base.
     *
     * @param base the byte representation of the base
     * @return the average quality, or 0 if the base was never counted
     */
    @Ensures("result >= 0")
    public byte averageQuals(byte base) {
        return averageQuals(BaseIndex.byteToBase(base)); // delegate so the zero-count guard applies to both overloads
    }

    /**
     * Truncated (integer) average quality of the given base.
     *
     * @param base the base index
     * @return the average quality, or 0 if the base was never counted
     */
    @Ensures("result >= 0")
    public byte averageQuals(BaseIndex base) {
        final int count = getCount(base);
        return count == 0 ? 0 : (byte) (getSumQuals(base) / count); // guard: a never-seen base would otherwise divide by zero
    }

    /** @return the byte representation of the base with the highest count */
    public byte baseWithMostCounts() {
        return baseIndexWithMostCounts().getByte();
    }

    /** @return the count of the most common base */
    @Ensures("result >= 0")
    public int countOfMostCommonBase() {
        return counts.get(baseIndexWithMostCounts());
    }

    /** @return the quality sum of the most common base */
    @Ensures("result >= 0")
    public long sumQualsOfMostCommonBase() {
        return sumQuals.get(baseIndexWithMostCounts());
    }

    /**
     * Truncated (integer) average quality of the most common base.
     *
     * @return the average quality, or 0 if nothing has been counted yet
     */
    @Ensures("result >= 0")
    public byte averageQualsOfMostCommonBase() {
        final int count = countOfMostCommonBase();
        return count == 0 ? 0 : (byte) (sumQualsOfMostCommonBase() / count); // guard: an empty counter would otherwise divide by zero
    }

    /** @return the total number of bases counted, across all base indices */
    @Ensures("result >= 0")
    public int totalCount() {
        int sum = 0;
        for (int c : counts.values())
            sum += c;
        return sum;
    }

    /**
     * Given a base, it returns the proportional count of this base compared to all other bases
     *
     * @param base the byte representation of the base
     * @return the proportion of this base over all other bases (0.0 when nothing has been counted)
     */
    @Ensures({"result >=0.0", "result<= 1.0"})
    public double baseCountProportion(byte base) {
        return baseCountProportion(BaseIndex.byteToBase(base)); // delegate so the empty-counter guard applies to both overloads
    }

    /**
     * Given a base, it returns the proportional count of this base compared to all other bases
     *
     * @param baseIndex the base index
     * @return the proportion of this base over all other bases (0.0 when nothing has been counted)
     */
    @Ensures({"result >=0.0", "result<= 1.0"})
    public double baseCountProportion(BaseIndex baseIndex) {
        int total = totalCount();
        if (total == 0)
            return 0.0;
        return (double) counts.get(baseIndex) / total; // reuse the cached total instead of recomputing it
    }

    /** @return a string of the form "A=1,C=0,...", one entry per base */
    @Override
    @Ensures("result != null")
    public String toString() {
        StringBuilder b = new StringBuilder();
        for (Map.Entry<BaseIndex, Integer> elt : counts.entrySet()) {
            b.append(elt.getKey()).append("=").append(elt.getValue()).append(","); // was elt.toString(), which already contains "key=value" and printed "A=1=1,"
        }
        return b.toString();
    }

    /** @return the base index with the highest count (BaseIndex.N when all counts are zero) */
    @Ensures("result != null")
    public BaseIndex baseIndexWithMostCounts() {
        BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
        for (BaseIndex i : counts.keySet())
            if (counts.get(i) > counts.get(maxI))
                maxI = i;
        return maxI;
    }

    /** @return the nucleotide (A,C,G,T,N) with the highest count, ignoring insertions and deletions */
    @Ensures("result != null")
    public BaseIndex baseIndexWithMostCountsWithoutIndels() {
        BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS;
        for (BaseIndex index : counts.keySet())
            if (index.isNucleotide() && counts.get(index) > counts.get(mostCounts))
                mostCounts = index;
        return mostCounts;
    }

    /** @return the total count over nucleotides only (indels excluded) */
    @Ensures("result >=0")
    public int totalCountWithoutIndels() {
        int sum = 0;
        for (BaseIndex index : counts.keySet())
            if (index.isNucleotide())
                sum += counts.get(index);
        return sum;
    }

    /**
     * Calculates the proportional count of a base compared to all other bases except indels (I and D)
     *
     * @param index the base index (must be a nucleotide)
     * @return the proportion of this base over all other bases except indels (0.0 when nothing has been counted)
     */
    @Requires("index.isNucleotide()")
    @Ensures({"result >=0.0", "result<= 1.0"})
    public double baseCountProportionWithoutIndels(BaseIndex index) {
        int total = totalCountWithoutIndels();
        if (total == 0)
            return 0.0;
        return (double) counts.get(index) / total; // reuse the cached total instead of recomputing it
    }
}

View File

@ -0,0 +1,82 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
/**
* Simple byte / base index conversions
*
*
* @author carneiro
* @since 8/26/11
*/
public enum BaseIndex {
    A('A', 0),
    C('C', 1),
    G('G', 2),
    T('T', 3),
    D('D', 4),
    I('I', 5), // insertion to the right of the base
    N('N', 6);

    final byte b;
    final int index;

    private BaseIndex(char base, int index) {
        this.b = (byte) base;
        this.index = index;
    }

    public byte getByte() {
        return b;
    }

    /**
     * Converts a byte representation of a base into its BaseIndex, case-insensitively.
     *
     * @param base the byte representation of the base
     * @return the matching BaseIndex, or null when the byte does not name a base
     */
    public static BaseIndex byteToBase(final byte base) {
        if (base == '-')
            return D; // a gap character also denotes a deletion
        switch (Character.toUpperCase((char) base)) {
            case 'A': return A;
            case 'C': return C;
            case 'G': return G;
            case 'T': return T;
            case 'D': return D;
            case 'I': return I;
            case 'N': return N;
            default:  return null;
        }
    }

    /**
     * Definition of a nucleotide for the BaseIndex is anything that has been read as a base
     * by the machine (A,C,G,T), even if it couldn't tell which base it was, but it knows
     * there is a base there (N).
     *
     * @return whether or not it is a nucleotide, given the definition above
     */
    public boolean isNucleotide() {
        return !isIndel(); // everything except D and I qualifies: A, C, G, T and N
    }

    /**
     * Whether or not this base is an insertion or a deletion
     *
     * @return true for I or D, false otherwise
     */
    public boolean isIndel() {
        switch (this) {
            case D:
            case I:
                return true;
            default:
                return false;
        }
    }
}

View File

@ -0,0 +1,182 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.HashMap;
import java.util.Map;
/**
* Given two BAMs with different read groups, it compares them based on ReduceReads metrics.
* <p>
* This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files.
* </p>
* <h2>Input</h2>
* <p>
* Two BAM files (using -I) with different read group IDs
* </p>
* <h2>Output</h2>
* <p>
* A table of PASS/FAIL counts for each enabled test, printed to standard output.
* </p>
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T $WalkerName
* </pre>
*
* @author carneiro
* @since 10/30/11
*/
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
public class CompareBAMWalker extends LocusWalker<Map<CompareBAMWalker.TestName, Boolean>, CompareBAMWalker.TestResults> {
    @Argument(required = true, shortName = "rr", fullName = "reduced_readgroup", doc = "The read group ID corresponding to the compressed BAM being tested") public String reducedReadGroupID;
    @Argument(required = false, shortName = "teq", fullName = "test_equal_bases", doc = "Test if the bases marked as '=' are indeed ref bases.") public boolean TEST_EQUAL_BASES = false;
    @Argument(required = false, shortName = "tbc", fullName = "test_base_counts", doc = "Test if the base counts tag in consensus reads are accurate.") public boolean TEST_BASE_COUNTS = false;
    @Argument(required = false, shortName = "mbq", fullName = "min_base_qual", doc = "Minimum base quality to be considered.") public int MIN_BASE_QUAL = 20;
    @Argument(required = false, shortName = "mmq", fullName = "min_mapping_qual", doc = "Minimum mapping quality to be considered.") public int MIN_MAPPING_QUAL = 20;

    /**
     * Runs every enabled test at this locus.
     *
     * @return a map from each enabled test to whether it passed at this locus
     */
    @Override
    public Map<TestName, Boolean> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        Map<TestName, Boolean> result = new HashMap<TestName, Boolean>();
        if (TEST_EQUAL_BASES) result.put(TestName.EQUAL_BASES, testEqualBases(ref, context));
        if (TEST_BASE_COUNTS) result.put(TestName.BASE_COUNTS, testBaseCounts(ref, context));
        return result;
    }

    /** @return a fresh TestResults accumulator, pre-registered with every enabled test */
    @Override
    public TestResults reduceInit() {
        TestResults sum = new TestResults(); // a fresh new TestResults object to sum up the results of every object passed by MAP.
        if (TEST_EQUAL_BASES) sum.createTest(TestName.EQUAL_BASES);
        if (TEST_BASE_COUNTS) sum.createTest(TestName.BASE_COUNTS);
        return sum;
    }

    /** Folds one locus' test outcomes into the running totals. */
    @Override
    public TestResults reduce(Map<TestName, Boolean> mapResult, TestResults sum) {
        for (Map.Entry<TestName, Boolean> outcome : mapResult.entrySet()) { // entrySet avoids a second map lookup per test
            if (outcome.getValue())
                sum.reportSuccess(outcome.getKey());
            else
                sum.reportFailed(outcome.getKey());
        }
        return sum;
    }

    /** Prints the final PASS/FAIL table once the traversal is done. */
    public void onTraversalDone(TestResults finalResults) {
        finalResults.report();
    }

    // NOTE(review): stub -- always reports success; real comparison logic still to be written
    private boolean testEqualBases(ReferenceContext ref, AlignmentContext context) {
        return true;
    }

    // NOTE(review): stub -- always reports success; real comparison logic still to be written
    private boolean testBaseCounts(ReferenceContext ref, AlignmentContext context) {
        return true;
    }

    /** The tests this walker knows how to run. */
    public enum TestName {
        EQUAL_BASES("testEqualBases"),
        BASE_COUNTS("testBaseCounts");

        private final String testName; // human-readable name used in the final report

        TestName(String testName) {
            this.testName = testName;
        }

        public String getTestName() {
            return testName;
        }
    }

    /** Accumulates pass/fail totals for each registered test. */
    public class TestResults {
        private final Map<TestName, TestOutcome> testStats = new HashMap<TestName, TestOutcome>();

        /** Registers a test so outcomes can be reported against it. */
        public void createTest(TestName test) {
            testStats.put(test, new TestOutcome());
        }

        /** @throws ReviewedStingException if the test was never registered via createTest */
        public void reportSuccess(TestName test) {
            if (testStats.containsKey(test))
                testStats.get(test).incPassed();
            else
                throw new ReviewedStingException("No such test: " + test);
        }

        /** @throws ReviewedStingException if the test was never registered via createTest */
        public void reportFailed(TestName test) {
            if (testStats.containsKey(test))
                testStats.get(test).incFailed();
            else
                throw new ReviewedStingException("No such test: " + test);
        }

        /** Prints a PASS/FAIL count table to standard output, one row per registered test. */
        public void report() {
            System.out.println();
            System.out.println(String.format("%20s\tPASS\tFAIL", ""));
            for (Map.Entry<TestName, TestOutcome> entry : testStats.entrySet()) // entrySet avoids two extra lookups per row
                System.out.println(String.format("%20s\t%d\t%d", entry.getKey().getTestName(), entry.getValue().getPassed(), entry.getValue().getFailed()));
            System.out.println();
        }
    }

    /** Simple pass/fail counters for a single test. */
    private class TestOutcome {
        private long passed;
        private long failed;

        public long getPassed() {
            return passed;
        }

        public void incPassed() {
            this.passed++;
        }

        public long getFailed() {
            return failed;
        }

        public void incFailed() {
            this.failed++;
        }
    }

    /** @return base counts over all non-reduced read groups, honoring the configured quality thresholds */
    private BaseCounts getFilteredBaseCounts(AlignmentContext context) {
        return getBaseCounts(context, MIN_BASE_QUAL, MIN_MAPPING_QUAL);
    }

    /** @return base counts over all non-reduced read groups with minimal filtering (mbq=3, mmq=0) */
    private BaseCounts getFullBaseCounts(AlignmentContext context) {
        return getBaseCounts(context, 3, 0);
    }

    /**
     * Accumulates base counts across every read group except the reduced one.
     *
     * @param context the pileup at the current locus
     * @param mbq     minimum base quality for a base to be counted
     * @param mmq     minimum mapping quality for a base to be counted
     * @return the combined base counts of all non-reduced read groups
     */
    private BaseCounts getBaseCounts(AlignmentContext context, int mbq, int mmq) {
        BaseCounts fullBaseCounts = new BaseCounts();
        for (String rg : context.getBasePileup().getReadGroups()) {
            if (!rg.equals(reducedReadGroupID)) {
                BaseCounts b = BaseCounts.createWithCounts(context.getBasePileup().getPileupForReadGroup(rg).getBaseAndMappingFilteredPileup(mbq, mmq).getBaseCounts());
                fullBaseCounts.add(b);
            }
        }
        return fullBaseCounts;
    }
}

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: 4/10/11
* Time: 8:49 AM
*
* A general interface for ReadCompressors. Read compressors have the following semantics:
*
* They accept a stream of reads, in order, and after each added read return a compressed stream
* of reads for emission. This stream of reads is a "reduced" representation of the total stream
* of reads. The actual compression approach is left up to the implementing class.
*/
public interface Compressor {
    /**
     * Adds the read to the compressor. The returned iterable collection of
     * reads represents the incremental compressed output.
     *
     * @param read the next uncompressed read in the input stream to the compressor; reads must be added in order
     * @return an iterator over the incrementally available compressed reads (possibly empty, never null per the contract below)
     */
    @Requires("read != null")
    @Ensures("result != null")
    Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read);
    /**
     * Must be called after the last read has been added to finalize the compressor state
     * and return the last compressed reads from the compressor.
     *
     * @return an iterator over the final compressed reads of this compressor (possibly empty, never null)
     */
    @Ensures("result != null")
    Iterable<GATKSAMRecord> close();
}

View File

@ -0,0 +1,204 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.LinkedList;
/**
* The element that describes the header of the sliding window.
*
* Each site has a header element containing the counts of each base, its reference-based location and whether or
* not the site has insertions (to its right). It also contains information about the bases that have been filtered
* out due to mapping or base quality.
*/
public class HeaderElement {
    private final BaseAndQualsCounts consensusBaseCounts; // How many A,C,G,T (and D's) are in this site, restricted to bases that passed the filters.
    private final BaseAndQualsCounts filteredBaseCounts;  // How many A,C,G,T (and D's) were filtered out in this site.
    private int insertionsToTheRight;                     // How many reads in this site had insertions to the immediate right
    private int nSoftClippedBases;                        // How many bases in this site came from soft clipped bases
    private final int location;                           // Genome location of this site (the sliding window knows which contig we're at)
    private final LinkedList<Integer> mappingQuality;     // keeps the mapping quality of each read that contributed to this element (site)

    public int getLocation() {
        return location;
    }

    public BaseAndQualsCounts getFilteredBaseCounts() {
        return filteredBaseCounts;
    }

    public BaseAndQualsCounts getConsensusBaseCounts() {
        return consensusBaseCounts;
    }

    /**
     * Creates a new HeaderElement with the following default values: empty consensusBaseCounts, empty
     * filteredBaseCounts, 0 insertions to the right, 0 soft clipped bases and an empty mappingQuality list.
     *
     * @param location the reference location for the new element
     */
    public HeaderElement(int location) {
        this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new LinkedList<Integer>());
    }

    /**
     * Creates a new HeaderElement with all given parameters
     *
     * @param consensusBaseCounts  the BaseCounts object for the running consensus synthetic read
     * @param filteredBaseCounts   the BaseCounts object for the filtered data synthetic read
     * @param insertionsToTheRight number of insertions to the right of this HeaderElement
     * @param nSoftClippedBases    number of bases in this site that came from soft clips
     * @param location             the reference location of this reference element
     * @param mappingQuality       the list of mapping quality values of all reads that contributed to this
     *                             HeaderElement
     */
    public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, LinkedList<Integer> mappingQuality) {
        this.consensusBaseCounts = consensusBaseCounts;
        this.filteredBaseCounts = filteredBaseCounts;
        this.insertionsToTheRight = insertionsToTheRight;
        this.nSoftClippedBases = nSoftClippedBases;
        this.location = location;
        this.mappingQuality = mappingQuality;
    }

    /**
     * Whether or not the site represented by this HeaderElement is variant according to the definitions of variant
     * by insertion, deletion, mismatches or excess soft clips.
     *
     * @param minVariantProportion minimum proportion of mismatching bases for the site to be called variant
     * @param minIndelProportion   minimum proportion of insertions/deletions for the site to be called variant
     * @return true if site is variant by any definition. False otherwise.
     */
    public boolean isVariant(double minVariantProportion, double minIndelProportion) {
        return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
    }

    /**
     * Adds a new base to the HeaderElement updating all counts accordingly
     *
     * @param base               the base to add
     * @param baseQual           the base quality
     * @param insQual            the insertion quality of the base
     * @param delQual            the deletion quality of the base
     * @param baseMappingQuality the mapping quality of the read this base belongs to
     * @param minBaseQual        minimum base quality for the base to enter the consensus counts
     * @param minMappingQual     minimum mapping quality for the base to enter the consensus counts
     * @param isSoftClipped      whether this base came from a soft clipped region
     */
    public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
        if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
            consensusBaseCounts.incr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
        else
            filteredBaseCounts.incr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
        this.mappingQuality.add(baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
        nSoftClippedBases += isSoftClipped ? 1 : 0; // if this base is soft clipped, bump the counter
    }

    /**
     * Removes a base from the HeaderElement, reversing everything addBase did for it.
     * Callers must pass the same values that were used when the base was added so it is
     * removed from the same (consensus vs filtered) bucket.
     *
     * @param base               the base to remove
     * @param baseQual           the base quality originally added
     * @param insQual            the insertion quality originally added
     * @param delQual            the deletion quality originally added
     * @param baseMappingQuality the mapping quality of the read this base belongs to
     * @param minBaseQual        minimum base quality used when the base was added
     * @param minMappingQual     minimum mapping quality used when the base was added
     * @param isSoftClipped      whether this base came from a soft clipped region
     */
    public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
        if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
            consensusBaseCounts.decr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
        else
            filteredBaseCounts.decr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
        this.mappingQuality.remove((Integer) baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
        nSoftClippedBases -= isSoftClipped ? 1 : 0; // if this base was soft clipped, decrement the counter
    }

    /**
     * Adds an insertion to the right of the HeaderElement and updates all counts accordingly. All insertions
     * should be added to the right of the element.
     */
    public void addInsertionToTheRight() {
        insertionsToTheRight++;
    }

    /**
     * Does this HeaderElement contain consensus data?
     *
     * @return whether or not this HeaderElement contains consensus data
     */
    public boolean hasConsensusData() {
        return consensusBaseCounts.totalCount() > 0;
    }

    /**
     * Does this HeaderElement contain filtered data?
     *
     * @return whether or not this HeaderElement contains filtered data
     */
    public boolean hasFilteredData() {
        return filteredBaseCounts.totalCount() > 0;
    }

    /**
     * A HeaderElement is empty if it has no consensus or filtered data
     *
     * @return whether or not this HeaderElement has no data
     */
    public boolean isEmpty() {
        return (!hasFilteredData() && !hasConsensusData());
    }

    /**
     * The RMS of the mapping qualities of all reads that contributed to this HeaderElement
     *
     * @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement
     */
    public double getRMS() {
        return MathUtils.rms(mappingQuality);
    }

    /**
     * Removes an insertion from this element (if you removed a read that had an insertion).
     *
     * @throws ReviewedStingException if more insertions are removed than were ever added
     */
    public void removeInsertionToTheRight() {
        this.insertionsToTheRight--;
        if (insertionsToTheRight < 0)
            throw new ReviewedStingException("Removed too many insertions, header is now negative!");
    }

    /**
     * Whether or not the HeaderElement is variant due to excess insertions
     *
     * @return whether or not the HeaderElement is variant due to excess insertions
     */
    private boolean isVariantFromInsertions(double minIndelProportion) {
        int numberOfBases = consensusBaseCounts.totalCount();
        if (numberOfBases == 0 && insertionsToTheRight > 0)
            return true; // we only have insertions
        else if (numberOfBases == 0)
            return false; // we don't have anything
        // if we have bases and insertions, check the ratio
        return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion;
    }

    /**
     * Whether or not the HeaderElement is variant due to excess deletions
     *
     * @return whether or not the HeaderElement is variant due to excess deletions
     */
    private boolean isVariantFromDeletions(double minIndelProportion) {
        return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion;
    }

    /**
     * Whether or not the HeaderElement is variant due to excess mismatches
     *
     * @return whether or not the HeaderElement is variant due to excess mismatches
     */
    private boolean isVariantFromMismatches(double minVariantProportion) {
        BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels();
        double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
        return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
    }

    /**
     * This handles the special case where we have more bases that came from soft clips than bases that came from
     * normal bases by forcing it to become a variant region. We don't want a consensus based on too little information.
     *
     * @return true if we had more soft clipped bases contributing to this site than matches/mismatches.
     */
    private boolean isVariantFromSoftClips() {
        return nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases);
    }

    /** @return true when the base clears both the base-quality and mapping-quality thresholds */
    private boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) {
        return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual;
    }
}

View File

@ -0,0 +1,81 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
*
* @author depristo
*/
public class MultiSampleCompressor implements Compressor {
    protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
    protected Map<String, SingleSampleCompressor> compressorsPerSample = new HashMap<String, SingleSampleCompressor>();

    /**
     * Sets up one SingleSampleCompressor for every sample present in the BAM header.
     *
     * @param header                             the SAM file header listing the samples
     * @param contextSize                        forwarded to each per-sample compressor
     * @param downsampleCoverage                 forwarded to each per-sample compressor
     * @param minMappingQuality                  forwarded to each per-sample compressor
     * @param minAltProportionToTriggerVariant   forwarded to each per-sample compressor
     * @param minIndelProportionToTriggerVariant forwarded to each per-sample compressor
     * @param minBaseQual                        forwarded to each per-sample compressor
     * @param downsampleStrategy                 forwarded to each per-sample compressor
     */
    public MultiSampleCompressor(SAMFileHeader header,
                                 final int contextSize,
                                 final int downsampleCoverage,
                                 final int minMappingQuality,
                                 final double minAltProportionToTriggerVariant,
                                 final double minIndelProportionToTriggerVariant,
                                 final int minBaseQual,
                                 final ReduceReadsWalker.DownsampleStrategy downsampleStrategy) {
        for (final String sampleName : SampleUtils.getSAMFileSamples(header)) {
            final SingleSampleCompressor sampleCompressor =
                    new SingleSampleCompressor(sampleName, contextSize, downsampleCoverage, minMappingQuality,
                            minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy);
            compressorsPerSample.put(sampleName, sampleCompressor);
        }
    }

    /**
     * Routes the read to the compressor of the sample it belongs to.
     *
     * @param read the next read; its read group's sample selects the compressor
     * @return the incrementally available compressed reads from that sample's compressor
     * @throws ReviewedStingException if no compressor exists for the read's sample
     */
    @Override
    public Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
        final String sampleName = read.getReadGroup().getSample();
        final SingleSampleCompressor sampleCompressor = compressorsPerSample.get(sampleName);
        if (sampleCompressor == null)
            throw new ReviewedStingException("No compressor for sample " + sampleName);
        return sampleCompressor.addAlignment(read);
    }

    /**
     * Closes every per-sample compressor and merges their remaining reads, sorted by alignment start.
     *
     * @return all remaining compressed reads across every sample, in alignment order
     */
    @Override
    public Iterable<GATKSAMRecord> close() {
        final SortedSet<GATKSAMRecord> remaining = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
        for (final SingleSampleCompressor sampleCompressor : compressorsPerSample.values())
            for (final GATKSAMRecord compressedRead : sampleCompressor.close())
                remaining.add(compressedRead);
        return remaining;
    }
}

View File

@ -0,0 +1,110 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.LinkedList;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
/**
* This class implements a "read stash" that keeps reads always sorted in alignment order. Useful
* for read walkers that alter the alignment information of the incoming reads, but need to
* maintain the reads sorted for the reduce step. (e.g. ReduceReads)
*/
public class ReduceReadsStash {
protected MultiSampleCompressor compressor;
SortedSet<GATKSAMRecord> outOfOrderReads;
/**
* Creates a stash with the default sorting order (read alignment)
* @param compressor the MultiSampleCompressor object to be used with this stash (for stash.close())
*/
public ReduceReadsStash(MultiSampleCompressor compressor) {
this.compressor = compressor;
this.outOfOrderReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
}
/**
* Get all reads before a given read (for processing)
*
* @param read the original read
* @return all reads that have alignment start before the original read.
*/
public List<GATKSAMRecord> getAllReadsBefore(GATKSAMRecord read) {
List<GATKSAMRecord> result = new LinkedList<GATKSAMRecord>();
GATKSAMRecord newHead = null;
for (GATKSAMRecord stashedRead : outOfOrderReads) {
if (ReadUtils.compareSAMRecords(stashedRead, read) <= 0)
result.add(stashedRead);
else {
newHead = stashedRead;
break;
}
}
if (result.size() > 0) {
if (result.size() == outOfOrderReads.size())
outOfOrderReads.clear();
else
outOfOrderReads = new TreeSet<GATKSAMRecord>(outOfOrderReads.tailSet(newHead));
}
return result;
}
/**
* sends the read to the MultiSampleCompressor
*
* @param read the read to be compressed
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
*/
public Iterable<GATKSAMRecord> compress(GATKSAMRecord read) {
return compressor.addAlignment(read);
}
/**
* Add a read to the stash
*
* @param read any read
*/
public void add(GATKSAMRecord read) {
outOfOrderReads.add(read);
}
/**
 * Closes the stash: feeds every remaining stashed read (in sorted order) through the
 * compressor, then drains the compressor itself.
 *
 * @return a list of all the reads produced by the SlidingWindow machinery
 */
public Iterable<GATKSAMRecord> close() {
    final LinkedList<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
    // flush all stashed reads through the compressor, in order
    for (final GATKSAMRecord stashedRead : outOfOrderReads)
        for (final GATKSAMRecord compressedRead : compressor.addAlignment(stashedRead))
            finalizedReads.add(compressedRead);
    // pick up whatever the compressor is still holding
    for (final GATKSAMRecord remainingRead : compressor.close())
        finalizedReads.add(remainingRead);
    return finalizedReads;
}
/**
 * Useful debug functionality: prints every read currently held in the stash,
 * one per line, with its cigar and alignment boundaries.
 */
public void print() {
    System.out.println("Stash Contents:");
    int readIndex = 0;
    for (final GATKSAMRecord stashedRead : outOfOrderReads) {
        readIndex++;
        System.out.println(String.format("%3d: %s %d %d", readIndex, stashedRead.getCigarString(), stashedRead.getAlignmentStart(), stashedRead.getAlignmentEnd()));
    }
    System.out.println();
}
}

View File

@ -0,0 +1,665 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.util.SequenceUtil;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocComparator;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
/**
* Reduces the BAM file using read based compression that keeps only essential information for variant calling
* <p/>
* <p>
* This walker will generated reduced versions of the BAM files that still follow the BAM spec
* and contain all the information necessary for the GSA variant calling pipeline. Some options
* allow you to tune in how much compression you want to achieve. The default values have been
* shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
* savings in file size and performance of the downstream tools.
* <p/>
* <h2>Input</h2>
* <p>
* The BAM file to be compressed
* </p>
* <p/>
* <h2>Output</h2>
* <p>
* The compressed (reduced) BAM file.
* </p>
* <p/>
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ReduceReads \
* -I myData.bam \
* -o myData.reduced.bam
* </pre>
*/
@PartitionBy(PartitionType.INTERVAL)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
public class ReduceReadsWalker extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {

    // Writer for the reduced (compressed) BAM file.
    @Output
    protected StingSAMFileWriter out;

    /**
     * The number of bases to keep around mismatches (potential variation)
     */
    @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
    protected int contextSize = 10;

    /**
     * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
     * mapping quality below this threshold will not be counted towards consensus, but are still counted
     * towards variable regions.
     */
    @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
    protected int minMappingQuality = 20;

    /**
     * The minimum base quality to be considered for the consensus synthetic read. Reads that have
     * base quality below this threshold will not be counted towards consensus, but are still counted
     * towards variable regions.
     */
    @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
    protected byte minBaseQual = 20;

    /**
     * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
     * lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
     */
    @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
    protected byte minTailQuality = 2;

    /**
     * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
     * and read group).
     */
    @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
    protected boolean DONT_SIMPLIFY_READS = false;

    /**
     * Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
     * The program will behave correctly in those cases.
     */
    @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
    protected boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;

    /**
     * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
     * quality.
     */
    @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
    protected boolean DONT_CLIP_LOW_QUAL_TAILS = false;

    /**
     * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
     * base left by the aligner and use the high quality soft clipped bases in its traversal algorithm to identify variant
     * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
     */
    @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
    protected boolean DONT_USE_SOFTCLIPPED_BASES = false;

    /**
     * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
     * uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather
     * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
     */
    @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
    protected boolean DONT_COMPRESS_READ_NAMES = false;

    /**
     * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
     * considered consensus.
     */
    @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
    protected double minAltProportionToTriggerVariant = 0.05;

    /**
     * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
     * considered consensus.
     */
    @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
    protected double minIndelProportionToTriggerVariant = 0.01;

    /**
     * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
     * A value of 0 turns downsampling off.
     */
    @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
    protected int downsampleCoverage = 0;

    // Debug verbosity: 1 = trace reads through map/reduce/output, 2 = run extra QC on every output read.
    // NOTE(review): fullName is empty -- this flag is only reachable through its short name "-dl"; confirm intentional.
    @Hidden
    @Argument(fullName = "", shortName = "dl", doc = "", required = false)
    protected int debugLevel = 0;

    // If non-empty, prints a marker when a read whose name contains this string passes through map().
    // NOTE(review): fullName is empty here too -- only reachable via "-dr"; confirm intentional.
    @Hidden
    @Argument(fullName = "", shortName = "dr", doc = "", required = false)
    protected String debugRead = "";

    @Hidden
    @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
    protected DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;

    // Suppresses the @PG record in the output header (used mainly for deterministic test output).
    @Hidden
    @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
    private boolean NO_PG_TAG = false;

    // Strategy used when downsampling variable regions (consumed by the SlidingWindow machinery).
    public enum DownsampleStrategy {
        Normal,
        Adaptive
    }

    protected int totalReads = 0;                  // number of reads seen by map() (see NOTE in outputRead about the extra increment there)
    int nCompressedReads = 0;                      // number of reduced (consensus) reads written to the output
    HashMap<String, Long> readNameHash;            // This hash will keep the name of the original read the new compressed name (a number).
    Long nextReadNumber = 1L;                      // The next number to use for the compressed read name.
    SortedSet<GenomeLoc> intervalList;             // sorted intervals to traverse; empty means whole-genome mode (see isWholeGenome())

    private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads";   // The name that will go in the @PG tag

    /**
     * Basic generic initialization of the readNameHash and the intervalList. Output initialization
     * is done at the reduceInit method
     */
    @Override
    public void initialize() {
        super.initialize();
        GenomeAnalysisEngine toolkit = getToolkit();
        readNameHash = new HashMap<String, Long>();                        // prepare the read name hash to keep track of what reads have had their read names compressed
        intervalList = new TreeSet<GenomeLoc>(new GenomeLocComparator());  // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
        if (toolkit.getIntervals() != null)
            intervalList.addAll(toolkit.getIntervals());
        if (!NO_PG_TAG)
            Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME);   // adds the @PG record to the output header
        else
            out.setPresorted(false);
    }

    /**
     * Takes in a read and prepares it for the SlidingWindow machinery by performing the
     * following optional clipping operations:
     * 1. Hard clip adaptor sequences
     * 2. Hard clip low quality tails
     * 3. Hard clip all remaining soft clipped bases
     * 4. Hard clip read to the intervals in the interval list (this step may produce multiple reads)
     *
     * @param ref             default map parameter
     * @param read            default map parameter
     * @param metaDataTracker default map parameter
     * @return a linked list with all the reads produced by the clipping operations
     */
    @Override
    public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
        LinkedList<GATKSAMRecord> mappedReads;
        totalReads++;
        if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
            System.out.println("Found debug read!");
        if (debugLevel == 1)
            System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd());
        // we write the actual alignment starts to their respective alignment shift tags in the temporary
        // attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file
        read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart());
        read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd());
        if (!DONT_SIMPLIFY_READS)
            read.simplify();                                               // Clear all unnecessary attributes
        if (!DONT_CLIP_ADAPTOR_SEQUENCES)
            read = ReadClipper.hardClipAdaptorSequence(read);              // Strip away adaptor sequences, if any.
        if (!DONT_CLIP_LOW_QUAL_TAILS)
            read = ReadClipper.hardClipLowQualEnds(read, minTailQuality);  // Clip low quality tails
        if (!isWholeGenome())
            mappedReads = hardClipReadToInterval(read);                    // Hard clip the remainder of the read to the desired interval
        else {
            mappedReads = new LinkedList<GATKSAMRecord>();
            if (!read.isEmpty())
                mappedReads.add(read);
        }
        // drop low-quality soft clips; any read fully consumed by the clipping is discarded
        if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) {
            LinkedList<GATKSAMRecord> tempList = new LinkedList<GATKSAMRecord>();
            for (GATKSAMRecord mRead : mappedReads) {
                GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual);
                if (!clippedRead.isEmpty())
                    tempList.add(clippedRead);
            }
            mappedReads = tempList;
        }
        if (debugLevel == 1)
            for (GATKSAMRecord mappedRead : mappedReads)
                System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd());
        return mappedReads;
    }

    /**
     * Initializes the ReduceReadsStash that keeps track of all reads that are waiting to
     * enter the SlidingWindow machinery. The stash makes sure reads are served in order
     * even though map() may generate reads that are only supposed to enter the machinery
     * in the future.
     *
     * @return the empty stash
     */
    @Override
    public ReduceReadsStash reduceInit() {
        return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
    }

    /**
     * Takes the list of reads produced by map(), adds them to the stash (which keeps them sorted) and process
     * all reads that come before the original read (the read that was passed to map) including the original
     * read. This is where we send reads, in order, to the SlidingWindow machinery.
     *
     * @param mappedReads the list of reads sent by map
     * @param stash       the stash that keeps the reads in order for processing
     * @return the stash with all reads that have not been processed yet
     */
    // NOTE(review): missing @Override annotation (this overrides ReadWalker.reduce) -- harmless, but inconsistent with the other overrides.
    public ReduceReadsStash reduce(LinkedList<GATKSAMRecord> mappedReads, ReduceReadsStash stash) {
        if (debugLevel == 1)
            stash.print();
        boolean firstRead = true;
        for (GATKSAMRecord read : mappedReads) {
            // only the first element of the list can be the "original" read (the others are clipped tails)
            boolean originalRead = firstRead && isOriginalRead(mappedReads, read);
            if (read.getReadLength() == 0)
                throw new ReviewedStingException("Empty read sent to reduce, this should never happen! " + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
            if (originalRead) {
                // process all stashed reads that come before the original read, then the original read itself
                List<GATKSAMRecord> readsReady = new LinkedList<GATKSAMRecord>();
                readsReady.addAll(stash.getAllReadsBefore(read));
                readsReady.add(read);
                for (GATKSAMRecord readReady : readsReady) {
                    if (debugLevel == 1)
                        System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd());
                    for (GATKSAMRecord compressedRead : stash.compress(readReady))
                        outputRead(compressedRead);
                }
            } else
                stash.add(read);        // tails (and non-contained reads) wait in the stash for their turn
            firstRead = false;
        }
        return stash;
    }

    /**
     * Now that no more reads will come, we process all the remaining reads in the stash, in order.
     *
     * @param stash the ReduceReadsStash with all unprocessed reads (from reduce)
     */
    @Override
    public void onTraversalDone(ReduceReadsStash stash) {
        // output any remaining reads in the compressor
        for (GATKSAMRecord read : stash.close())
            outputRead(read);
    }

    /**
     * Hard clips away all parts of the read that doesn't agree with the intervals selected.
     *
     * Note: If read overlaps more than one interval, it will be hard clipped to all
     * the intervals it overlaps with
     *
     * @param read the read to be hard clipped to the interval.
     * @return a shallow copy of the read hard clipped to the interval
     */
    private LinkedList<GATKSAMRecord> hardClipReadToInterval(GATKSAMRecord read) {
        LinkedList<GATKSAMRecord> clippedReads = new LinkedList<GATKSAMRecord>();
        GenomeLoc intervalOverlapped = null;   // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list)
        boolean originalRead = true;           // false if this is the right tail of the original read
        boolean overlap;                       // keeps track of the interval that overlapped the original read
        boolean doneClipping;                  // triggers an early exit if we are done clipping this read
        if (isWholeGenome())
            clippedReads.add(read);            // if we don't have intervals (wgs) the read goes in unchanged
        for (GenomeLoc interval : intervalList) {
            if (read.isEmpty())                // nothing to do with an empty read (could have been fully clipped before)
                break;
            GATKSAMRecord clippedRead = null;  // this will hold the read clipped to the interval to be added in the end of the switch
            switch (ReadUtils.getReadAndIntervalOverlapType(read, interval)) {
                case NO_OVERLAP_RIGHT:         // no reads on this interval, check the next interval if this is the original read
                    if (!originalRead)         // something went wrong if this is the tail of the read
                        throw new ReviewedStingException("tail of the read should never NO_OVERLAP_RIGHT the following interval. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
                    overlap = false;
                    doneClipping = false;
                    break;
                case NO_OVERLAP_HARDCLIPPED_RIGHT:   // read used to overlap but got hard clipped and doesn't overlap anymore
                    if (originalRead) {
                        overlap = true;              // effectively, we have found the read's location and now we are going to try and match it's tail (which happens to be the entire read).
                        clippedRead = GATKSAMRecord.emptyRead(read);
                    } else
                        overlap = false;
                    doneClipping = false;
                    break;
                case NO_OVERLAP_CONTIG:              // read is in a different contig
                    if (originalRead) {              // the original read can be in a bigger contig, but not on a smaller one.
                        if (read.getReferenceIndex() < interval.getContigIndex())
                            throw new ReviewedStingException("read is behind interval list. (contig) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
                        else {
                            overlap = false;
                            doneClipping = false;
                        }
                    } // tail read CANNOT be in a different contig.
                    else {
                        if (read.getReferenceIndex() < interval.getContigIndex()) {
                            overlap = false;
                            doneClipping = true;
                        } else
                            throw new ReviewedStingException("Tail read is in bigger contig than interval traversal. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
                    }
                    break;
                case NO_OVERLAP_LEFT:
                    if (originalRead)                // if this is the first read this should never happen.
                        throw new ReviewedStingException("original read cannot be behind the first interval. (position) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
                    overlap = false;
                    doneClipping = true;
                    break;
                case NO_OVERLAP_HARDCLIPPED_LEFT:    // read used to overlap but got hard clipped and doesn't overlap anymore
                    overlap = originalRead;          // if this is the original read, we should not advance the interval list, the original overlap was here.
                    doneClipping = true;
                    break;
                case OVERLAP_LEFT:                   // clip the left tail of the read
                    clippedRead = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStart() - 1);
                    overlap = true;
                    doneClipping = true;
                    break;
                case OVERLAP_RIGHT:                  // clip the right tail of the read and try to match it to the next interval
                    clippedRead = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, interval.getStop() + 1);
                    read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());
                    overlap = true;
                    doneClipping = false;
                    break;
                case OVERLAP_LEFT_AND_RIGHT:         // clip both left and right ends of the read
                    clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, interval.getStart() - 1, interval.getStop() + 1);
                    read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());
                    overlap = true;
                    doneClipping = false;
                    break;
                case OVERLAP_CONTAINED:              // don't do anything to the read
                    clippedRead = read;
                    overlap = true;
                    doneClipping = true;
                    break;
                default:
                    throw new ReviewedStingException("interval overlap returned an unknown / unhandled state. If new state was added to intervalOverlap, it should be handled by hardClipReadToInterval.");
            }
            if (overlap && originalRead)
                intervalOverlapped = interval;
            if (clippedRead != null) {
                originalRead = false;
                if (!clippedRead.isEmpty())
                    clippedReads.add(clippedRead);   // if the read overlaps the interval entirely within a deletion, it will be entirely clipped off
            }
            if (doneClipping)
                break;
        }
        // all intervals before the one the original read overlapped are done for good
        if (intervalOverlapped != null)
            intervalList = intervalList.tailSet(intervalOverlapped);
        return clippedReads;
    }

    /**
     * Compresses the read name and adds it to output BAM file (reduced BAM)
     * after performing some quality control
     *
     * @param read any read
     */
    private void outputRead(GATKSAMRecord read) {
        if (debugLevel == 2) {
            checkForHighMismatch(read);
            checkCigar(read);
        }
        if (read.isReducedRead())
            nCompressedReads++;
        else {
            int originalAlignmentStart = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT);
            int originalAlignmentEnd = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT);
            int startShift = originalAlignmentStart - read.getUnclippedStart();   // we annotate the shifts for better compression
            int endShift = read.getUnclippedEnd() - originalAlignmentEnd;         // we annotate the shifts for better compression
            if (startShift > 0)
                read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift);   // If the read had any soft clips before getting chopped (variant region) annotate its original alignment (start)
            if (endShift > 0)
                read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift);       // If the read had any soft clips before getting chopped (variant region) annotate its original alignment (end)
            // NOTE(review): totalReads is also incremented in map(), so non-reduced output reads are counted twice -- confirm whether this double count is intentional.
            totalReads++;
        }
        if (debugLevel == 1)
            System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd());
//        if (!DONT_USE_SOFTCLIPPED_BASES)
//            reSoftClipBases(read);
        if (!DONT_COMPRESS_READ_NAMES)
            compressReadName(read);
        out.addAlignment(read);
    }

    /**
     * Re-introduces soft clips (recorded in the temporary "SL"/"SR" attributes) into the
     * read's cigar and adjusts its alignment start accordingly.
     *
     * NOTE(review): currently dead code -- its only call site in outputRead is commented out.
     *
     * @param read any read carrying optional "SL"/"SR" temporary attributes
     */
    private void reSoftClipBases(GATKSAMRecord read) {
        Integer left = (Integer) read.getTemporaryAttribute("SL");
        Integer right = (Integer) read.getTemporaryAttribute("SR");
        if (left != null || right != null) {
            // deep-copy the cigar so we never mutate the read's original elements
            Cigar newCigar = new Cigar();
            for (CigarElement element : read.getCigar().getCigarElements()) {
                newCigar.add(new CigarElement(element.getLength(), element.getOperator()));
            }
            if (left != null) {
                newCigar = updateFirstSoftClipCigarElement(left, newCigar);
                read.setAlignmentStart(read.getAlignmentStart() + left);
            }
            if (right != null) {
                // handle the right end by inverting the cigar, updating its (now first) element, and inverting back
                Cigar invertedCigar = invertCigar(newCigar);
                newCigar = invertCigar(updateFirstSoftClipCigarElement(right, invertedCigar));
            }
            read.setCigar(newCigar);
        }
    }

    /**
     * Facility routine to revert the first element of a Cigar string (skipping hard clips) into a soft-clip.
     * To be used on both ends if provided a flipped Cigar
     *
     * @param softClipSize  the length of the soft clipped element to add
     * @param originalCigar the original Cigar string
     * @return a new Cigar object with the soft clips added
     */
    private Cigar updateFirstSoftClipCigarElement (int softClipSize, Cigar originalCigar) {
        Cigar result = new Cigar();
        CigarElement leftElement = new CigarElement(softClipSize, CigarOperator.S);
        boolean updated = false;
        for (CigarElement element : originalCigar.getCigarElements()) {
            // only the first M element is split; everything before it (e.g. hard clips) is copied verbatim
            if (!updated && element.getOperator() == CigarOperator.M) {
                result.add(leftElement);
                int newLength = element.getLength() - softClipSize;
                if (newLength > 0)
                    result.add(new CigarElement(newLength, CigarOperator.M));
                updated = true;
            }
            else
                result.add(element);
        }
        return result;
    }

    /**
     * Given a cigar string, returns the inverted cigar string.
     *
     * @param cigar the original cigar
     * @return the inverted cigar
     */
    private Cigar invertCigar(Cigar cigar) {
        Stack<CigarElement> stack = new Stack<CigarElement>();
        for (CigarElement e : cigar.getCigarElements())
            stack.push(e);
        Cigar inverted = new Cigar();
        while (!stack.empty()) {
            inverted.add(stack.pop());
        }
        return inverted;
    }

    /**
     * Quality control procedure that checks if the consensus reads contains too many
     * mismatches with the reference. This should never happen and is a good trigger for
     * errors with the algorithm.
     *
     * @param read any read
     */
    private void checkForHighMismatch(GATKSAMRecord read) {
        final int start = read.getAlignmentStart();
        final int stop = read.getAlignmentEnd();
        final byte[] ref = getToolkit().getReferenceDataSource().getReference().getSubsequenceAt(read.getReferenceName(), start, stop).getBases();
        final int nm = SequenceUtil.countMismatches(read, ref, start - 1);
        final int readLen = read.getReadLength();
        final double nmFraction = nm / (1.0 * readLen);
        // only consensus reads are subject to this check, and only when they are long enough to make the fraction meaningful
        if (nmFraction > 0.4 && readLen > 20 && read.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG) != null && read.getReadName().startsWith("Consensus"))
            throw new ReviewedStingException("BUG: High mismatch fraction found in read " + read.getReadName() + " position: " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
    }

    /**
     * Quality control procedure that fails hard if an output read carries an invalid cigar.
     *
     * @param read any read
     */
    private void checkCigar (GATKSAMRecord read) {
        if (read.getCigar().isValid(null, -1) != null) {
            throw new ReviewedStingException("BUG: cigar string is not valid: " + read.getCigarString());
        }
    }

    /**
     * Compresses the read name using the readNameHash if we have already compressed
     * this read name before.
     *
     * @param read any read
     */
    private void compressReadName(GATKSAMRecord read) {
        String name = read.getReadName();
        String compressedName = read.isReducedRead() ? "C" : "";   // consensus reads are prefixed with "C"
        if (readNameHash.containsKey(name))
            compressedName += readNameHash.get(name).toString();   // mates share the original name, so they get the same number
        else {
            readNameHash.put(name, nextReadNumber);
            compressedName += nextReadNumber.toString();
            nextReadNumber++;
        }
        read.setReadName(compressedName);
    }

    /**
     * Returns true if the read is the original read that went through map().
     *
     * This is important to know so we can decide what reads to pull from the stash. Only reads that came before the original read should be pulled.
     *
     * @param list the list
     * @param read the read
     * @return Returns true if the read is the original read that went through map().
     */
    private boolean isOriginalRead(LinkedList<GATKSAMRecord> list, GATKSAMRecord read) {
        // in WGS mode map() never splits reads, so the read is always "original"; otherwise it must be the
        // first of the list AND fully contained in the current head interval
        return isWholeGenome() || (list.getFirst().equals(read) && ReadUtils.getReadAndIntervalOverlapType(read, intervalList.first()) == ReadUtils.ReadAndIntervalOverlap.OVERLAP_CONTAINED);
    }

    /**
     * Checks whether or not the intervalList is empty, meaning we're running in WGS mode.
     *
     * @return whether or not we're running in WGS mode.
     */
    private boolean isWholeGenome() {
        return intervalList.isEmpty();
    }
}

View File

@ -0,0 +1,83 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.TreeSet;
/**
*
* @author depristo
* @version 0.1
*/
public class SingleSampleCompressor implements Compressor {
    protected static final Logger logger = Logger.getLogger(SingleSampleCompressor.class);

    protected final int contextSize;
    protected final int downsampleCoverage;
    protected int minMappingQuality;
    protected int slidingWindowCounter;              // sequential id handed to each new SlidingWindow
    protected final String sampleName;
    protected SlidingWindow slidingWindow;           // the currently open window, or null
    protected double minAltProportionToTriggerVariant;
    protected double minIndelProportionToTriggerVariant;
    protected int minBaseQual;
    protected ReduceReadsWalker.DownsampleStrategy downsampleStrategy;

    /**
     * Builds a compressor for one sample; the thresholds are forwarded verbatim to every
     * SlidingWindow this compressor creates.
     */
    public SingleSampleCompressor(final String sampleName,
                                  final int contextSize,
                                  final int downsampleCoverage,
                                  final int minMappingQuality,
                                  final double minAltProportionToTriggerVariant,
                                  final double minIndelProportionToTriggerVariant,
                                  final int minBaseQual,
                                  final ReduceReadsWalker.DownsampleStrategy downsampleStrategy) {
        this.sampleName = sampleName;
        this.contextSize = contextSize;
        this.downsampleCoverage = downsampleCoverage;
        this.minMappingQuality = minMappingQuality;
        this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant;
        this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
        this.minBaseQual = minBaseQual;
        this.downsampleStrategy = downsampleStrategy;
        this.slidingWindowCounter = 0;               // no window created yet
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Iterable<GATKSAMRecord> addAlignment( GATKSAMRecord read ) {
        final TreeSet<GATKSAMRecord> finalizedReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
        final int unclippedStart = read.getUnclippedStart();

        // Close the current window when this read can no longer belong to it: it maps to a
        // different contig, or it starts beyond the window's reach (stop location + context size).
        if (slidingWindow != null) {
            final boolean differentContig = read.getReferenceIndex() != slidingWindow.getContigIndex();
            final boolean beyondWindow = unclippedStart - contextSize > slidingWindow.getStopLocation();
            if (differentContig || beyondWindow) {
                finalizedReads.addAll(slidingWindow.close());
                slidingWindow = null;                // forces a fresh window below
            }
        }

        // No open window (very first read, or we just closed one) -- start a new one here.
        if (slidingWindow == null) {
            slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
            slidingWindowCounter++;
        }

        finalizedReads.addAll(slidingWindow.addRead(read));
        return finalizedReads;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Iterable<GATKSAMRecord> close() {
        if (slidingWindow == null)
            return new TreeSet<GATKSAMRecord>();     // nothing was ever added
        return slidingWindow.close();
    }
}

View File

@ -0,0 +1,713 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.gatk.downsampling.FractionalDownsampler;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
/**
* Created by IntelliJ IDEA.
* User: roger
* Date: 8/3/11
* Time: 2:24 PM
*/
public class SlidingWindow {
// Sliding Window data
final private LinkedList<GATKSAMRecord> readsInWindow;    // the reads currently held by the window
final private LinkedList<HeaderElement> windowHeader;     // per-position elements of the window (first element = window start)
protected int contextSize;           // the largest context size (between mismatches and indels)
protected int stopLocation;          // -1 until set; presumably the last genomic position covered by the window -- confirm in addRead/updateHeaderCounts
protected String contig;             // name of the contig this window is on
protected int contigIndex;           // index of the contig this window is on
protected SAMFileHeader header;      // SAM header handed to the window at construction
protected GATKSAMReadGroupRecord readGroupAttribute;   // read group assigned to this window's reads
protected int downsampleCoverage;    // downsampling target passed in from the walker (0 = off, per walker docs)
// Running consensus data
protected SyntheticRead runningConsensus;
protected int consensusCounter;
protected String consensusReadName;  // name prefix: "Consensus-<windowNumber>-"
// Filtered Data Consensus data
protected SyntheticRead filteredDataConsensus;
protected int filteredDataConsensusCounter;
protected String filteredDataReadName;   // name prefix: "Filtered-<windowNumber>-"
// Additional parameters
protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT;   // proportion has to be greater than this value to trigger variant region due to mismatches
protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions
protected int MIN_BASE_QUAL_TO_COUNT;    // qual has to be greater than or equal to this value
protected int MIN_MAPPING_QUALITY;       // mapping-quality threshold forwarded from the walker
protected ReduceReadsWalker.DownsampleStrategy downsampleStrategy;
private boolean hasIndelQualities;       // whether the incoming reads carry base insertion/deletion qualities
/**
 * The types of synthetic reads to use in the finalizeAndAdd method
 */
private enum ConsensusType {
    CONSENSUS,
    FILTERED,
    BOTH
}
/**
 * @return the window's stop location, or -1 if it has not been set yet
 */
public int getStopLocation() {
    return stopLocation;
}
/**
 * @return the name of the contig this window is on
 */
public String getContig() {
    return contig;
}
/**
 * @return the index of the contig this window is on
 */
public int getContigIndex() {
    return contigIndex;
}
/**
 * @return the location of the first header element of the window, or -1 if the window is empty
 */
public int getStartLocation() {
    final HeaderElement firstElement = windowHeader.peek();   // LinkedList.peek() returns null exactly when the list is empty
    return (firstElement == null) ? -1 : firstElement.getLocation();
}
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReadsWalker.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) {
this.stopLocation = -1;
this.contextSize = contextSize;
this.downsampleCoverage = downsampleCoverage;
this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant;
this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant;
this.MIN_BASE_QUAL_TO_COUNT = minBaseQual;
this.MIN_MAPPING_QUALITY = minMappingQuality;
this.windowHeader = new LinkedList<HeaderElement>();
this.readsInWindow = new LinkedList<GATKSAMRecord>();
this.contig = contig;
this.contigIndex = contigIndex;
this.header = header;
this.readGroupAttribute = readGroupAttribute;
this.consensusCounter = 0;
this.consensusReadName = "Consensus-" + windowNumber + "-";
this.filteredDataConsensusCounter = 0;
this.filteredDataReadName = "Filtered-" + windowNumber + "-";
this.runningConsensus = null;
this.filteredDataConsensus = null;
this.downsampleStrategy = downsampleStrategy;
this.hasIndelQualities = hasIndelQualities;
}
/**
* Add a read to the sliding window and slides the window accordingly.
*
* Reads are assumed to be in order, therefore, when a read is added the sliding window can
* assume that no more reads will affect read.getUnclippedStart() - contextSizeMismatches. The window
* slides forward to that position and returns all reads that may have been finalized in the
* sliding process.
*
* @param read the read
* @return a list of reads that have been finished by sliding the window.
*/
public List<GATKSAMRecord> addRead(GATKSAMRecord read) {
updateHeaderCounts(read, false); // update the window header counts
readsInWindow.add(read); // add read to sliding reads
return slideWindow(read.getUnclippedStart());
}
/**
* returns the next complete or incomplete variant region between 'from' (inclusive) and 'to' (exclusive)
*
* @param from beginning window header index of the search window (inclusive)
* @param to end window header index of the search window (exclusive)
* @param variantSite boolean array with true marking variant regions
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
*/
private Pair<Integer, Integer> getNextVariantRegion(int from, int to, boolean[] variantSite) {
boolean foundStart = false;
int variantRegionStartIndex = 0;
for (int i=from; i<to; i++) {
if (variantSite[i] && !foundStart) {
variantRegionStartIndex = i;
foundStart = true;
}
else if(!variantSite[i] && foundStart) {
return(new Pair<Integer, Integer>(variantRegionStartIndex, i-1));
}
}
return (foundStart) ? new Pair<Integer, Integer>(variantRegionStartIndex, -1) : null;
}
/**
* Creates a list with all the complete and incomplete variant regions within 'from' (inclusive) and 'to' (exclusive)
*
* @param from beginning window header index of the search window (inclusive)
* @param to end window header index of the search window (exclusive)
* @param variantSite boolean array with true marking variant regions
* @return a list with start/stops of variant regions following getNextVariantRegion description
*/
private List<Pair<Integer, Integer>> getAllVariantRegions(int from, int to, boolean[] variantSite) {
List<Pair<Integer,Integer>> regions = new LinkedList<Pair<Integer, Integer>>();
int index = from;
while(index < to) {
Pair<Integer,Integer> result = getNextVariantRegion(index, to, variantSite);
if (result == null)
break;
regions.add(result);
if (result.getSecond() < 0)
break;
index = result.getSecond() + 1;
}
return regions;
}
/**
* Determines if the window can be slid given the new incoming read.
*
* We check from the start of the window to the (unclipped) start of the new incoming read if there
* is any variant.
* If there are variant sites, we check if it's time to close the variant region.
*
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
* @return all reads that have fallen to the left of the sliding window after the slide
*/
protected List<GATKSAMRecord> slideWindow(int incomingReadUnclippedStart) {
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
if (incomingReadUnclippedStart - contextSize > getStartLocation()) {
int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation();
boolean[] variantSite = markSites(getStartLocation() + readStartHeaderIndex);
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, variantSite);
finalizedReads = closeVariantRegions(regions, false);
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
if (read.getAlignmentEnd() < getStartLocation()) {
readsToRemove.add(read);
}
}
for (GATKSAMRecord read : readsToRemove) {
readsInWindow.remove(read);
}
}
return finalizedReads;
}
/**
* returns an array marked with variant and non-variant regions (it uses
* markVariantRegions to make the marks)
*
* @param stop check the window from start to stop (not-inclusive)
* @return a boolean array with 'true' marking variant regions and false marking consensus sites
*/
protected boolean[] markSites(int stop) {
boolean[] markedSites = new boolean[stop - getStartLocation() + contextSize + 1];
Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
for (int i = getStartLocation(); i < stop; i++) {
if (headerElementIterator.hasNext()) {
HeaderElement headerElement = headerElementIterator.next();
if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
markVariantRegion(markedSites, i - getStartLocation());
} else
break;
}
return markedSites;
}
/**
* Marks the sites around the variant site (as true)
*
* @param markedSites the boolean array to bear the marks
* @param variantSiteLocation the location where a variant site was found
*/
protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) {
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? markedSites.length : variantSiteLocation + contextSize + 1;
for (int i = from; i < to; i++)
markedSites[i] = true;
}
/**
* Adds bases to the running consensus or filtered data accordingly
*
* If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus
*
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
* @return a list of consensus reads generated by this call. Empty list if no consensus was generated.
*/
protected List<GATKSAMRecord> addToSyntheticReads(int start, int end) {
LinkedList<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
if (start < end) {
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
if (!headerElementIterator.hasNext())
throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end));
HeaderElement headerElement = headerElementIterator.next();
if (headerElement.hasConsensusData()) {
reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
int endOfConsensus = findNextNonConsensusElement(start, end);
addToRunningConsensus(start, endOfConsensus);
if (endOfConsensus <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));
reads.addAll(addToSyntheticReads(endOfConsensus, end));
} else if (headerElement.hasFilteredData()) {
reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
int endOfFilteredData = findNextNonFilteredDataElement(start, end);
addToFilteredData(start, endOfFilteredData);
if (endOfFilteredData <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
reads.addAll(addToSyntheticReads(endOfFilteredData, end));
} else if (headerElement.isEmpty()) {
reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
int endOfEmptyData = findNextNonEmptyElement(start, end);
if (endOfEmptyData <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
reads.addAll(addToSyntheticReads(endOfEmptyData, end));
} else
throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));
}
return reads;
}
/**
* Finalizes one or more synthetic reads.
*
* @param type the synthetic reads you want to close
* @return the GATKSAMRecords generated by finalizing the synthetic reads
*/
private List<GATKSAMRecord> finalizeAndAdd(ConsensusType type) {
GATKSAMRecord read = null;
List<GATKSAMRecord> list = new LinkedList<GATKSAMRecord>();
switch (type) {
case CONSENSUS:
read = finalizeRunningConsensus();
break;
case FILTERED:
read = finalizeFilteredDataConsensus();
break;
case BOTH:
read = finalizeRunningConsensus();
if (read != null) list.add(read);
read = finalizeFilteredDataConsensus();
}
if (read != null)
list.add(read);
return list;
}
/**
* Looks for the next position without consensus data
*
* @param start beginning of the filtered region
* @param upTo limit to search for another consensus element
* @return next position with consensus data or empty
*/
private int findNextNonConsensusElement(int start, int upTo) {
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
int index = start;
while (index < upTo) {
if (!headerElementIterator.hasNext())
throw new ReviewedStingException("There are no more header elements in this window");
HeaderElement headerElement = headerElementIterator.next();
if (!headerElement.hasConsensusData())
break;
index++;
}
return index;
}
/**
* Looks for the next position without filtered data
*
* @param start beginning of the region
* @param upTo limit to search for
* @return next position with no filtered data
*/
private int findNextNonFilteredDataElement(int start, int upTo) {
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
int index = start;
while (index < upTo) {
if (!headerElementIterator.hasNext())
throw new ReviewedStingException("There are no more header elements in this window");
HeaderElement headerElement = headerElementIterator.next();
if (!headerElement.hasFilteredData() || headerElement.hasConsensusData())
break;
index++;
}
return index;
}
/**
* Looks for the next non-empty header element
*
* @param start beginning of the region
* @param upTo limit to search for
* @return next position with non-empty element
*/
private int findNextNonEmptyElement(int start, int upTo) {
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
int index = start;
while (index < upTo) {
if (!headerElementIterator.hasNext())
throw new ReviewedStingException("There are no more header elements in this window");
HeaderElement headerElement = headerElementIterator.next();
if (!headerElement.isEmpty())
break;
index++;
}
return index;
}
/**
* Adds bases to the filtered data synthetic read.
*
* Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
* bases.
*
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
*/
private void addToFilteredData(int start, int end) {
if (filteredDataConsensus == null)
filteredDataConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
for (int index = start; index < end; index++) {
if (!headerElementIterator.hasNext())
throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist");
HeaderElement headerElement = headerElementIterator.next();
if (headerElement.hasConsensusData())
throw new ReviewedStingException("Found consensus data inside region to add to filtered data.");
if (!headerElement.hasFilteredData())
throw new ReviewedStingException("No filtered data in " + index);
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
}
}
/**
* Adds bases to the filtered data synthetic read.
*
* Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
* bases.
*
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
*/
private void addToRunningConsensus(int start, int end) {
if (runningConsensus == null)
runningConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
for (int index = start; index < end; index++) {
if (!headerElementIterator.hasNext())
throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
HeaderElement headerElement = headerElementIterator.next();
if (!headerElement.hasConsensusData())
throw new ReviewedStingException("No CONSENSUS data in " + index);
genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS());
}
}
/**
* Generic accessor to add base and qualities to a synthetic read
*
* @param syntheticRead the synthetic read to add to
* @param baseCounts the base counts object in the header element
* @param rms the rms mapping quality in the header element
*/
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
BaseIndex base = baseCounts.baseIndexWithMostCounts();
byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
byte qual = baseCounts.averageQualsOfMostCommonBase();
byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
syntheticRead.add(base, count, qual, insQual, delQual, rms);
}
/**
* Finalizes a variant region, any adjacent synthetic reads.
*
* @param start the first window header index in the variant region (inclusive)
* @param stop the last window header index of the variant region (inclusive)
* @return all reads contained in the variant region plus any adjacent synthetic reads
*/
@Requires("start <= stop")
protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
int refStart = windowHeader.get(start).getLocation(); // All operations are reference based, not read based
int refStop = windowHeader.get(stop).getLocation();
for (GATKSAMRecord read : readsInWindow) { // Keep all reads that overlap the variant region
if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) {
allReads.add(read);
updateHeaderCounts(read, true); // Remove this read from the window header entirely
}
}
List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
result.addAll(addToSyntheticReads(0, start));
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
for (GATKSAMRecord read : result) {
readsInWindow.remove(read); // todo -- not optimal, but needs to be done so the next region doesn't try to remove the same reads from the header counts.
}
return result; // finalized reads will be downsampled if necessary
}
private List<GATKSAMRecord> closeVariantRegions(List<Pair<Integer, Integer>> regions, boolean forceClose) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
if (!regions.isEmpty()) {
int lastStop = -1;
for (Pair<Integer, Integer> region : regions) {
int start = region.getFirst();
int stop = region.getSecond();
if (stop < 0 && forceClose)
stop = windowHeader.size() - 1;
if (stop >= 0) {
allReads.addAll(closeVariantRegion(start, stop));
lastStop = stop;
}
}
for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
}
return allReads;
}
/**
* Downsamples a variant region to the downsample coverage of the sliding window.
*
* It will use the downsampling strategy defined by the SlidingWindow
*
* @param allReads the reads to select from (all reads that cover the window)
* @return a list of reads selected by the downsampler to cover the window to at least the desired coverage
*/
protected List<GATKSAMRecord> downsampleVariantRegion(final List<GATKSAMRecord> allReads) {
double fraction = 100 / allReads.size();
if (fraction >= 1)
return allReads;
FractionalDownsampler <GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(fraction);
downsampler.submit(allReads);
return downsampler.consumeDownsampledItems();
}
/**
* Properly closes a Sliding Window, finalizing all consensus and variant
* regions that still exist regardless of being able to fulfill the
* context size requirement in the end.
*
* @return All reads generated
*/
public List<GATKSAMRecord> close() {
// mark variant regions
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
if (!windowHeader.isEmpty()) {
boolean[] variantSite = markSites(stopLocation + 1);
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), variantSite);
finalizedReads = closeVariantRegions(regions, true);
if (!windowHeader.isEmpty()) {
finalizedReads.addAll(addToSyntheticReads(0, windowHeader.size() - 1));
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
}
}
return finalizedReads;
}
/**
* generates the SAM record for the running consensus read and resets it (to null)
*
* @return the read contained in the running consensus
*/
protected GATKSAMRecord finalizeRunningConsensus() {
GATKSAMRecord finalizedRead = null;
if (runningConsensus != null) {
if (runningConsensus.size() > 0)
finalizedRead = runningConsensus.close();
else
consensusCounter--;
runningConsensus = null;
}
return finalizedRead;
}
/**
* generates the SAM record for the filtered data consensus and resets it (to null)
*
* @return the read contained in the running consensus
*/
protected GATKSAMRecord finalizeFilteredDataConsensus() {
GATKSAMRecord finalizedRead = null;
if (filteredDataConsensus != null) {
if (filteredDataConsensus.size() > 0)
finalizedRead = filteredDataConsensus.close();
else
filteredDataConsensusCounter--;
filteredDataConsensus = null;
}
return finalizedRead;
}
/**
* Updates the sliding window's header counts with the incoming read bases, insertions
* and deletions.
*
* @param read the incoming read to be added to the sliding window
*/
protected void updateHeaderCounts(GATKSAMRecord read, boolean removeRead) {
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();
byte[] insQuals = read.getExistingBaseInsertionQualities();
byte[] delQuals = read.getExistingBaseDeletionQualities();
int readStart = read.getSoftStart();
int readEnd = read.getSoftEnd();
Cigar cigar = read.getCigar();
int readBaseIndex = 0;
int startLocation = getStartLocation();
int locationIndex = startLocation < 0 ? 0 : readStart - startLocation;
if (removeRead && locationIndex < 0)
throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation);
if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it
if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window
for (int i = 1; i <= -locationIndex; i++)
windowHeader.addFirst(new HeaderElement(startLocation - i));
startLocation = readStart; // update start location accordingly
locationIndex = 0;
}
if (stopLocation < readEnd) { // Do we need to add extra elements to the header?
int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation;
while (elementsToAdd-- > 0)
windowHeader.addLast(new HeaderElement(readEnd - elementsToAdd));
stopLocation = readEnd; // update stopLocation accordingly
}
// Special case for leading insertions before the beginning of the sliding read
if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) {
windowHeader.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added
locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing.
}
}
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(locationIndex);
HeaderElement headerElement;
for (CigarElement cigarElement : cigar.getCigarElements()) {
switch (cigarElement.getOperator()) {
case H:
break;
case I:
if (removeRead && locationIndex == 0) { // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it.
break;
}
headerElement = windowHeader.get(locationIndex - 1); // insertions are added to the base to the left (previous element)
if (removeRead) {
headerElement.removeInsertionToTheRight();
}
else {
headerElement.addInsertionToTheRight();
}
readBaseIndex += cigarElement.getLength();
break; // just ignore the insertions at the beginning of the read
case D:
int nDeletions = cigarElement.getLength();
while (nDeletions-- > 0) { // deletions are added to the baseCounts with the read mapping quality as it's quality score
headerElement = headerElementIterator.next();
byte mq = (byte) read.getMappingQuality();
if (removeRead)
headerElement.removeBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
else
headerElement.addBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
locationIndex++;
}
break;
case S:
case M:
case P:
case EQ:
case X:
int nBasesToAdd = cigarElement.getLength();
while (nBasesToAdd-- > 0) {
headerElement = headerElementIterator.next();
byte insertionQuality = insQuals == null ? -1 : insQuals[readBaseIndex]; // if the read doesn't have indel qualities, use -1 (doesn't matter the value because it won't be used for anything)
byte deletionQuality = delQuals == null ? -1 : delQuals[readBaseIndex];
if (removeRead)
headerElement.removeBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
else
headerElement.addBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
readBaseIndex++;
locationIndex++;
}
break;
}
}
}
}

View File

@ -0,0 +1,285 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
/**
* Running Consensus is a read that is compressed as a sliding window travels over the reads
* and keeps track of all the bases that are outside of variant regions.
*
* Consensus reads have qual fields that correspond to the number of reads that had the base
* and passed the minimum quality threshold.
*
* The mapping quality of a consensus read is the average RMS of the mapping qualities of all reads
* that compose the consensus
*
* @author Mauricio Carneiro
* @since 8/26/11
*/
public class SyntheticRead {
private List<BaseIndex> bases;
private List<Byte> counts;
private List<Byte> quals;
private List<Byte> insertionQuals;
private List<Byte> deletionQuals;
private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus
private String readTag;
// Information to produce a GATKSAMRecord
private SAMFileHeader header;
private GATKSAMReadGroupRecord readGroupRecord;
private String contig;
private int contigIndex;
private String readName;
private Integer refStart;
private boolean hasIndelQualities = false;
/**
* Full initialization of the running consensus if you have all the information and are ready to
* start adding to the running consensus.
*
* @param header GATKSAMRecord file header
* @param readGroupRecord Read Group for the GATKSAMRecord
* @param contig the read's contig name
* @param contigIndex the read's contig index
* @param readName the read's name
* @param refStart the alignment start (reference based)
* @param readTag the reduce reads tag for the synthetic read
*/
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities) {
final int initialCapacity = 10000;
bases = new ArrayList<BaseIndex>(initialCapacity);
counts = new ArrayList<Byte>(initialCapacity);
quals = new ArrayList<Byte>(initialCapacity);
insertionQuals = new ArrayList<Byte>(initialCapacity);
deletionQuals = new ArrayList<Byte>(initialCapacity);
mappingQuality = 0.0;
this.readTag = readTag;
this.header = header;
this.readGroupRecord = readGroupRecord;
this.contig = contig;
this.contigIndex = contigIndex;
this.readName = readName;
this.refStart = refStart;
this.hasIndelQualities = hasIndelQualities;
}
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities) {
this.bases = bases;
this.counts = counts;
this.quals = quals;
this.insertionQuals = insertionQuals;
this.deletionQuals = deletionQuals;
this.mappingQuality = mappingQuality;
this.readTag = readTag;
this.header = header;
this.readGroupRecord = readGroupRecord;
this.contig = contig;
this.contigIndex = contigIndex;
this.readName = readName;
this.refStart = refStart;
this.hasIndelQualities = hasIndelQualities;
}
/**
* Easy access to keep adding to a running consensus that has already been
* initialized with the correct read name and refStart
*
* @param base the base to add
* @param count number of reads with this base
*/
@Requires("count < Byte.MAX_VALUE")
public void add(BaseIndex base, byte count, byte qual, byte insQual, byte delQual, double mappingQuality) {
counts.add(count);
bases.add(base);
quals.add(qual);
insertionQuals.add(insQual);
deletionQuals.add(delQual);
this.mappingQuality += mappingQuality;
}
public BaseIndex getBase(int readCoordinate) {
return bases.get(readCoordinate);
}
/**
* Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
*
* Invalid reads are :
* - exclusively composed of deletions
*
* @return a GATKSAMRecord or null
*/
public GATKSAMRecord close () {
if (isAllDeletions())
return null;
GATKSAMRecord read = new GATKSAMRecord(header);
read.setReferenceName(contig);
read.setReferenceIndex(contigIndex);
read.setReadPairedFlag(false);
read.setReadUnmappedFlag(false);
read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
read.setAlignmentStart(refStart);
read.setReadName(readName);
read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION);
read.setReadBases(convertReadBases());
read.setMappingQuality((int) Math.ceil(mappingQuality / bases.size()));
read.setReadGroup(readGroupRecord);
read.setAttribute(readTag, convertBaseCounts());
if (hasIndelQualities) {
read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION);
read.setBaseQualities(convertDeletionQualities(), EventType.BASE_DELETION);
}
return read;
}
/**
* Checks if the synthetic read is composed exclusively of deletions
*
* @return true if it is, false if it isn't.
*/
private boolean isAllDeletions() {
for (BaseIndex b : bases)
if (b != BaseIndex.D)
return false;
return true;
}
public int size () {
return bases.size();
}
private byte [] convertBaseQualities() {
return convertVariableGivenBases(bases, quals);
}
private byte [] convertInsertionQualities() {
return convertVariableGivenBases(bases, insertionQuals);
}
private byte [] convertDeletionQualities() {
return convertVariableGivenBases(bases, deletionQuals);
}
protected byte [] convertBaseCounts() {
byte[] countsArray = convertVariableGivenBases(bases, counts);
if (countsArray.length == 0)
throw new ReviewedStingException("Reduced read has counts array of length 0");
byte[] compressedCountsArray = new byte [countsArray.length];
compressedCountsArray[0] = countsArray[0];
for (int i = 1; i < countsArray.length; i++)
compressedCountsArray[i] = (byte) MathUtils.bound(countsArray[i] - compressedCountsArray[0], Byte.MIN_VALUE, Byte.MAX_VALUE);
return compressedCountsArray;
}
private byte [] convertReadBases() {
byte [] readArray = new byte[getReadLengthWithNoDeletions(bases)];
int i = 0;
for (BaseIndex baseIndex : bases)
if (baseIndex != BaseIndex.D)
readArray[i++] = baseIndex.getByte();
return readArray;
}
/**
* Builds the cigar string for the synthetic read
*
* Warning: if the synthetic read has leading deletions, it will shift the refStart (alignment start) of the read.
*
* @return the cigar string for the synthetic read
*/
private Cigar buildCigar() {
LinkedList<CigarElement> cigarElements = new LinkedList<CigarElement>();
CigarOperator cigarOperator = null;
int length = 0;
for (BaseIndex b : bases) {
CigarOperator op;
switch (b) {
case D:
op = CigarOperator.DELETION;
break;
case I:
throw new ReviewedStingException("Trying to create an insertion in a synthetic read. This operation is currently unsupported.");
default:
op = CigarOperator.MATCH_OR_MISMATCH;
break;
}
if (cigarOperator == null) {
if (op == CigarOperator.D) // read cannot start with a deletion
refStart++; // if it does, we need to move the reference start forward
else
cigarOperator = op;
}
else if (cigarOperator != op) { // if this is a new operator, we need to close the previous one
cigarElements.add(new CigarElement(length, cigarOperator)); // close previous operator
cigarOperator = op;
length = 0;
}
if (cigarOperator != null) // only increment the length of the cigar element if we really added it to the read (no leading deletions)
length++;
}
if (length > 0 && cigarOperator != CigarOperator.D) // read cannot end with a deletion
cigarElements.add(new CigarElement(length, cigarOperator)); // add the last cigar element
return new Cigar(cigarElements);
}
/**
 * Shared functionality for all conversion utilities
 *
 * Walks the bases and the per-base values in lockstep, keeping only the values whose
 * base is not a deletion. Note that {@code variable} is expected to hold one entry per
 * base, including the deleted ones, since the iterator advances on every base.
 *
 * @param bases the read bases
 * @param variable the list to convert
 * @return a converted variable given the bases and skipping deletions
 */
private static byte [] convertVariableGivenBases (List<BaseIndex> bases, List<Byte> variable) {
    final byte[] result = new byte[getReadLengthWithNoDeletions(bases)];
    final Iterator<Byte> valueIterator = variable.iterator();
    int outIndex = 0;
    for (final BaseIndex base : bases) {
        final byte value = valueIterator.next();
        if (base != BaseIndex.D) {
            result[outIndex++] = value;
        }
    }
    return result;
}
/**
 * Shared functionality for all conversion utilities
 *
 * Counts the bases that are not deletion markers (equivalently, the read length minus
 * the number of deletions).
 *
 * @param bases the read bases
 * @return the length of the read with no deletions
 */
private static int getReadLengthWithNoDeletions(List<BaseIndex> bases) {
    int length = 0;
    for (final BaseIndex base : bases) {
        if (base != BaseIndex.D) {
            length++;
        }
    }
    return length;
}
}

View File

@ -0,0 +1,69 @@
// our package
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
// the imports for unit testing.
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.List;
/**
 * Basic unit test for BaseCounts in reduced reads
 *
 * Each test case feeds a string of bases into a fresh BaseCounts and checks the total
 * count, the most common base, and that base's count.
 */
public class BaseCountsUnitTest extends BaseTest {

    /**
     * Immutable test case: a string of bases plus the expected most-common base and its count.
     *
     * Declared static so it does not hold a hidden reference to the enclosing test instance,
     * with final fields since a case is never mutated after construction.
     */
    private static class SingleTest {
        public final String bases;
        public final byte mostCountBase;
        public final int mostCommonCount;

        private SingleTest(String bases, char mostCountBase, int mostCommonCount) {
            this.mostCommonCount = mostCommonCount;
            this.mostCountBase = (byte) mostCountBase;
            this.bases = bases;
        }
    }

    @DataProvider(name = "data")
    public Object[][] createData1() {
        List<SingleTest> params = new ArrayList<SingleTest>();
        params.add(new SingleTest("A", 'A', 1));
        params.add(new SingleTest("AA", 'A', 2));
        params.add(new SingleTest("AC", 'A', 1));
        params.add(new SingleTest("AAC", 'A', 2));
        params.add(new SingleTest("AAA", 'A', 3));
        params.add(new SingleTest("AAAN", 'A', 3));
        params.add(new SingleTest("AAANNNN", 'N', 4));
        params.add(new SingleTest("AACTG", 'A', 2));
        params.add(new SingleTest("D", 'D', 1));
        params.add(new SingleTest("DDAAD", 'D', 3));
        params.add(new SingleTest("", (char) BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0)); // empty input: sentinel base, zero count
        params.add(new SingleTest("AAIIIAI", 'I', 4));

        // TestNG data providers require one Object[] row per test invocation
        List<Object[]> rows = new ArrayList<Object[]>(params.size());
        for (SingleTest x : params)
            rows.add(new Object[]{x});
        return rows.toArray(new Object[][]{});
    }

    @Test(dataProvider = "data", enabled = true)
    public void testCounting(SingleTest params) {
        BaseCounts counts = new BaseCounts();
        for (byte base : params.bases.getBytes())
            counts.incr(base);

        String name = String.format("Test-%s", params.bases);
        Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
        Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name);
        Assert.assertEquals((char) counts.baseWithMostCounts(), (char) params.mostCountBase, name);
    }
}

View File

@ -0,0 +1,68 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
 * Integration tests for the ReduceReads walker: each test runs the walker with a fixed
 * command line and compares the output against a known-good MD5.
 */
public class ReduceReadsIntegrationTest extends WalkerTest {
    final static String REF = b37KGReference;
    final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
    final String DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam";
    final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam";
    final String STASH_L = " -L 14:73718184-73718284 -L 14:73718294-73718330 -L 14:73718360-73718556";
    final String L = " -L 20:10,100,000-10,120,000 ";

    /**
     * Runs ReduceReads on the standard BAM with the given extra arguments and verifies
     * the output MD5.
     */
    private void RRTest(String testName, String args, String md5) {
        final String commandLine = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s " + args;
        executeTest(testName, new WalkerTestSpec(commandLine, Arrays.asList(md5)));
    }

    @Test(enabled = true)
    public void testDefaultCompression() {
        RRTest("testDefaultCompression ", L, "4c92d59d4a5292af1f968dc922c2c63e");
    }

    @Test(enabled = true)
    public void testMultipleIntervals() {
        final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
        RRTest("testMultipleIntervals ", intervals, "97d5c3fda5551741676793ba325ec7ed");
    }

    @Test(enabled = true)
    public void testHighCompression() {
        RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e6bc1cd0e9de961cf0fb1789bf6ab108");
    }

    @Test(enabled = true)
    public void testLowCompression() {
        RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "f33ec7cd0b98eebd73d1025ca656cd7e");
    }

    @Test(enabled = true)
    public void testIndelCompression() {
        RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "e6e2bc889e4f342a7fedc5d38b391d20");
    }

    @Test(enabled = true)
    public void testFilteredDeletionCompression() {
        final String commandLine = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
        executeTest("testFilteredDeletionCompression", new WalkerTestSpec(commandLine, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
    }

    /**
     * Bug reported by Adam where a read that got clipped before actually belongs 2 intervals ahead
     * and a subsequent tail leaves only this read in the stash. The next read to come in is in fact
     * before (alignment start) than this read, so the TreeSet breaks with a Key out of Range error
     * that was freaking hard to catch.
     *
     * This bam is simplified to replicate the exact bug with the three provided intervals.
     */
    @Test(enabled = true)
    public void testAddingReadAfterTailingTheStash() {
        final String commandLine = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
        executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(commandLine, Arrays.asList("022931f032a4122cfe41e58e74d0aede")));
    }
}

View File

@ -0,0 +1,85 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.Random;
/**
 * Unit test for SyntheticRead: verifies that per-base counts are compressed into the
 * delta-encoded byte representation (each count stored as an offset from the first
 * count, bounded to the byte range).
 */
public class SyntheticReadUnitTest extends BaseTest {
    final SAMFileHeader artificialSAMHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1);
    final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic");
    final String artificialContig = "1";
    final int artificialContigIndex = 0;
    final String artificialReadName = "synth";
    final int artificialRefStart = 1;
    final double artificialMappingQuality = 60;
    final Random random = new Random(8854875); // fixed seed for reproducibility

    @Test
    public void testBaseCounts() {
        BaseIndex [] bases = new BaseIndex[] {BaseIndex.A, BaseIndex.A, BaseIndex.A, BaseIndex.A};
        Byte[] quals = new Byte[] {20, 20, 20, 20};
        // expectedCounts[i] = counts[i] - counts[0], clamped to the byte range
        TestRead [] testReads = new TestRead [] {
                new TestRead(bases, quals, new Byte[] {100, 100, 100, 101}, new byte [] {100, 0, 0, 1}),
                new TestRead(bases, quals, new Byte[] {1, 100, 100, 0}, new byte [] {1, 99, 99, -1}),
                new TestRead(bases, quals, new Byte[] {127, 100, 0, 1}, new byte [] {127, -27, -127, -126}),
                new TestRead(bases, quals, new Byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})};

        for (TestRead testRead : testReads) {
            SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false);
            Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts());
        }
    }

    /**
     * Immutable fixture for one synthetic read: bases, qualities, raw counts, and the
     * expected compressed counts.
     *
     * Declared static so it does not hold a hidden reference to the enclosing test
     * instance, with final fields since a fixture is never mutated after construction.
     * Note: insertion and deletion qualities deliberately reuse the base quals array.
     */
    private static class TestRead {
        private final BaseIndex[] bases;
        private final Byte[] quals;
        private final Byte[] insQuals;
        private final Byte[] delQuals;
        private final Byte[] counts;
        private final byte [] expectedCounts;

        private TestRead(BaseIndex[] bases, Byte[] quals, Byte[] counts, byte[] expectedCounts) {
            this.bases = bases;
            this.quals = quals;
            this.insQuals = quals;
            this.delQuals = quals;
            this.counts = counts;
            this.expectedCounts = expectedCounts;
        }

        public BaseIndex[] getBases() {
            return bases;
        }

        public Byte[] getQuals() {
            return quals;
        }

        public Byte[] getInsQuals() {
            return insQuals;
        }

        public Byte[] getDelQuals() {
            return delQuals;
        }

        public Byte[] getCounts() {
            return counts;
        }

        public byte[] getExpectedCounts() {
            return expectedCounts;
        }
    }
}