diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index a47c61d0b..af856f3f9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -179,6 +179,11 @@ public class LocusIteratorByState extends LocusIterator { return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ); } + public CigarElement peekBackwardOnGenome() { + return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement ); + } + + public CigarOperator stepForwardOnGenome() { // we enter this method with readOffset = index of the last processed base on the read // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion @@ -401,24 +406,24 @@ public class LocusIteratorByState extends LocusIterator { while (iterator.hasNext()) { final SAMRecordState state = iterator.next(); - final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read - final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator - final int readOffset = state.getReadOffset(); // the base offset on this read - final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final int readOffset = state.getReadOffset(); // the base offset on this read + final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. final int eventLength = state.getEventLength(); - if (op == CigarOperator.N) // N's are never added to any pileup + if (op == CigarOperator.N) // N's are never added to any pileup continue; - if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref + if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref size++; ExtendedEventPileupElement pileupElement; - if (state.getEventBases() == null) { // Deletion event + if (state.getEventBases() == null) { // Deletion event nDeletions++; maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength()); pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength); } - else { // Insertion event + else { // Insertion event nInsertions++; pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases()); } @@ -442,10 +447,10 @@ public class LocusIteratorByState extends LocusIterator { if (indelPile.size() != 0) fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads)); } - hasExtendedEvents = false; // we are done with extended events prior to current ref base + hasExtendedEvents = false; // we are done with extended events prior to current ref base nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled); } - else { // this is a regular event pileup (not extended) + else { // this is a regular event pileup (not extended) GenomeLoc location = getLocation(); Map fullPileup = new HashMap(); boolean hasBeenSampled = false; @@ -454,27 +459,34 @@ public class LocusIteratorByState extends LocusIterator { List pile = new ArrayList(readStates.size(sample)); hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); - size = 0; // number of elements in this sample's pileup - nDeletions = 0; // number of deletions in this sample's pileup - nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) + size = 0; // number of elements in this sample's pileup + nDeletions = 0; // number of deletions in this sample's pileup + nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) while (iterator.hasNext()) { - final SAMRecordState state = iterator.next(); // state object with the read/offset information - final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read - final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator - final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element - final CigarOperator nextOp = nextElement.getOperator(); - final int readOffset = state.getReadOffset(); // the base offset on this read - + final SAMRecordState state = iterator.next(); // state object with the read/offset information + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element + final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element + final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator + final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator + final int readOffset = state.getReadOffset(); // the base offset on this read + + final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; + final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; + final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; + final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION; + final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()); + int nextElementLength = nextElement.getLength(); - if (op == CigarOperator.N) // N's are never added to any pileup + if (op == CigarOperator.N) // N's are never added to any pileup continue; if (op == CigarOperator.D) { - if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so - pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()), - null,nextOp == CigarOperator.D? nextElementLength:-1)); + if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so + pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1)); size++; nDeletions++; if (read.getMappingQuality() == 0) @@ -484,11 +496,10 @@ public class LocusIteratorByState extends LocusIterator { else { if (!filterBaseInRead(read, location.getStart())) { String insertedBaseString = null; - if (nextOp == CigarOperator.I) { + if (nextOp == CigarOperator.I) insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength())); - } - pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()), - insertedBaseString,nextElementLength)); + + pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); size++; if (read.getMappingQuality() == 0) nMQ0Reads++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java new file mode 100644 index 000000000..a30472ce8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -0,0 +1,284 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.BitSetUtils; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.LinkedList; +import java.util.List; + +/** + * This class provides all the functionality for the BitSet representation of the keys to the hash table of BQSR + * + * It also handles the event type "covariate" which is not exactly a covariate, but is added as a key to the hashmap. The Key Manager will + * add the event type as a bitset to the end of the covariate bitset key. This way, it won't get int the way of masking the information + * out of the key for the actual covariates, and having the covariates handle it. The key manager handles the event type. + * + * The keys represented by this key manager will always have the same order: + * + * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate1, OptionalCovariateID, EventType + * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate2, OptionalCovariateID, EventType + * ... + * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariateN, OptionalCovariateID, EventType + * + * + * Note that Optional Covariates are optional, and the Key Manager should operate without them if necessary. + * + * @author Mauricio Carneiro + * @since 3/6/12 + */ +public class BQSRKeyManager { + private List requiredCovariates; + private List optionalCovariates; + + private int nRequiredBits; // Number of bits used to represent the required covariates + private int nOptionalBits; // Number of bits used to represent the standard covaraites + private int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs + private int totalNumberOfBits; // Sum of all of the above plus the event bits + + private BitSet optionalCovariateMask; // Standard mask for optional covariates bitset + private BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset + + /** + * Initializes the KeyManager with the total number of covariates to use + * + * @param requiredCovariates the ordered list of required covariates + * @param optionalCovariates the ordered list of optional covariates + */ + public BQSRKeyManager(List requiredCovariates, List optionalCovariates) { + this.requiredCovariates = new ArrayList(requiredCovariates.size()); // initialize the required covariates list + this.optionalCovariates = new ArrayList(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay) + + nRequiredBits = 0; + for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management + int nBits = required.numberOfBits(); // number of bits used by this covariate + BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate + this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate + nRequiredBits += nBits; + } + + short i = 0; + nOptionalBits = 0; + for (Covariate optional : optionalCovariates) { + int nBits = optional.numberOfBits(); // number of bits used by this covariate + nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate + BitSet optionalID = BitSetUtils.bitSetFrom(i); // calculate the optional covariate ID for this covariate + this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object + i++; + } + + nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID + optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset + optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset + totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key + } + + /** + * Generates one key per optional covariate. + * + * Keys include all required covariates, the standard covariate and the event type. + * + * Example allKeys: + * RG, QUAL, CYCLE, CONTEXT + * + * List of BitSets returned by this example (given eventType): + * RG, QUAL, CYCLE, EVENT + * RG, QUAL, CONTEXT, EVENT + * + * Note: If there are no optional covariates, only one bitset key will be returned with all the required covariates and the event type + * + * @param allKeys The keys in bitset representation for each covariate + * @param eventType The type of event described by this keyset (e.g. mismatches, insertions, deletions) + * @return one key in bitset representation per covariate + */ + public List bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) { + List allBitSets = new LinkedList(); // Generate one key per optional covariate + + BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type + int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits + + int covariateIndex = 0; + BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on + for (RequiredCovariateInfo infoRequired : requiredCovariates) + addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set + + for (OptionalCovariateInfo infoOptional : optionalCovariates) { + BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys + if (covariateKey == null) + continue; // do not add nulls to the final set of keys. + + BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate + optionalKey.or(requiredKey); // import all the required covariates + addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates + addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite + addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type + allBitSets.add(optionalKey); // add this key to the list of keys + } + + if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key) + addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type + allBitSets.add(requiredKey); // add this key to the list of keys + } + + return allBitSets; + } + + /** + * Generates one bitset key for the covariates represented in Object[] key + * + * The covariates will have the actual objects produced by the covariates (probably read from the recalibration data file) + * and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one bitset key, not many. + * + * Example key: + * RG, QUAL, CYCLE, CYCLE_ID, EventType + * + * @param key list of objects produced by the required covariates followed by one or zero optional covariates. + * @return a bitset key representing these objects. Bitset encryption is done using the covariate's interface. + */ + public BitSet bitSetFromKey(Object[] key) { + BitSet bitSetKey = new BitSet(totalNumberOfBits); + + int requiredCovariate = 0; + for (RequiredCovariateInfo infoRequired : requiredCovariates) { + BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface + addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key + } + + if (optionalCovariates.size() > 0) { + int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array + int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's + int covariateID = (Short) key[covariateIDIndex]; // get the optional covariate id + OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information + + BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface + addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates + addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite + } + + int eventIndex = key.length - 1; // the event type is always the last key + int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits + BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type + addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type + + return bitSetKey; + } + + + /** + * Generates a key set of objects from a combined bitset key. + * + * Masks out each covariate independently and decodes their values (Object) into a keyset + * + * @param key the bitset representation of the keys + * @return an object array with the values for each key + */ + public List keySetFrom(BitSet key) { + List objectKeys = new ArrayList(); + for (RequiredCovariateInfo info : requiredCovariates) { + BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset + objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface + } + + if (optionalCovariates.size() > 0) { + BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set + BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits);// mask out the covariate order (to identify which covariate this is) + short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short + Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object + objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set + objectKeys.add(id); // add the covariate id + } + objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set + + return objectKeys; + } + + /** + * Translates a masked bitset into a bitset starting at 0 + * + * @param key the masked out bitset + * @param n the number of bits to chop + * @return a translated bitset starting at 0 for the covariate machinery to decode + */ + private BitSet chopNBitsFrom(BitSet key, int n) { + BitSet choppedKey = new BitSet(); + for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1)) + choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet + return choppedKey; + } + + /** + * Creates a mask for the requested covariate to extract the relevant bitset from a combined bitset key + * + * @param leadingBits the index of the covariate in the ordered covariate list + * @param nBits the number of bits needed by the Covariate to represent its values in BitSet form + * @return the bitset relevant to the covariate + */ + + private BitSet genericMask(int leadingBits, int nBits) { + BitSet mask = new BitSet(leadingBits + nBits); + mask.set(leadingBits, leadingBits + nBits); + return mask; + } + + /** + * Decodes the event type (enum) from the full bitset key + * + * @param fullKey the full key of all covariates + event type + * @return the decoded event type. + */ + private EventType eventFromBitSet(BitSet fullKey) { + BitSet eventKey = new BitSet(); + int firstBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; + for (int i = fullKey.nextSetBit(firstBitIndex); i >= 0; i = fullKey.nextSetBit(i + 1)) + eventKey.set(i - firstBitIndex); + return EventType.eventFrom(BitSetUtils.shortFrom(eventKey)); + } + + private BitSet bitSetFromEvent(EventType eventType) { + return BitSetUtils.bitSetFrom(eventType.index); + } + + private int bitsInEventType() { + return BitSetUtils.numberOfBitsToRepresent(EventType.values().length); + } + + private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) { + for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1)) + key.set(j + location); // translate the bits set in the key to their corresponding position in the full key + } + + private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) { + BitSet bitSet = (BitSet) key.clone(); + bitSet.and(mask); + return chopNBitsFrom(bitSet, leadingBits); + } + + + /** + * Aggregate information for each Covariate + */ + class RequiredCovariateInfo { + public int bitsBefore; // number of bits before this covariate in the combined bitset key + public int nBits; // number of bits used by this covariate (cached access to covariate.nBits()) + public BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits ) + public Covariate covariate; // this allows reverse lookup of the Covariates in order + + RequiredCovariateInfo(int bitsBefore, int nBits, BitSet mask, Covariate covariate) { + this.bitsBefore = bitsBefore; + this.nBits = nBits; + this.mask = mask; + this.covariate = covariate; + } + } + + class OptionalCovariateInfo { + public BitSet covariateID; // cache the covariate ID + public Covariate covariate; + + OptionalCovariateInfo(BitSet covariateID, Covariate covariate) { + this.covariateID = covariateID; + this.covariate = covariate; + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index acbe69248..69461ed0e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BitSetUtils; +import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -46,7 +48,9 @@ public class ContextCovariate implements StandardCovariate { private int deletionsContextSize; private final BitSet NO_CONTEXT_BITSET = BitSetUtils.bitSetFrom(-1L); - protected final String NO_CONTEXT_VALUE = "N"; // protected so we can UNIT TEST it +// protected final String NO_CONTEXT_VALUE = "N"; // protected so we can UNIT TEST it + + private byte LOW_QUAL_TAIL; // Initialize any member variables using the command-line arguments passed to the walkers @Override @@ -55,18 +59,22 @@ public class ContextCovariate implements StandardCovariate { insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE; deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE; + LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL; + if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0) throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize)); } @Override - public CovariateValues getValues(final GATKSAMRecord read) { + public CovariateValues getValues(GATKSAMRecord read) { int l = read.getReadLength(); BitSet[] mismatches = new BitSet[l]; BitSet[] insertions = new BitSet[l]; BitSet[] deletions = new BitSet[l]; + read = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context + final boolean negativeStrand = read.getReadNegativeStrandFlag(); byte[] bases = read.getReadBases(); if (negativeStrand) @@ -94,11 +102,17 @@ public class ContextCovariate implements StandardCovariate { @Override public String keyFromBitSet(BitSet key) { - if (key.equals(NO_CONTEXT_BITSET)) - return NO_CONTEXT_VALUE; + if (key == null) // this can only happen in test routines because we do not propagate null keys to the csv file + return null; + return BitSetUtils.dnaFrom(key); } + @Override + public BitSet bitSetFromKey(Object key) { + return BitSetUtils.bitSetFrom((String) key); + } + @Override public int numberOfBits() { return Long.bitCount(-1L); @@ -113,7 +127,7 @@ public class ContextCovariate implements StandardCovariate { * @return the bitSet representing the Context */ private BitSet contextWith(byte[] bases, int offset, int contextSize) { - BitSet result = NO_CONTEXT_BITSET; + BitSet result = null; if (offset >= contextSize) { String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); if (!context.contains("N")) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java index 341b9e7af..6b872a50c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java @@ -71,6 +71,17 @@ public interface Covariate { */ public String keyFromBitSet(BitSet key); + /** + * Converts a key into a bitset + * + * Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in bitset format. For counting covariates + * the getValues method already returns all values in BitSet format. + * + * @param key the object corresponding to the covariate + * @return a bitset representation of the object + */ + public BitSet bitSetFromKey(Object key); + /** * Each covariate should determine how many bits are necessary to encode it's data * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java deleted file mode 100644 index 19a8aab07..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java +++ /dev/null @@ -1,108 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.BitSet; -import java.util.HashMap; - -/** - * The object temporarily held by a read that describes all of it's covariates. - * - * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap - * - * @author Mauricio Carneiro - * @since 2/8/12 - */ -public class CovariateKeySet { - private BitSet[][] mismatchesKeySet; - private BitSet[][] insertionsKeySet; - private BitSet[][] deletionsKeySet; - - private int nextCovariateIndex; - - // private static String mismatchesCovariateName = "M"; - // private static String insertionsCovariateName = "I"; - // private static String deletionsCovariateName = "D"; - // - // private static BitSet mismatchesCovariateBitSet = BitSetUtils.bitSetFrom(0); - // private static BitSet insertionsCovariateBitSet = BitSetUtils.bitSetFrom(1); - // private static BitSet deletionsCovariateBitSet = BitSetUtils.bitSetFrom(2); - - private static HashMap nameToType = new HashMap(); - private static HashMap bitSetToName = new HashMap(); - - public CovariateKeySet(int readLength, int numberOfCovariates) { - // numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format) - this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates]; - this.insertionsKeySet = new BitSet[readLength][numberOfCovariates]; - this.deletionsKeySet = new BitSet[readLength][numberOfCovariates]; - // initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateBitSet); - // initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateBitSet); - // initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateBitSet); - this.nextCovariateIndex = 0; - - // nameToType.put(mismatchesCovariateName, RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION); - // nameToType.put(insertionsCovariateName, RecalDataManager.BaseRecalibrationType.BASE_INSERTION); - // nameToType.put(deletionsCovariateName, RecalDataManager.BaseRecalibrationType.BASE_DELETION); - // - // bitSetToName.put(BitSetUtils.bitSetFrom(0), mismatchesCovariateName); - // bitSetToName.put(BitSetUtils.bitSetFrom(1), insertionsCovariateName); - // bitSetToName.put(BitSetUtils.bitSetFrom(2), deletionsCovariateName); - } - - public void addCovariate(CovariateValues covariate) { - transposeCovariateValues(mismatchesKeySet, covariate.getMismatches()); - transposeCovariateValues(insertionsKeySet, covariate.getInsertions()); - transposeCovariateValues(deletionsKeySet, covariate.getDeletions()); - nextCovariateIndex++; - } - - public static RecalDataManager.BaseRecalibrationType errorModelFrom(final String modelString) { - if (!nameToType.containsKey(modelString)) - throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString); - return nameToType.get(modelString); - } - - public static String eventNameFrom(final BitSet bitSet) { - if (!bitSetToName.containsKey(bitSet)) - throw new ReviewedStingException("Unrecognized Event Type BitSet: " + bitSet); - return bitSetToName.get(bitSet); - } - - public BitSet[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) { - switch (errorModel) { - case BASE_SUBSTITUTION: - return getMismatchesKeySet(readPosition); - case BASE_INSERTION: - return getInsertionsKeySet(readPosition); - case BASE_DELETION: - return getDeletionsKeySet(readPosition); - default: - throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel); - } - } - - public BitSet[] getMismatchesKeySet(int readPosition) { - return mismatchesKeySet[readPosition]; - } - - public BitSet[] getInsertionsKeySet(int readPosition) { - return insertionsKeySet[readPosition]; - } - - public BitSet[] getDeletionsKeySet(int readPosition) { - return deletionsKeySet[readPosition]; - } - - private void transposeCovariateValues(BitSet[][] keySet, BitSet[] covariateValues) { - for (int i = 0; i < covariateValues.length; i++) - keySet[i][nextCovariateIndex] = covariateValues[i]; - } - - private void initializeCovariateKeySet(BitSet[][] keySet, BitSet covariateName) { - int readLength = keySet.length; - int lastCovariateIndex = keySet[0].length - 1; - for (int i = 0; i < readLength; i++) - keySet[i][lastCovariateIndex] = covariateName; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java index 919a2fa79..3f3bc5040 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java @@ -69,35 +69,12 @@ public class CycleCovariate implements StandardCovariate { final short init; final short increment; if (!read.getReadNegativeStrandFlag()) { - // Differentiate between first and second of pair. - // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group - // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. - // Therefore the cycle covariate must differentiate between first and second of pair reads. - // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because - // the current sequential model would consider the effects independently instead of jointly. - if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) { - //second of pair, positive strand - init = -1; - increment = -1; - } - else { - //first of pair, positive strand - init = 1; - increment = 1; - } - + init = 1; + increment = 1; } else { - if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) { - //second of pair, negative strand - init = (short) -read.getReadLength(); - increment = 1; - } - else { - //first of pair, negative strand - init = (short) read.getReadLength(); - increment = -1; - } + init = (short) read.getReadLength(); + increment = -1; } short cycle = init; @@ -121,7 +98,7 @@ public class CycleCovariate implements StandardCovariate { // the current sequential model would consider the effects independently instead of jointly. final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); - short cycle = multiplyByNegative1 ? (short) -1 : 1; + short cycle = multiplyByNegative1 ? (short) -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change // For example, AAAAAAA was probably read in two flow cycles but here we count it as one @@ -201,7 +178,7 @@ public class CycleCovariate implements StandardCovariate { // Used to get the covariate's value from input csv file during on-the-fly recalibration @Override public final Object getValue(final String str) { - return Integer.parseInt(str); + return Short.parseShort(str); } @Override @@ -209,6 +186,11 @@ public class CycleCovariate implements StandardCovariate { return String.format("%d", BitSetUtils.shortFrom(key)); } + @Override + public BitSet bitSetFromKey(Object key) { + return BitSetUtils.bitSetFrom((Short) key); + } + @Override public int numberOfBits() { return BitSetUtils.numberOfBitsToRepresent(2 * Short.MAX_VALUE); // positive and negative diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java new file mode 100644 index 000000000..4c53dcca5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java @@ -0,0 +1,43 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +public enum EventType { + BASE_SUBSTITUTION(0, "M"), + BASE_INSERTION(1, "I"), + BASE_DELETION(2, "D"); + + public int index; + public String representation; + + private EventType(int index, String representation) { + this.index = index; + this.representation = representation; + } + + public static EventType eventFrom(int index) { + switch (index) { + case 0: + return BASE_SUBSTITUTION; + case 1: + return BASE_INSERTION; + case 2: + return BASE_DELETION; + default: + throw new ReviewedStingException(String.format("Event %d does not exist.", index)); + } + } + + public static EventType eventFrom(String event) { + for (EventType eventType : EventType.values()) + if (eventType.representation.equals(event)) + return eventType; + + throw new ReviewedStingException(String.format("Event %s does not exist.", event)); + } + + @Override + public String toString() { + return representation; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java index 4f92b7fbc..cd2253e1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.BitSetUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.BitSet; @@ -40,8 +41,6 @@ import java.util.BitSet; public class QualityScoreCovariate implements RequiredCovariate { - private final int MAX_QUAL = 50; - // Initialize any member variables using the command-line arguments passed to the walkers @Override public void initialize(final RecalibrationArgumentCollection RAC) { @@ -71,7 +70,7 @@ public class QualityScoreCovariate implements RequiredCovariate { // Used to get the covariate's value from input csv file during on-the-fly recalibration @Override public final Object getValue(final String str) { - return Integer.parseInt(str); + return Byte.parseByte(str); } @Override @@ -79,8 +78,13 @@ public class QualityScoreCovariate implements RequiredCovariate { return String.format("%d", BitSetUtils.longFrom(key)); } + @Override + public BitSet bitSetFromKey(Object key) { + return BitSetUtils.bitSetFrom((Byte) key); + } + @Override public int numberOfBits() { - return BitSetUtils.numberOfBitsToRepresent(MAX_QUAL); + return BitSetUtils.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java new file mode 100644 index 000000000..f87986b47 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.BitSet; + +/** + * The object temporarily held by a read that describes all of it's covariates. + * + * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap + * + * @author Mauricio Carneiro + * @since 2/8/12 + */ +public class ReadCovariates { + private BitSet[][] mismatchesKeySet; + private BitSet[][] insertionsKeySet; + private BitSet[][] deletionsKeySet; + + private int nextCovariateIndex; + + public ReadCovariates(int readLength, int numberOfCovariates) { + this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates]; + this.insertionsKeySet = new BitSet[readLength][numberOfCovariates]; + this.deletionsKeySet = new BitSet[readLength][numberOfCovariates]; + this.nextCovariateIndex = 0; + } + + public void addCovariate(CovariateValues covariate) { + transposeCovariateValues(mismatchesKeySet, covariate.getMismatches()); + transposeCovariateValues(insertionsKeySet, covariate.getInsertions()); + transposeCovariateValues(deletionsKeySet, covariate.getDeletions()); + nextCovariateIndex++; + } + + public BitSet[] getKeySet(final int readPosition, final EventType errorModel) { + switch (errorModel) { + case BASE_SUBSTITUTION: + return getMismatchesKeySet(readPosition); + case BASE_INSERTION: + return getInsertionsKeySet(readPosition); + case BASE_DELETION: + return getDeletionsKeySet(readPosition); + default: + throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel); + } + } + + public BitSet[] getMismatchesKeySet(int readPosition) { + return mismatchesKeySet[readPosition]; + } + + public BitSet[] getInsertionsKeySet(int readPosition) { + return insertionsKeySet[readPosition]; + } + + public BitSet[] getDeletionsKeySet(int readPosition) { + return deletionsKeySet[readPosition]; + } + + private void transposeCovariateValues(BitSet[][] keySet, BitSet[] covariateValues) { + for (int i = 0; i < covariateValues.length; i++) + keySet[i][nextCovariateIndex] = covariateValues[i]; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java index b05717791..ad4f94f33 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java @@ -55,16 +55,7 @@ public class ReadGroupCovariate implements RequiredCovariate { public CovariateValues getValues(final GATKSAMRecord read) { final int l = read.getReadLength(); final String readGroupId = read.getReadGroup().getReadGroupId(); - short shortId; - if (readGroupLookupTable.containsKey(readGroupId)) - shortId = readGroupLookupTable.get(readGroupId); - else { - shortId = nextId; - readGroupLookupTable.put(readGroupId, nextId); - readGroupReverseLookupTable.put(nextId, readGroupId); - nextId++; - } - BitSet rg = BitSetUtils.bitSetFrom(shortId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset + BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset BitSet[] readGroups = new BitSet[l]; Arrays.fill(readGroups, rg); return new CovariateValues(readGroups, readGroups, readGroups); @@ -81,6 +72,11 @@ public class ReadGroupCovariate implements RequiredCovariate { return decodeReadGroup((short) BitSetUtils.longFrom(key)); } + @Override + public BitSet bitSetFromKey(Object key) { + return bitSetForReadGroup((String) key); + } + public final String decodeReadGroup(final short id) { return readGroupReverseLookupTable.get(id); } @@ -89,6 +85,19 @@ public class ReadGroupCovariate implements RequiredCovariate { public int numberOfBits() { return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE); } + + private BitSet bitSetForReadGroup(String readGroupId) { + short shortId; + if (readGroupLookupTable.containsKey(readGroupId)) + shortId = readGroupLookupTable.get(readGroupId); + else { + shortId = nextId; + readGroupLookupTable.put(readGroupId, nextId); + readGroupReverseLookupTable.put(nextId, readGroupId); + nextId++; + } + return BitSetUtils.bitSetFrom(shortId); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index 47284b098..5d1adaf40 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -53,50 +53,18 @@ import java.util.Map; */ public class RecalDataManager { - public final NestedHashMap nestedHashMap; // The full dataset - private final HashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed - private final HashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed - private final HashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed + public final NestedHashMap nestedHashMap; // The full dataset + private final HashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed + private final HashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed + private final HashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores - public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams - public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams - public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores + public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams + public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color private static boolean warnUserNullPlatform = false; - private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ - - public enum BaseRecalibrationType { - BASE_SUBSTITUTION(0, "M"), - BASE_INSERTION(1, "I"), - BASE_DELETION(2, "D"); - - public int index; - public String representation; - - private BaseRecalibrationType(int index, String representation) { - this.index = index; - this.representation = representation; - } - - public static BaseRecalibrationType eventFrom(int index) { - switch (index) { - case 0: - return BASE_SUBSTITUTION; - case 1: - return BASE_INSERTION; - case 2: - return BASE_DELETION; - default: - throw new ReviewedStingException(String.format("Event %d does not exist.", index)); - } - } - - @Override - public String toString() { - return representation; - } - } + private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ public enum SOLID_RECAL_MODE { /** @@ -142,10 +110,10 @@ public class RecalDataManager { public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) { if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration nestedHashMap = null; - dataCollapsedReadGroup = new HashMap(); - dataCollapsedQualityScore = new HashMap(); - dataCollapsedByCovariate = new HashMap>(); - for (final BaseRecalibrationType errorModel : BaseRecalibrationType.values()) { + dataCollapsedReadGroup = new HashMap(); + dataCollapsedQualityScore = new HashMap(); + dataCollapsedByCovariate = new HashMap>(); + for (final EventType errorModel : EventType.values()) { dataCollapsedReadGroup.put(errorModel, new NestedHashMap()); dataCollapsedQualityScore.put(errorModel, new NestedHashMap()); dataCollapsedByCovariate.put(errorModel, new ArrayList()); @@ -162,100 +130,10 @@ public class RecalDataManager { } } - public static CovariateKeySet covariateKeySetFrom(GATKSAMRecord read) { - return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE); + public static ReadCovariates covariateKeySetFrom(GATKSAMRecord read) { + return (ReadCovariates) read.getTemporaryAttribute(COVARS_ATTRIBUTE); } - /** - * Add the given mapping to all of the collapsed hash tables - * - * @param key The list of comparables that is the key for this mapping - * @param fullDatum The RecalDatum which is the data for this mapping - * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table - */ - public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel) { - - // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around - //data.put(key, thisDatum); // add the mapping to the main table - - final int qualityScore = Integer.parseInt(key[1].toString()); - final Object[] readGroupCollapsedKey = new Object[1]; - final Object[] qualityScoreCollapsedKey = new Object[2]; - final Object[] covariateCollapsedKey = new Object[3]; - RecalDatum collapsedDatum; - - // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed - if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) { - readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group - collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey); - if (collapsedDatum == null) { - dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey); - } - else { - collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported - } - } - - // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed - qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ... - qualityScoreCollapsedKey[1] = key[1]; // and quality score - collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey); - if (collapsedDatum == null) { - dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey); - } - else { - collapsedDatum.increment(fullDatum); - } - - // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for (int iii = 0; iii < dataCollapsedByCovariate.get(errorModel).size(); iii++) { - covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ... - covariateCollapsedKey[1] = key[1]; // and quality score ... - final Object theCovariateElement = key[iii + 2]; // and the given covariate - if (theCovariateElement != null) { - covariateCollapsedKey[2] = theCovariateElement; - collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey); - if (collapsedDatum == null) { - dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey); - } - else { - collapsedDatum.increment(fullDatum); - } - } - } - } - - /** - * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score - * that will be used in the sequential calculation in TableRecalibrationWalker - * - * @param smoothing The smoothing parameter that goes into empirical quality score calculation - * @param maxQual At which value to cap the quality scores - */ - public final void generateEmpiricalQualities(final int smoothing, final int maxQual) { - - for (final BaseRecalibrationType errorModel : BaseRecalibrationType.values()) { - recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual); - recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual); - for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) { - recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); - checkForSingletons(map.data); - } - } - } - - private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) { - - for (Object comp : data.keySet()) { - final Object val = data.get(comp); - if (val instanceof RecalDatum) { // We are at the end of the nested hash maps - ((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual); - } - else { // Another layer in the nested hash map - recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual); - } - } - } private void checkForSingletons(final Map data) { // todo -- this looks like it's better just as a data.valueSet() call? @@ -279,7 +157,7 @@ public class RecalDataManager { * @param covariate Which covariate indexes the desired collapsed HashMap * @return The desired collapsed HashMap */ - public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) { + public final NestedHashMap getCollapsedTable(final int covariate, final EventType errorModel) { if (covariate == 0) { return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed } @@ -652,13 +530,13 @@ public class RecalDataManager { public static void computeCovariates(final GATKSAMRecord read, final List requestedCovariates) { final int numRequestedCovariates = requestedCovariates.size(); final int readLength = read.getReadLength(); - final CovariateKeySet covariateKeySet = new CovariateKeySet(readLength, numRequestedCovariates); + final ReadCovariates readCovariates = new ReadCovariates(readLength, numRequestedCovariates); // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read for (Covariate covariate : requestedCovariates) - covariateKeySet.addCovariate(covariate.getValues(read)); + readCovariates.addCovariate(covariate.getValues(read)); - read.setTemporaryAttribute(COVARS_ATTRIBUTE, covariateKeySet); + read.setTemporaryAttribute(COVARS_ATTRIBUTE, readCovariates); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index 7ef402083..ab173e4fb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -142,6 +142,11 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) public byte DELETIONS_DEFAULT_QUALITY = 45; + @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) + public byte LOW_QUAL_TAIL = 2; + + + @Hidden @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; @@ -149,4 +154,5 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index dd21681f0..b2f73c396 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -30,7 +30,10 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; @@ -203,7 +206,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public class BAQedPileupElement extends PileupElement { public BAQedPileupElement( final PileupElement PE ) { - super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip()); + super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isAfterDeletion(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip()); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 62a67a1f2..2e3978ddb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -4,7 +4,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; +import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -320,8 +320,8 @@ public class ClippingOp { byte[] newBaseDeletionQuals = new byte[newLength]; System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength); System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength); - hardClippedRead.setBaseQualities(newBaseInsertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION); - hardClippedRead.setBaseQualities(newBaseDeletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION); + hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION); + hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION); } return hardClippedRead; diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 1eab43256..9e7ee9dac 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -231,15 +231,16 @@ public class ReadClipper { /** - * Hard clips any contiguous tail (left, right or both) with base quality lower than lowQual. + * Clips any contiguous tail (left, right or both) with base quality lower than lowQual using the desired algorithm. * * This function will look for low quality tails and hard clip them away. A low quality tail * ends when a base has base quality greater than lowQual. * + * @param algorithm the algorithm to use (HardClip, SoftClip, Write N's,...) * @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped * @return a new read without low quality tails */ - private GATKSAMRecord hardClipLowQualEnds(byte lowQual) { + private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) { if (read.isEmpty()) return read; @@ -254,7 +255,6 @@ public class ReadClipper { // if the entire read should be clipped, then return an empty read. if (leftClipIndex > rightClipIndex) return GATKSAMRecord.emptyRead(read); -// return (new GATKSAMRecord(read.getHeader())); if (rightClipIndex < read.getReadLength() - 1) { this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1)); @@ -262,11 +262,18 @@ public class ReadClipper { if (leftClipIndex > 0 ) { this.addOp(new ClippingOp(0, leftClipIndex - 1)); } - return this.clipRead(ClippingRepresentation.HARDCLIP_BASES); + return this.clipRead(algorithm); + } + + private GATKSAMRecord hardClipLowQualEnds(byte lowQual) { + return this.clipLowQualEnds(ClippingRepresentation.HARDCLIP_BASES, lowQual); } public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) { return (new ReadClipper(read)).hardClipLowQualEnds(lowQual); } + public static GATKSAMRecord clipLowQualEnds(GATKSAMRecord read, byte lowQual, ClippingRepresentation algorithm) { + return (new ReadClipper(read)).clipLowQualEnds(algorithm, lowQual); + } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index eea45567f..858f7a2ae 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -4,7 +4,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; +import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -203,8 +203,8 @@ public class FragmentUtils { insertionQuals[iii] = secondReadInsertionQuals[iii-firstReadStop]; deletionQuals[iii] = secondReadDeletionQuals[iii-firstReadStop]; } - returnRead.setBaseQualities( insertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION ); - returnRead.setBaseQualities( deletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION ); + returnRead.setBaseQualities( insertionQuals, EventType.BASE_INSERTION ); + returnRead.setBaseQualities( deletionQuals, EventType.BASE_DELETION ); } final ArrayList returnList = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 7c2a67aba..c8f00778f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -177,7 +177,7 @@ public abstract class AbstractReadBackedPileup pileup = new UnifiedPileupElementTracker(); for (GATKSAMRecord read : reads) { - pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important + pileup.add(createNewPileupElement(read, offset, false, false, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important } return pileup; @@ -204,8 +204,8 @@ public abstract class AbstractReadBackedPileup createNewPileup(GenomeLoc loc, PileupElementTracker pileupElementTracker); - protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip); - protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip, String nextEventBases, int nextEventLength ); + protected abstract PE createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip); + protected abstract PE createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength ); // -------------------------------------------------------- // diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java index 8df0aa0b8..8e63fb0b1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java @@ -48,7 +48,7 @@ public class ExtendedEventPileupElement extends PileupElement { public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) { - super(read, offset, type == Type.DELETION, false, false, false,null,-1); // extended events are slated for removal + super(read, offset, type == Type.DELETION, false, false, false, false, false, null, -1); // extended events are slated for removal this.read = read; this.offset = offset; this.eventLength = eventLength; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 9dbfc52f3..2eb81b394 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -21,15 +21,17 @@ public class PileupElement implements Comparable { public static final byte T_FOLLOWED_BY_INSERTION_BASE = (byte) 89; public static final byte G_FOLLOWED_BY_INSERTION_BASE = (byte) 90; - protected final GATKSAMRecord read; - protected final int offset; - protected final boolean isDeletion; - protected final boolean isBeforeDeletion; - protected final boolean isBeforeInsertion; - protected final boolean isNextToSoftClip; - protected final int eventLength; - protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases - // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases + protected final GATKSAMRecord read; // the read this base belongs to + protected final int offset; // the offset in the bases array for this base + protected final boolean isDeletion; // is this base a deletion + protected final boolean isBeforeDeletion; // is the base to the right of this base an deletion + protected final boolean isAfterDeletion; // is the base to the left of this base a deletion + protected final boolean isBeforeInsertion; // is the base to the right of this base an insertion + protected final boolean isAfterInsertion; // is the base to the left of this base an insertion + protected final boolean isNextToSoftClip; // is this base either before or after a soft clipped base + protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base + protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases + /** @@ -39,7 +41,9 @@ public class PileupElement implements Comparable { * @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions) * @param isDeletion whether or not this base is a deletion * @param isBeforeDeletion whether or not this base is before a deletion + * @param isAfterDeletion whether or not this base is after a deletion * @param isBeforeInsertion whether or not this base is before an insertion + * @param isAfterInsertion whether or not this base is after an insertion * @param isNextToSoftClip whether or not this base is next to a soft clipped base * @param nextEventBases bases in event in case element comes before insertion or deletion * @param nextEventLength length of next event in case it's insertion or deletion @@ -48,8 +52,7 @@ public class PileupElement implements Comparable { "read != null", "offset >= -1", "offset <= read.getReadLength()"}) - public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip, - final String nextEventBases, final int nextEventLength) { + public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength) { if (offset < 0 && isDeletion) throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset"); @@ -57,20 +60,22 @@ public class PileupElement implements Comparable { this.offset = offset; this.isDeletion = isDeletion; this.isBeforeDeletion = isBeforeDeletion; + this.isAfterDeletion = isAfterDeletion; this.isBeforeInsertion = isBeforeInsertion; + this.isAfterInsertion = isAfterInsertion; this.isNextToSoftClip = isNextToSoftClip; if (isBeforeInsertion) eventBases = nextEventBases; else - eventBases = null; // ignore argument in any other case + eventBases = null; // ignore argument in any other case if (isBeforeDeletion || isBeforeInsertion) eventLength = nextEventLength; else eventLength = -1; } - public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) { - this(read,offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, null, -1); + public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) { + this(read,offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, -1); } public boolean isDeletion() { return isDeletion; @@ -80,10 +85,18 @@ public class PileupElement implements Comparable { return isBeforeDeletion; } + public boolean isAfterDeletion() { + return isAfterDeletion; + } + public boolean isBeforeInsertion() { return isBeforeInsertion; } + public boolean isAfterInsertion() { + return isAfterInsertion; + } + public boolean isNextToSoftClip() { return isNextToSoftClip; } @@ -123,14 +136,14 @@ public class PileupElement implements Comparable { } /** - * Returns length of the event (number of inserted or deleted bases + * @return length of the event (number of inserted or deleted bases */ public int getEventLength() { return eventLength; } /** - * Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. + * @return actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. */ public String getEventBases() { return eventBases; @@ -185,13 +198,9 @@ public class PileupElement implements Comparable { // // -------------------------------------------------------------------------- -// public boolean isReducedRead() { -// return read.isReducedRead(); -// } - /** * Returns the number of elements in the pileup element. - *

+ * * Unless this is a reduced read, the number of elements in a pileup element is one. In the event of * this being a reduced read and a deletion, we return the average number of elements between the left * and right elements to the deletion. We assume the deletion to be left aligned. diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index e547534dd..9d1e8ab62 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -96,12 +96,11 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< } @Override - protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) { + protected ExtendedEventPileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) { throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); } @Override - protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, - boolean isNextToSoftClip,String nextEventBases, int nextEventLength) { + protected ExtendedEventPileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength ) { throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index 759d64b2f..a11bc97c5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -56,6 +56,9 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup pileup, int size, int nDeletions, int nMQ0Reads) { super(loc, pileup, size, nDeletions, nMQ0Reads); @@ -71,13 +74,14 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation - public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + private ArrayList> collapsedHashes = new ArrayList> (); // All the collapsed data tables + + private final ArrayList requestedCovariates = new ArrayList(); // List of all covariates to be used in this calculation + private final ArrayList requiredCovariates = new ArrayList(); // List of required covariates to be used in this calculation + private final ArrayList optionalCovariates = new ArrayList(); // List of optional covariates to be used in this calculation + + public static final Pattern REQUIRED_COVARIATE_PATTERN = Pattern.compile("^# Required Covariates.*"); + public static final Pattern OPTIONAL_COVARIATE_PATTERN = Pattern.compile("^# Optional Covariates.*"); public static final String EOF_MARKER = "EOF"; - private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here? - private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(...) for all sets of covariate values. + + private static final byte SMOOTHING_CONSTANT = 1; + + ArrayList keyManagers = new ArrayList(); public BaseRecalibration(final File RECAL_FILE) { // Get a list of all available covariates final List> classes = new PluginManager(Covariate.class).getPlugins(); + RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // todo -- initialize with the parameters from the csv file! int lineNumber = 0; - boolean foundAllCovariates = false; + + boolean foundRequiredCovariates = false; + boolean foundOptionalCovariates = false; + boolean initializedKeyManagers = false; // Read in the data from the csv file and populate the data map and covariates list boolean sawEOF = false; try { for (String line : new XReadLines(RECAL_FILE)) { lineNumber++; - if (EOF_MARKER.equals(line)) { - sawEOF = true; - } - else if (COMMENT_PATTERN.matcher(line).matches()) { - ; // Skip over the comment lines, (which start with '#') - } - // Read in the covariates that were used from the input file - else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data - if (foundAllCovariates) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE); - } - else { // Found the covariate list in input file, loop through all of them and instantiate them - String[] vals = line.split(","); - for (int iii = 0; iii < vals.length - 4; iii++) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical - boolean foundClass = false; - for (Class covClass : classes) { - if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) { - foundClass = true; - try { - Covariate covariate = (Covariate) covClass.newInstance(); - requestedCovariates.add(covariate); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } + sawEOF = EOF_MARKER.equals(line); + if (sawEOF) + break; + + boolean requiredCovariatesLine = REQUIRED_COVARIATE_PATTERN.matcher(line).matches(); + boolean optionalCovariatesLine = OPTIONAL_COVARIATE_PATTERN.matcher(line).matches(); + + if (requiredCovariatesLine && foundRequiredCovariates) + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Duplicate required covariates line"); + + if (optionalCovariatesLine && foundOptionalCovariates) + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Duplicate optional covariates line"); + + if (optionalCovariatesLine && !foundRequiredCovariates) + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Optional covariates reported before Required covariates"); + + if (requiredCovariatesLine || optionalCovariatesLine) { + String [] covariateNames = line.split(": ")[1].split(","); // take the second half of the string (past the ":") and split it by "," to get the list of required covariates + + List covariateList = requiredCovariatesLine ? requiredCovariates : optionalCovariates; // set the appropriate covariate list to update + + for (String covariateName : covariateNames) { + boolean foundClass = false; + for (Class covClass : classes) { + if ((covariateName + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) { + foundClass = true; + try { + Covariate covariate = (Covariate) covClass.newInstance(); + covariate.initialize(RAC); + requestedCovariates.add(covariate); + covariateList.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); } } - - if (!foundClass) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option."); - } } + if (!foundClass) + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (covariateName + "Covariate") + ") isn't a valid covariate option."); } + foundRequiredCovariates = foundRequiredCovariates || requiredCovariatesLine; + foundOptionalCovariates = foundOptionalCovariates || optionalCovariatesLine; + } - } - else { // Found a line of data - if (!foundAllCovariates) { - foundAllCovariates = true; + else if (!line.startsWith("#")) { // if this is not a comment line that we don't care about, it is DATA! + if (!foundRequiredCovariates || !foundOptionalCovariates) // At this point all the covariates should have been found and initialized + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE); - // At this point all the covariates should have been found and initialized - if (requestedCovariates.size() < 2) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE); + if (!initializedKeyManagers) { + ArrayList emptyList = new ArrayList(0); + ArrayList requiredCovariatesUpToThis = new ArrayList(); // Initialize one key manager for each table of required covariate + for (Covariate covariate : requiredCovariates) { // Every required covariate table includes all preceding required covariates (e.g. RG ; RG,Q ) + requiredCovariatesUpToThis.add(covariate); + keyManagers.add(new BQSRKeyManager(requiredCovariatesUpToThis, emptyList)); } - - final boolean createCollapsedTables = true; - - // Initialize any covariate member variables using the shared argument collection - RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - for (Covariate cov : requestedCovariates) { - cov.initialize(RAC); - } - // Initialize the data hashMaps - dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size()); - + keyManagers.add(new BQSRKeyManager(requiredCovariates, optionalCovariates)); // One master key manager for the collapsed tables + + initializedKeyManagers = true; } - addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap + addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap } } @@ -140,67 +156,113 @@ public class BaseRecalibration { throw new UserException.MalformedFile(RECAL_FILE, errorMessage); } - if (dataManager == null) { - throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); - } - - dataManager.generateEmpiricalQualities(1, MAX_QUALITY_SCORE); + generateEmpiricalQualities(SMOOTHING_CONSTANT); } + /** * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) * + * @param file The CSV file we read the line from (for exception throwing purposes) * @param line A line of CSV data read from the recalibration table data file */ private void addCSVData(final File file, final String line) { final String[] vals = line.split(","); + boolean hasOptionalCovariates = optionalCovariates.size() > 0; // Do we have optional covariates in this key? + int addOptionalCovariates = hasOptionalCovariates ? 2 : 0; // If we have optional covariates at all, add two to the size of the array (to acommodate the covariate and the id) + final Object[] key = new Object[requiredCovariates.size() + addOptionalCovariates + 1]; // Reserve enough space for the required covariates, optional covariate, id and eventType - // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if (vals.length != requestedCovariates.size() + 4) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical - throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + - " --Perhaps the read group string contains a comma and isn't being parsed correctly."); + int indexCovariateValue = key.length - 3; // In the order of keys, the optional covariate comes right after the required covariates + int indexCovariateID = key.length - 2; // followed by the covariate ID + int indexEventType = key.length - 1; // and the event type + + addKeysToArray(key, vals, requiredCovariates, 0); // Add the required covariates keys + + if (hasOptionalCovariates) { + key[indexCovariateID] = Short.parseShort(vals[indexCovariateID]); // Add the optional covariate ID + Covariate covariate = optionalCovariates.get((Short) key[indexCovariateID]); // Get the covariate object for this ID + key[indexCovariateValue] = covariate.getValue(vals[indexCovariateValue]); // Add the optional covariate value, given the ID } + key[indexEventType] = EventType.eventFrom(vals[indexEventType]); // Add the event type - final Object[] key = new Object[requestedCovariates.size()]; - Covariate cov; - int iii; - for (iii = 0; iii < requestedCovariates.size(); iii++) { - cov = requestedCovariates.get(iii); - key[iii] = cov.getValue(vals[iii]); + int datumIndex = key.length; // The recal datum starts at the end of the key (after the event type) + long count = Long.parseLong(vals[datumIndex]); // Number of observations + long errors = Long.parseLong(vals[datumIndex + 1]); // Number of errors observed + double reportedQual = Double.parseDouble(vals[1]); // The reported Q score --> todo -- I don't like having the Q score hard coded in vals[1]. Generalize it! + final RecalDatum datum = new RecalDatum(count, errors, reportedQual, 0.0); // Create a new datum using the number of observations, number of mismatches, and reported quality score + + addToAllTables(key, datum); // Add that datum to all the collapsed tables which will be used in the sequential calculation + } + + /** + * Add the given mapping to all of the collapsed hash tables + * + * @param key The list of comparables that is the key for this mapping + * @param fullDatum The RecalDatum which is the data for this mapping + */ + private void addToAllTables(final Object[] key, final RecalDatum fullDatum) { + int nHashes = requiredCovariates.size(); // We will always need one hash per required covariate + if (optionalCovariates.size() > 0) // If we do have optional covariates + nHashes += 1; // we will need one extra hash table with the optional covariate encoded in the key set on top of the required covariates + + + for (int hashIndex = 0; hashIndex < nHashes; hashIndex++) { + HashMap table; // object to hold the hash table we are going to manipulate + if (hashIndex >= collapsedHashes.size()) { // if we haven't yet created the collapsed hash table for this index, create it now! + table = new HashMap(); + collapsedHashes.add(table); // Because this is the only place where we add tables to the ArrayList, they will always be in the order we want. + } + else + table = collapsedHashes.get(hashIndex); // if the table has been previously created, just assign it to the "table" object for manipulation + + int copyTo = hashIndex + 1; // this will copy the covariates up to the index of the one we are including now (1 for RG, 2 for QS,...) + if (copyTo > requiredCovariates.size()) // only in the case where we have optional covariates we need to increase the size of the array + copyTo = requiredCovariates.size() + 2; // if we have optional covarites, add the optional covariate and it's id to the size of the key + Object[] tableKey = new Object[copyTo + 1]; // create a new array that will hold as many keys as hashIndex (1 for RG hash, 2 for QualityScore hash, 3 for covariate hash plus the event type + System.arraycopy(key, 0, tableKey, 0, copyTo); // copy the keys for the corresponding covariates into the tableKey. + tableKey[tableKey.length-1] = key[key.length - 1]; // add the event type. The event type is always the last key, on both key sets. + + BitSet hashKey = keyManagers.get(hashIndex).bitSetFromKey(tableKey); // Add bitset key with fullDatum to the appropriate hash + RecalDatum datum = table.get(hashKey); + if (datum == null) + datum = fullDatum; + else if (hashIndex == 0) // Special case for the ReadGroup covariate + datum.combine(fullDatum); + else + datum.increment(fullDatum); + table.put(hashKey, datum); } - final String modelString = vals[iii++]; - final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.errorModelFrom(modelString); - - // Create a new datum using the number of observations, number of mismatches, and reported quality score - final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0); - // Add that datum to all the collapsed tables which will be used in the sequential calculation - - dataManager.addToAllTables(key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter } - public void recalibrateRead(final GATKSAMRecord read) { + /** + * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score + * that will be used in the sequential calculation in TableRecalibrationWalker + * + * @param smoothing The smoothing parameter that goes into empirical quality score calculation + */ + private void generateEmpiricalQualities(final int smoothing) { + for (final HashMap table : collapsedHashes) + for (final RecalDatum datum : table.values()) + datum.calcCombinedEmpiricalQuality(smoothing, QualityUtils.MAX_QUAL_SCORE); + } + + + + + public void recalibrateRead(final GATKSAMRecord read) { //compute all covariate values for this read RecalDataManager.computeCovariates(read, requestedCovariates); - final CovariateKeySet covariateKeySet = RecalDataManager.covariateKeySetFrom(read); + final ReadCovariates readCovariates = RecalDataManager.covariateKeySetFrom(read); - for (final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values()) { + for (final EventType errorModel : EventType.values()) { final byte[] originalQuals = read.getBaseQualities(errorModel); final byte[] recalQuals = originalQuals.clone(); // For each base in the read for (int offset = 0; offset < read.getReadLength(); offset++) { - - final Object[] fullCovariateKeyWithErrorMode = covariateKeySet.getKeySet(offset, errorModel); - final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length - 1); // need to strip off the error mode which was appended to the list of covariates - - // BUGBUG: This caching seems to put the entire key set into memory which negates the benefits of storing the delta delta tables? - //Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode); - //if( qualityScore == null ) { - final byte qualityScore = performSequentialQualityCalculation(errorModel, fullCovariateKey); - // qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode); - //} - + final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); + final byte qualityScore = performSequentialQualityCalculation(keySet, errorModel); recalQuals[offset] = qualityScore; } @@ -209,6 +271,8 @@ public class BaseRecalibration { } } + + /** * Implements a serial recalibration of the reads using the combinational table. * First, we perform a positional recalibration, and then a subsequent dinuc correction. @@ -221,20 +285,26 @@ public class BaseRecalibration { * - The final shift equation is: * * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) + * + * todo -- I extremely dislike the way all this math is hardcoded... should rethink the data structures for this method in particular. * * @param key The list of Comparables that were calculated from the covariates + * @param errorModel the event type * @return A recalibrated quality score as a byte */ - private byte performSequentialQualityCalculation(final RecalDataManager.BaseRecalibrationType errorModel, final Object... key) { - - final byte qualFromRead = (byte) Integer.parseInt(key[1].toString()); - final Object[] readGroupCollapsedKey = new Object[1]; - final Object[] qualityScoreCollapsedKey = new Object[2]; - final Object[] covariateCollapsedKey = new Object[3]; - + private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) { + final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]); + + final int readGroupKeyIndex = 0; + final int qualKeyIndex = 1; + final int covariatesKeyIndex = 2; + // The global quality shift (over the read group only) - readGroupCollapsedKey[0] = key[0]; - final RecalDatum globalRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(0, errorModel).get(readGroupCollapsedKey)); + List bitKeys = keyManagers.get(readGroupKeyIndex).bitSetsFromAllKeys(key, errorModel); + if (bitKeys.size() > 1) + throw new ReviewedStingException("There should only be one key for the RG collapsed table, something went wrong here"); + + final RecalDatum globalRecalDatum = collapsedHashes.get(readGroupKeyIndex).get(bitKeys.get(0)); double globalDeltaQ = 0.0; if (globalRecalDatum != null) { final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); @@ -243,9 +313,11 @@ public class BaseRecalibration { } // The shift in quality between reported and empirical - qualityScoreCollapsedKey[0] = key[0]; - qualityScoreCollapsedKey[1] = key[1]; - final RecalDatum qReportedRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(1, errorModel).get(qualityScoreCollapsedKey)); + bitKeys = keyManagers.get(qualKeyIndex).bitSetsFromAllKeys(key, errorModel); + if (bitKeys.size() > 1) + throw new ReviewedStingException("There should only be one key for the Qual collapsed table, something went wrong here"); + + final RecalDatum qReportedRecalDatum = collapsedHashes.get(qualKeyIndex).get(bitKeys.get(0)); double deltaQReported = 0.0; if (qReportedRecalDatum != null) { final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); @@ -253,13 +325,11 @@ public class BaseRecalibration { } // The shift in quality due to each covariate by itself in turn + bitKeys = keyManagers.get(covariatesKeyIndex).bitSetsFromAllKeys(key, errorModel); double deltaQCovariates = 0.0; double deltaQCovariateEmpirical; - covariateCollapsedKey[0] = key[0]; - covariateCollapsedKey[1] = key[1]; - for (int iii = 2; iii < key.length; iii++) { - covariateCollapsedKey[2] = key[iii]; // The given covariate - final RecalDatum covariateRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(iii, errorModel).get(covariateCollapsedKey)); + for (BitSet k : bitKeys) { + final RecalDatum covariateRecalDatum = collapsedHashes.get(covariatesKeyIndex).get(k); if (covariateRecalDatum != null) { deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported)); @@ -267,7 +337,7 @@ public class BaseRecalibration { } final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - return QualityUtils.boundQual((int) Math.round(newQuality), (byte) MAX_QUALITY_SCORE); + return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_QUAL_SCORE); } /** @@ -283,4 +353,19 @@ public class BaseRecalibration { } } } + + /** + * Shared functionality to add keys + * + * @param array the target array we are creating the keys in + * @param keys the actual keys we're using as a source + * @param covariateList the covariate list to loop through + * @param keyIndex the index in the keys and the arrays objects to run from + */ + private void addKeysToArray(final Object[] array, final String[] keys, List covariateList, int keyIndex) { + for (Covariate covariate : covariateList) { + array[keyIndex] = covariate.getValue(keys[keyIndex]); + keyIndex++; + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index b17e325fc..de8c50935 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -361,10 +361,10 @@ public class ArtificialSAMUtils { final GATKSAMRecord left = pair.get(0); final GATKSAMRecord right = pair.get(1); - pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false)); + pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false, false, false)); if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { - pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false)); + pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false, false, false)); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 41bf74e4b..6b43479dc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; +import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -165,7 +165,7 @@ public class GATKSAMRecord extends BAMRecord { /** * Setters and Accessors for base insertion and base deletion quality scores */ - public void setBaseQualities( final byte[] quals, final RecalDataManager.BaseRecalibrationType errorModel ) { + public void setBaseQualities( final byte[] quals, final EventType errorModel ) { switch( errorModel ) { case BASE_SUBSTITUTION: setBaseQualities(quals); @@ -181,7 +181,7 @@ public class GATKSAMRecord extends BAMRecord { } } - public byte[] getBaseQualities( final RecalDataManager.BaseRecalibrationType errorModel ) { + public byte[] getBaseQualities( final EventType errorModel ) { switch( errorModel ) { case BASE_SUBSTITUTION: return getBaseQualities(); @@ -204,7 +204,7 @@ public class GATKSAMRecord extends BAMRecord { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION); + setBaseQualities(quals, EventType.BASE_INSERTION); } return quals; } @@ -215,7 +215,7 @@ public class GATKSAMRecord extends BAMRecord { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_DELETION); + setBaseQualities(quals, EventType.BASE_DELETION); } return quals; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java index 312ad252e..30a9bad3e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -33,24 +35,22 @@ public class ContextCovariateUnitTest { public void testSimpleContexts() { byte[] quals = ReadUtils.createRandomReadQuals(10000); byte[] bbases = ReadUtils.createRandomReadBases(10000, true); - String bases = stringFrom(bbases); - // System.out.println("Read: " + bases); GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); CovariateValues values = covariate.getValues(read); - verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases); - verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases); - verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases); + verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases())); + verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases())); + verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases())); } private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) { for (int i = 0; i < values.length; i++) { - String expectedContext = covariate.NO_CONTEXT_VALUE; + String expectedContext = null; if (i >= contextSize) { String context = bases.substring(i - contextSize, i); if (!context.contains("N")) expectedContext = context; } - // System.out.println(String.format("Context [%d]:\n%s\n%s\n", i, covariate.keyFromBitSet(values[i]), expectedContext)); Assert.assertEquals(covariate.keyFromBitSet(values[i]), expectedContext); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java index f92bff600..49315672c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java @@ -45,15 +45,6 @@ public class CycleCovariateUnitTest { values = covariate.getValues(read); verifyCovariateArray(values.getMismatches(), readLength, (short) -1); - read.setReadPairedFlag(true); - read.setSecondOfPairFlag(true); - values = covariate.getValues(read); - verifyCovariateArray(values.getMismatches(), (short) -readLength, (short) 1); - - read.setReadNegativeStrandFlag(false); - values = covariate.getValues(read); - verifyCovariateArray(values.getMismatches(), (short) -1, (short) -1); - } private void verifyCovariateArray(BitSet[] values, short init, short increment) { diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java new file mode 100644 index 000000000..3e50a5fd1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.testng.annotations.Test; + +import java.io.File; + +/** + * Unit tests for on-the-fly recalibration. + * + * @author Mauricio Carneiro + * @since 3/16/12 + */ +public class BaseRecalibrationUnitTest { + + @Test(enabled=true) + public void testReadingCSV() { + File csv = new File("public/testdata/exampleCSV.csv"); + BaseRecalibration baseRecalibration = new BaseRecalibration(csv); + System.out.println("Success"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 520fb7040..5946e38ea 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -42,8 +42,8 @@ public class GATKSAMRecordUnitTest extends BaseTest { @Test public void testReducedReadPileupElement() { - PileupElement readp = new PileupElement(read, 0, false, false, false, false); - PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false); + PileupElement readp = new PileupElement(read, 0, false, false, false, false, false, false); + PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false, false, false); Assert.assertFalse(readp.getRead().isReducedRead());