diff --git a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java index 8cd910759..c0bc2c5a7 100755 --- a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.filter.SamRecordFilter; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.*; import org.apache.log4j.Logger; import org.broad.tribble.util.variantcontext.VariantContext; @@ -318,7 +319,7 @@ public abstract class AbstractGenomeAnalysisEngine { referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile); validateSuppliedReads(); - readsDataSource = createReadsDataSource(genomeLocParser); + readsDataSource = createReadsDataSource(genomeLocParser, referenceDataSource.getReference()); sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); @@ -540,7 +541,7 @@ public abstract class AbstractGenomeAnalysisEngine { * * @return A data source for the given set of reads. */ - private SAMDataSource createReadsDataSource(GenomeLocParser genomeLocParser) { + private SAMDataSource createReadsDataSource(GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { DownsamplingMethod method = getDownsamplingMethod(); return new SAMDataSource( @@ -553,7 +554,8 @@ public abstract class AbstractGenomeAnalysisEngine { new ValidationExclusion(Arrays.asList(argCollection.unsafe)), filters, includeReadsWithDeletionAtLoci(), - generateExtendedEvents()); + generateExtendedEvents(), + argCollection.BAQMode, refReader); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index 271d6696c..3ac0ce429 100755 --- a/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -1,10 +1,12 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.filter.SamRecordFilter; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; +import org.broadinstitute.sting.utils.BAQ; import java.util.List; import java.util.Collection; @@ -35,7 +37,11 @@ public class ReadProperties { private Collection supplementalFilters = null; private boolean includeReadsWithDeletionAtLoci = false; private boolean useOriginalBaseQualities = false; - private boolean generateExtendedEvents = false; // do we want to generate additional piles of "extended" events (indels) + private boolean generateExtendedEvents = false; + private BAQ.Mode mode = BAQ.Mode.NONE; + IndexedFastaSequenceFile refReader = null; // read for BAQ, if desired + + // do we want to generate additional piles of "extended" events (indels) // immediately after the reference base such event is associated with? @@ -120,6 +126,15 @@ public class ReadProperties { return useOriginalBaseQualities; } + + public BAQ.Mode getBAQMode() { + return mode; + } + + public IndexedFastaSequenceFile getRefReader() { + return refReader; + } + /** * Extract the command-line arguments having to do with reads input * files and store them in an easy-to-work-with package. Constructor @@ -138,6 +153,8 @@ public class ReadProperties { * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method * will explicitly list reads with deletion over the current reference base; otherwise, only observed * bases will be seen in the pileups, and the deletions will be skipped silently. + * @param mode How should we apply the BAQ calculation to the reads? + * @param refReader if applyBAQ is true, must be a valid pointer to a indexed fasta file reads so we can get the ref bases for BAQ calculation */ public ReadProperties( List samFiles, SAMFileHeader header, @@ -148,7 +165,9 @@ public class ReadProperties { ValidationExclusion exclusionList, Collection supplementalFilters, boolean includeReadsWithDeletionAtLoci, - boolean generateExtendedEvents) { + boolean generateExtendedEvents, + BAQ.Mode mode, + IndexedFastaSequenceFile refReader ) { this.readers = samFiles; this.header = header; this.readBufferSize = readBufferSize; @@ -159,5 +178,7 @@ public class ReadProperties { this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; this.generateExtendedEvents = generateExtendedEvents; this.useOriginalBaseQualities = useOriginalBaseQualities; + this.mode = mode; + this.refReader = refReader; } } diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 0b7541c07..7de4c0154 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.utils.interval.IntervalSetRule; +import org.broadinstitute.sting.utils.BAQ; import org.simpleframework.xml.*; import org.simpleframework.xml.core.Persister; import org.simpleframework.xml.stream.Format; @@ -149,6 +150,10 @@ public class GATKArgumentCollection { return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction); } + @Element(required = false) + @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) + public BAQ.Mode BAQMode = BAQ.Mode.NONE; + /** * Gets the default downsampling method, returned if the user didn't specify any downsampling * method. @@ -336,11 +341,9 @@ public class GATKArgumentCollection { return false; } if (BTIMergeRule != other.BTIMergeRule) - return false; - -// if (other.enableRodWalkers != this.enableRodWalkers) { -// return false; -// } + return false; + if ( BAQMode != other.BAQMode) + return false; return true; } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index d2d9d76b1..c32c4661d 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -29,6 +29,7 @@ import net.sf.samtools.util.CloseableIterator; import net.sf.picard.filter.SamRecordFilter; import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.picard.sam.MergingSamRecordIterator; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.DownsamplingMethod; @@ -43,6 +44,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -141,6 +143,34 @@ public class SAMDataSource implements SimpleDataSource { ); } + /** + * See complete constructor. Does not enable BAQ by default. + */ + public SAMDataSource( + List samFiles, + GenomeLocParser genomeLocParser, + boolean useOriginalBaseQualities, + SAMFileReader.ValidationStringency strictness, + Integer readBufferSize, + DownsamplingMethod downsamplingMethod, + ValidationExclusion exclusionList, + Collection supplementalFilters, + boolean includeReadsWithDeletionAtLoci, + boolean generateExtendedEvents ) { + this( samFiles, + genomeLocParser, + useOriginalBaseQualities, + strictness, + readBufferSize, + downsamplingMethod, + exclusionList, + supplementalFilters, + includeReadsWithDeletionAtLoci, + generateExtendedEvents, + BAQ.Mode.NONE, null // no BAQ + ); + } + /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. @@ -167,7 +197,9 @@ public class SAMDataSource implements SimpleDataSource { ValidationExclusion exclusionList, Collection supplementalFilters, boolean includeReadsWithDeletionAtLoci, - boolean generateExtendedEvents + boolean generateExtendedEvents, + BAQ.Mode Mode, + IndexedFastaSequenceFile refReader ) { this.readMetrics = new ReadMetrics(); this.genomeLocParser = genomeLocParser; @@ -213,8 +245,8 @@ public class SAMDataSource implements SimpleDataSource { exclusionList, supplementalFilters, includeReadsWithDeletionAtLoci, - generateExtendedEvents - ); + generateExtendedEvents, + Mode, refReader ); // cache the read group id (original) -> read group id (merged) // and read group id (merged) -> read group id (original) mappings. @@ -483,7 +515,8 @@ public class SAMDataSource implements SimpleDataSource { new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)), readProperties.getDownsamplingMethod().toFraction, readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - readProperties.getSupplementalFilters()); + readProperties.getSupplementalFilters(), + readProperties.getBAQMode(), readProperties.getRefReader()); } /** @@ -506,7 +539,8 @@ public class SAMDataSource implements SimpleDataSource { new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)), readProperties.getDownsamplingMethod().toFraction, readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - readProperties.getSupplementalFilters()); + readProperties.getSupplementalFilters(), + readProperties.getBAQMode(), readProperties.getRefReader()); } /** @@ -539,7 +573,8 @@ public class SAMDataSource implements SimpleDataSource { StingSAMIterator wrappedIterator, Double downsamplingFraction, Boolean noValidationOfReadOrder, - Collection supplementalFilters) { + Collection supplementalFilters, + BAQ.Mode mode, IndexedFastaSequenceFile refReader ) { wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities); // NOTE: this (and other filtering) should be done before on-the-fly sorting @@ -552,6 +587,9 @@ public class SAMDataSource implements SimpleDataSource { if (!noValidationOfReadOrder && enableVerification) wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator); + if (mode != BAQ.Mode.NONE) + wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, mode); + wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); return wrappedIterator; diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/BAQSamIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/BAQSamIterator.java new file mode 100644 index 000000000..4095cae20 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/iterators/BAQSamIterator.java @@ -0,0 +1,37 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMRecord; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.BAQ; + +import java.util.Iterator; + +/** + * Simple iterator that applies Heng's BAQ calculation to a stream of incoming reads + */ +public class BAQSamIterator implements StingSAMIterator { + StingSAMIterator it; + BAQ baqHMM = new BAQ(); // creates a BAQ creator with default parameters + IndexedFastaSequenceFile refReader = null; + BAQ.Mode mode; + + /** + * Creates a new BAMSamIterator using the reference getter refReader and applies the BAM to the reads coming + * in from it. See BAQ docs for baqType information. + * + * @param refReader + * @param it + * @param mode + */ + public BAQSamIterator(IndexedFastaSequenceFile refReader, StingSAMIterator it, BAQ.Mode mode) { + this.refReader =refReader; + this.it = it; + this.mode = mode; + } + + public SAMRecord next() { return baqHMM.baqRead(it.next(), refReader, mode); } + public boolean hasNext() { return this.it.hasNext(); } + public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } + public void close() { it.close(); } + public Iterator iterator() { return this; } +} diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index e5a561176..2499f1e9a 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -4,16 +4,11 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.util.RuntimeIOException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.gatk.ReadProperties; import java.util.Iterator; /** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Mar 15, 2009 - * Time: 6:02:31 PM - * To change this template use File | Settings | File Templates. + * Verifies that the incoming stream of reads is correctly sorted */ public class VerifyingSamIterator implements StingSAMIterator { private GenomeLocParser genomeLocParser; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java index eb4b84ddd..74f63aa2f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java @@ -10,12 +10,12 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; /** * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. * Can also count the number of reads matching a given criterion using read filters (see the - * --read-filter command line argument). Simplest example of a read-backed analysis. + * --read-filter command line argument). Simplest example of a read-backed analysis. */ @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountReadsWalker extends ReadWalker { public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - //System.out.println(read.format()); + return 1; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java new file mode 100755 index 000000000..d00dae153 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java @@ -0,0 +1,142 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.BAQ; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.Argument; + +import java.io.PrintStream; + +/** + * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. + * Can also count the number of reads matching a given criterion using read filters (see the + * --read-filter command line argument). Simplest example of a read-backed analysis. + */ +@Reference(window=@Window(start=-5,stop=5)) +@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +public class ValidateBAQWalker extends ReadWalker { + @Output(doc="File to which results should be written",required=true) + protected PrintStream out; + + @Argument(doc="maximum read length to apply the BAQ calculation too",required=false) + protected int maxReadLen = 1000; + + @Argument(doc="only operates on reads with this name",required=false) + protected String readName = null; + + @Argument(doc="only prints out detailed information on the impact of BAQ when our implementation differences from the samtools BAQ tag", required=false) + protected boolean onlyPrintOnFailures = false; + + @Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false) + protected boolean alsoPrintWarnings = false; + + int counter = 0; + + BAQ baqHMM = new BAQ(1e-3, 0.1, 7, 0); // matches current samtools parameters + + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { + IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference(); + + if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && BAQ.hasBAQTag(read) ) { + byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, false); + if (counter++ % 1000 == 0) out.printf("Checking read %s (%d)%n", read.getReadName(), counter); + BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader); + + boolean fail = false; + boolean print = false; + int badi = 0; + for ( badi = 0; badi < baqFromTag.length; badi++ ) { + if ( baqFromTag[badi] != baq.bq[badi] ) { + if (MathUtils.arrayMin(read.getBaseQualities()) == 0) { + print = true; + out.printf(" different, but Q0 base detected%n"); + break; + } + else if (readHasSoftClip(read)) { + print = true; + out.printf(" different, but soft clip detected%n"); + break; + } else if (readHasDeletion(read)) { + print = true; + out.printf(" different, but deletion detected%n"); + break; + } else { + fail = true; print = true; + break; + } + } + } + + if ( ! onlyPrintOnFailures || ( print && ( alsoPrintWarnings || fail ) ) ) { + out.printf(" read length : %d%n", read.getReadLength()); + out.printf(" read start : %d%n", read.getAlignmentStart()); + out.printf(" cigar : %s%n", read.getCigarString()); + out.printf(" read bases : %s%n", new String(read.getReadBases())); + out.printf(" ref length : %d%n", baq.refBases.length); + out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG)); + printQuals(" BQ deltas : ", BAQ.getBAQDeltas(read), true); + printQuals(" original quals: ", read.getBaseQualities(), true); + printQuals(" original quals: ", read.getBaseQualities()); + printQuals(" tag quals: ", baqFromTag); + printQuals(" hmm quals: ", baq.bq); + out.printf(" read bases : %s%n", new String(read.getReadBases())); + } + + + if ( fail ) + throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d", + read.getReadName(), badi, baqFromTag[badi], baq.bq[badi])); + } + + return 1; + } + + private final static boolean readHasSoftClip(SAMRecord read) { + for (CigarElement e : read.getCigar().getCigarElements()) { + if ( e.getOperator() == CigarOperator.SOFT_CLIP ) + return true; + } + + return false; + } + + private final static boolean readHasDeletion(SAMRecord read) { + for (CigarElement e : read.getCigar().getCigarElements()) { + if ( e.getOperator() == CigarOperator.DELETION ) + return true; + } + + return false; + } + + private final void printQuals( String prefix, byte[] quals ) { + printQuals(prefix, quals, false); + } + + private final void printQuals( String prefix, byte[] quals, boolean asInt ) { + out.print(prefix); + for ( int i = 0; i < quals.length; i++) { + if ( asInt ) { + out.print((int)quals[i]); + if ( i+1 != quals.length ) out.print(","); + } else + out.print((char)(quals[i]+33)); + } + out.println(); + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } +} + diff --git a/java/src/org/broadinstitute/sting/utils/BAQ.java b/java/src/org/broadinstitute/sting/utils/BAQ.java new file mode 100644 index 000000000..ad6d074c8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/BAQ.java @@ -0,0 +1,490 @@ +package org.broadinstitute.sting.utils; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.ReferenceSequence; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/* + The topology of the profile HMM: + + /\ /\ /\ /\ + I[1] I[k-1] I[k] I[L] + ^ \ \ ^ \ ^ \ \ ^ + | \ \ | \ | \ \ | + M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1] + \ \/ \/ \/ / + \ /\ /\ /\ / + -> D[k-1] -> D[k] -> + + M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1]. + + On input, _ref is the reference sequence and _query is the query + sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an + ambiguous residue. iqual is the base quality. c sets the gap open + probability, gap extension probability and band width. + + On output, state and q are arrays of length l_query. The higher 30 + bits give the reference position the query base is matched to and the + lower two bits can be 0 (an alignment match) or 1 (an + insertion). q[i] gives the phred scaled posterior probability of + state[i] being wrong. + */ +public class BAQ { + private final static boolean DEBUG = false; + + public enum Mode { + NONE, // don't apply a BAQ at all, the default + USE_TAG_ONLY, // only use the TAG, if present, in the reads + CALCULATE_AS_NECESSARY, // do HMM BAQ calculation on the fly, as necessary, if there's no tag + RECALCULATE_ALWAYS // do HMM BAQ calculation on the fly, regardless of whether there's a tag present + } + + public static final String BAQ_TAG = "BQ"; + + private static double[] qual2prob = new double[256]; + static { + for (int i = 0; i < 256; ++i) + qual2prob[i] = Math.pow(10, -i/10.); + } + + private double cd = 1e-3; // gap open probility [1e-3] + private double ce = 0.1; // gap extension probability [0.1] + private int cb = 7; // band width [7] + + /** + * Any bases with Q < MIN_BASE_QUAL are raised up to this base quality + */ + private int minBaseQual = 4; + + public double getGapOpenProb() { + return cd; + } + + public double getGapExtensionProb() { + return ce; + } + + public int getBandWidth() { + return cb; + } + + /** + * Use defaults for everything + */ + public BAQ() { } + + /** + * Create a new HmmGlocal object with specified parameters + * + * @param d gap open prob. + * @param e gap extension prob. + * @param b band width + * @param minBaseQual All bases with Q < minBaseQual are up'd to this value + */ + public BAQ(final double d, final double e, final int b, final int minBaseQual) { + cd = d; ce = e; cb = b; this.minBaseQual = minBaseQual; + } + + private final static double EM = 0.33333333333; + private final static double EI = 0.25; + + // #################################################################################################### + // + // NOTE -- THIS CODE IS SYNCHRONIZED WITH CODE IN THE SAMTOOLS REPOSITORY. CHANGES TO THIS CODE SHOULD BE + // NOTE -- PUSHED BACK TO HENG LI + // + // Note that _ref and _query are in the special 0-4 encoding [see above for docs] + // + // #################################################################################################### + public int hmm_glocal(final byte[] _ref, final byte[] _query, final byte[] _iqual, int[] state, byte[] q) { + if ( _ref == null ) throw new ReviewedStingException("BUG: ref sequence is null"); + if ( _query == null ) throw new ReviewedStingException("BUG: query sequence is null"); + if ( _iqual == null ) throw new ReviewedStingException("BUG: query quality vector is null"); + if ( _query.length != _iqual.length ) throw new ReviewedStingException("BUG: read sequence length != qual length"); + if ( q != null && q.length != _query.length ) throw new ReviewedStingException("BUG: BAQ quality length != read sequence length"); + if ( state != null && state.length != _query.length ) throw new ReviewedStingException("BUG: state length != read sequence length"); + + int i, k; + + /*** initialization ***/ + // change coordinates + int l_ref = _ref.length; + byte[] ref = new byte[l_ref+1]; + for (i = 0; i < l_ref; ++i) ref[i+1] = _ref[i]; // FIXME: this is silly... + int l_query = _query.length; + byte[] query = new byte[l_query+1]; + double[] qual = new double[l_query+1]; + for (i = 0; i < l_query; ++i) { + query[i+1] = _query[i]; + qual[i+1] = qual2prob[_iqual[i] < minBaseQual ? minBaseQual : _iqual[i]]; + } + + // set band width + int bw2, bw = l_ref > l_query? l_ref : l_query; + if (bw > cb) bw = cb; + if (bw < Math.abs(l_ref - l_query)) bw = Math.abs(l_ref - l_query); + bw2 = bw * 2 + 1; + + // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] + double[][] f = new double[l_query+1][bw2*3 + 6]; + double[][] b = new double[l_query+1][bw2*3 + 6]; + double[] s = new double[l_query+2]; + + // initialize transition probabilities + double sM, sI, bM, bI; + sM = sI = 1. / (2 * l_query + 2); + bM = (1 - cd) / l_query; bI = cd / l_query; // (bM+bI)*l_query==1 + double[] m = new double[9]; + m[0*3+0] = (1 - cd - cd) * (1 - sM); m[0*3+1] = m[0*3+2] = cd * (1 - sM); + m[1*3+0] = (1 - ce) * (1 - sI); m[1*3+1] = ce * (1 - sI); m[1*3+2] = 0.; + m[2*3+0] = 1 - ce; m[2*3+1] = 0.; m[2*3+2] = ce; + + /*** forward ***/ + // f[0] + f[0][set_u(bw, 0, 0)] = s[0] = 1.; + { // f[1] + double[] fi = f[1]; + double sum; + int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end; + for (k = beg, sum = 0.; k <= end; ++k) { + int u; + double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM; + u = set_u(bw, 1, k); + fi[u+0] = e * bM; fi[u+1] = EI * bI; + sum += fi[u] + fi[u+1]; + } + // rescale + s[1] = sum; + _beg = set_u(bw, 1, beg); _end = set_u(bw, 1, end); _end += 2; + for (k = _beg; k <= _end; ++k) fi[k] /= sum; + } + + // f[2..l_query] + for (i = 2; i <= l_query; ++i) { + double[] fi = f[i], fi1 = f[i-1]; + double sum, qli = qual[i]; + int beg = 1, end = l_ref, x, _beg, _end; + byte qyi = query[i]; + x = i - bw; beg = beg > x? beg : x; // band start + x = i + bw; end = end < x? end : x; // band end + for (k = beg, sum = 0.; k <= end; ++k) { + int u, v11, v01, v10; + double e; + e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM; + u = set_u(bw, i, k); v11 = set_u(bw, i-1, k-1); v10 = set_u(bw, i-1, k); v01 = set_u(bw, i, k-1); + fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]); + fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]); + fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2]; + sum += fi[u] + fi[u+1] + fi[u+2]; + //System.out.println("("+i+","+k+";"+u+"): "+fi[u]+","+fi[u+1]+","+fi[u+2]); + } + // rescale + s[i] = sum; + _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2; + for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum; + } + { // f[l_query+1] + double sum; + for (k = 1, sum = 0.; k <= l_ref; ++k) { + int u = set_u(bw, l_query, k); + if (u < 3 || u >= bw2*3+3) continue; + sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI; + } + s[l_query+1] = sum; // the last scaling factor + } + + + /*** backward ***/ + // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from) + for (k = 1; k <= l_ref; ++k) { + int u = set_u(bw, l_query, k); + double[] bi = b[l_query]; + if (u < 3 || u >= bw2*3+3) continue; + bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1]; + } + // b[l_query-1..1] + for (i = l_query - 1; i >= 1; --i) { + int beg = 1, end = l_ref, x, _beg, _end; + double[] bi = b[i], bi1 = b[i+1]; + double y = (i > 1)? 1. : 0., qli1 = qual[i+1]; + byte qyi1 = query[i+1]; + x = i - bw; beg = beg > x? beg : x; + x = i + bw; end = end < x? end : x; + for (k = end; k >= beg; --k) { + int u, v11, v01, v10; + double e; + u = set_u(bw, i, k); v11 = set_u(bw, i+1, k+1); v10 = set_u(bw, i+1, k); v01 = set_u(bw, i, k+1); + e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11]; + bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e. + bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1]; + bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y; + } + // rescale + _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2; + for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y; + } + + // TODO -- this appears to be a null operation overall. For debugging only? + double pb; + { // b[0] + int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1; + double sum = 0.; + for (k = end; k >= beg; --k) { + int u = set_u(bw, 1, k); + double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM; + if (u < 3 || u >= bw2*3+3) continue; + sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI; + } + pb = b[0][set_u(bw, 0, 0)] = sum / s[0]; // if everything works as is expected, pb == 1.0 + } + + /*** MAP ***/ + for (i = 1; i <= l_query; ++i) { + double sum = 0., max = 0.; + double[] fi = f[i], bi = b[i]; + int beg = 1, end = l_ref, x, max_k = -1; + x = i - bw; beg = beg > x? beg : x; + x = i + bw; end = end < x? end : x; + for (k = beg; k <= end; ++k) { + int u = set_u(bw, i, k); + double z; + sum += (z = fi[u+0] * bi[u+0]); if (z > max) { max = z; max_k = (k-1)<<2 | 0; } + sum += (z = fi[u+1] * bi[u+1]); if (z > max) { max = z; max_k = (k-1)<<2 | 1; } + } + max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 + if (state != null) state[i-1] = max_k; + if (q != null) { + k = (int)(-4.343 * Math.log(1. - max) + .499); + q[i-1] = (byte)(k > 100? 99 : k); + } + //System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")"); + } + + return 0; + } + + /** + * helper routine for hmm_glocal + * + * @param b + * @param i + * @param k + * @return + */ + private static int set_u(final int b, final int i, final int k) { + int x = i - b; + x = x > 0 ? x : 0; + return (k + 1 - x) * 3; + } + + private static byte[] bases2indices(byte[] bases) { + byte[] out = new byte[bases.length]; + + for ( int i = 0; i < bases.length; i++ ) { + switch (bases[i]) { + case 'A': case 'a': out[i] = 0; break; + case 'C': case 'c': out[i] = 1; break; + case 'G': case 'g': out[i] = 2; break; + case 'T': case 't': out[i] = 3; break; + default: out[i] = 4; break; + } + } + + return out; + } + + /** + * Get the BAQ attribute from the tag in read. Returns null if no BAQ tag is present. + * @param read + * @return + */ + public static byte[] getBAQTag(SAMRecord read) { + String s = read.getStringAttribute(BAQ_TAG); + return s != null ? s.getBytes() : null; + } + + /** + * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present. + * @param read + * @return + */ + public static byte[] getBAQDeltas(SAMRecord read) { + byte[] baq = getBAQTag(read); + if ( baq != null ) { + byte[] deltas = new byte[baq.length]; + for ( int i = 0; i < deltas.length; i++) + deltas[i] = (byte)(-1 * (baq[i] - 64)); + return deltas; + } else + return null; + } + + /** + * Returns true if the read has a BAQ tag, or false otherwise + * @param read + * @return + */ + public static boolean hasBAQTag(SAMRecord read) { + return read.getStringAttribute(BAQ_TAG) != null; + } + + /** + * Returns a new qual array for read that includes the BAQ adjusted. Does not support on-the-fly BAQ calculation + * + * @param read the SAMRecord to operate on + * @param overwriteOriginalQuals If true, we replace the original qualities scores in the read with their BAQ'd version + * @param useRawQualsIfNoBAQTag If useRawQualsIfNoBAQTag is true, then if there's no BAQ annotation we just use the raw quality scores. Throws IllegalStateException is false and no BAQ tag is present + * @return + */ + public static byte[] calcBAQFromTag(SAMRecord read, boolean overwriteOriginalQuals, boolean useRawQualsIfNoBAQTag) { + byte[] rawQuals = read.getBaseQualities(); + byte[] newQuals = rawQuals; + byte[] baq = getBAQTag(read); + + if ( baq != null ) { + // Offset to base alignment quality (BAQ), of the same length as the read sequence. + // At the i-th read base, BAQi = Qi ? (BQi ? 64) where Qi is the i-th base quality. + newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length]; + for ( int i = 0; i < rawQuals.length; i++) { + int val = rawQuals[i] - (baq[i] - 64); + newQuals[i] = val < 0 ? 0 : (byte)val; + } + } else if ( ! useRawQualsIfNoBAQTag ) { + throw new IllegalStateException("Required BAQ tag to be present, but none was on read " + read.getReadName()); + } + + return newQuals; + } + + public static class BAQCalculationResult { + public byte[] refBases, rawQuals, readBases, bq; + public int refOffset; + public int[] state; + + // todo -- optimization: use static growing arrays here + public BAQCalculationResult(SAMRecord read, byte[] ref, int refOffset) { + // prepares data for calculation + rawQuals = read.getBaseQualities(); + readBases = read.getReadBases(); + + // now actually prepare the data structures, and fire up the hmm + bq = new byte[rawQuals.length]; + state = new int[rawQuals.length]; + this.refBases = ref; + this.refOffset = refOffset; + } + } + + private static int getFirstInsertionOffset(SAMRecord read) { + CigarElement e = read.getCigar().getCigarElement(0); + if ( e.getOperator() == CigarOperator.I ) + return e.getLength(); + else + return 0; + } + + private static int getLastInsertionOffset(SAMRecord read) { + CigarElement e = read.getCigar().getCigarElement(read.getCigarLength()-1); + if ( e.getOperator() == CigarOperator.I ) + return e.getLength(); + else + return 0; + } + + public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) { + // start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar + int offset = getBandWidth() / 2; + int start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read); + int stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read); + + // now that we have the start and stop, get the reference sequence covering it + ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop); + + return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read))); + } + + // we need to bad ref by at least the bandwidth / 2 on either side + public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) { + // note -- assumes ref is offset from the *CLIPPED* start + BAQCalculationResult baqResult = new BAQCalculationResult(read, ref, refOffset); + byte[] convSeq = bases2indices(baqResult.readBases); + byte[] convRef = bases2indices(baqResult.refBases); + + hmm_glocal(convRef, convSeq, baqResult.rawQuals, baqResult.state, baqResult.bq); + + // cap quals + int readI = 0, refI = 0; + for ( CigarElement elt : read.getCigar().getCigarElements() ) { + int l = elt.getLength(); + switch (elt.getOperator()) { + case N: // cannot handle these + return null; + case H : case P : // ignore pads and hard clips + break; + case I : case S : + // todo -- is it really the case that we want to treat I and S the same? + for ( int i = readI; i < readI + l; i++ ) baqResult.bq[i] = baqResult.rawQuals[i]; + readI += l; + break; + case D : refI += l; break; + case M : + for (int i = readI; i < readI + l; i++) { + boolean isIndel = stateIsIndel(baqResult.state[i]); + int pos = stateAlignedPosition(baqResult.state[i]); + int expectedPos = refI - refOffset + (i - readI); + if ( isIndel || pos != expectedPos ) + // we are an indel or we don't align to our best current position + baqResult.bq[i] = 0; + else + baqResult.bq[i] = baqResult.bq[i] < baqResult.rawQuals[i] ? baqResult.bq[i] : baqResult.rawQuals[i]; + } + readI += l; refI += l; + break; + } + } + + return baqResult; + } + + /** + * Modifies read in place so that the base quality scores are capped by the BAQ calculation. Uses the BAQ + * tag if present already and alwaysRecalculate is false, otherwise fires up the HMM and does the BAQ on the fly + * using the refReader to obtain the reference bases as needed. + * + * @param read + * @param refReader + * @param calculationType + * @return + */ + public SAMRecord baqRead(SAMRecord read, IndexedFastaSequenceFile refReader, Mode calculationType ) { + if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName()); + if ( calculationType == Mode.NONE ) { // we don't want to do anything + ; // just fall though + } else if ( calculationType == Mode.USE_TAG_ONLY ) { + calcBAQFromTag(read, true, true); + } else { + if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) ) { + if ( DEBUG ) System.out.printf(" Calculating BAQ on the fly%n"); + BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader); + System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length); + } else { + if ( DEBUG ) System.out.printf(" Taking BAQ from tag%n"); + // this overwrites the original qualities + calcBAQFromTag(read, true, false); + } + } + + return read; + } + + private static boolean stateIsIndel(int state) { + return (state & 3) != 0; + } + + private static int stateAlignedPosition(int state) { + return state >> 2; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index f7d7a434a..8cfbf6dd4 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -379,6 +379,10 @@ public class MathUtils { return array[minElementIndex(array)]; } + public static byte arrayMin(byte[] array) { + return array[minElementIndex(array)]; + } + public static int minElementIndex(double[] array) { if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); @@ -391,6 +395,18 @@ public class MathUtils { return minI; } + public static int minElementIndex(byte[] array) { + if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + + int minI = -1; + for ( int i = 0; i < array.length; i++ ) { + if ( minI == -1 || array[i] < array[minI] ) + minI = i; + } + + return minI; + } + public static int arrayMax(List array) { if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 8541f6fbf..64efe19b2 100644 --- a/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.BAQ; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; @@ -146,7 +147,8 @@ public class LocusIteratorByStateUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), false, - false + false, + BAQ.Mode.NONE, null // no BAQ ); } }