diff --git a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java index afb306b6d..0c26d847d 100755 --- a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java @@ -316,6 +316,9 @@ public abstract class AbstractGenomeAnalysisEngine { protected void initializeDataSources() { logger.info("Strictness is " + argCollection.strictnessLevel); + // TODO -- REMOVE ME + BAQ.DEFAULT_GOP = argCollection.BAQGOP; + validateSuppliedReference(); referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile); diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 2eef79661..433d49e40 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.utils.interval.IntervalSetRule; @@ -154,6 +155,11 @@ public class GATKArgumentCollection { @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.NONE; + @Element(required = false) + @Hidden + @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="Gap open penalty. For testing purposes only", required = false) + public double BAQGOP = 1e-3; // todo remove me + /** * Gets the default downsampling method, returned if the user didn't specify any downsampling * method. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 368b717a7..0014b91fa 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -42,6 +42,7 @@ import java.io.PrintStream; * in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter * command line argument. */ +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @Requires({DataSource.READS, DataSource.REFERENCE}) public class PrintReadsWalker extends ReadWalker { /** an optional argument to dump the reads out to a BAM file */ diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java index 88669977b..204ad8b92 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java @@ -50,7 +50,7 @@ public class ValidateBAQWalker extends ReadWalker { BAQ baqHMM = null; // matches current samtools parameters public void initialize() { - baqHMM = new BAQ(bw, 0.1, 7, 0); + baqHMM = new BAQ(bw, 0.1, 7, (byte)0); } public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { @@ -81,6 +81,11 @@ public class ValidateBAQWalker extends ReadWalker { print = true; out.printf(" different, but deletion detected%n"); break; + } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) { + print = true; + out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual()); + fail = true; + break; } else { fail = true; print = true; break; @@ -96,6 +101,7 @@ public class ValidateBAQWalker extends ReadWalker { out.printf(" read length : %d%n", read.getReadLength()); out.printf(" read start : %d%n", read.getAlignmentStart()); out.printf(" cigar : %s%n", read.getCigarString()); + out.printf(" ref bases : %s%n", new String(baq.refBases)); out.printf(" read bases : %s%n", new String(read.getReadBases())); out.printf(" ref length : %d%n", baq.refBases.length); out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG)); @@ -136,11 +142,15 @@ public class ValidateBAQWalker extends ReadWalker { return false; } - private final void printQuals( String prefix, byte[] quals ) { + public final void printQuals( String prefix, byte[] quals ) { printQuals(prefix, quals, false); } - private final void printQuals( String prefix, byte[] quals, boolean asInt ) { + public final void printQuals( String prefix, byte[] quals, boolean asInt ) { + printQuals(out, prefix, quals, asInt); + } + + public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) { out.print(prefix); for ( int i = 0; i < quals.length; i++) { if ( asInt ) { diff --git a/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index d7d1fb1c6..4f10ef151 100644 --- a/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils.baq; import net.sf.samtools.SAMRecord; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMSequenceRecord; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -64,14 +63,20 @@ public class BAQ { qual2prob[i] = Math.pow(10, -i/10.); } - private double cd = 1e-3; // gap open probility [1e-3] + public static double DEFAULT_GOP = 1e-3; // todo -- make me final private + + public double cd = -1; // gap open probility [1e-3] private double ce = 0.1; // gap extension probability [0.1] private int cb = 7; // band width [7] + public byte getMinBaseQual() { + return minBaseQual; + } + /** * Any bases with Q < MIN_BASE_QUAL are raised up to this base quality */ - private int minBaseQual = 4; + private byte minBaseQual = 4; public double getGapOpenProb() { return cd; @@ -88,7 +93,9 @@ public class BAQ { /** * Use defaults for everything */ - public BAQ() { } + public BAQ() { + cd = DEFAULT_GOP; + } /** * Create a new HmmGlocal object with specified parameters @@ -98,7 +105,7 @@ public class BAQ { * @param b band width * @param minBaseQual All bases with Q < minBaseQual are up'd to this value */ - public BAQ(final double d, final double e, final int b, final int minBaseQual) { + public BAQ(final double d, final double e, final int b, final byte minBaseQual) { cd = d; ce = e; cb = b; this.minBaseQual = minBaseQual; } @@ -272,7 +279,7 @@ public class BAQ { if (state != null) state[i-1] = max_k; if (q != null) { k = (int)(-4.343 * Math.log(1. - max) + .499); - q[i-1] = (byte)(k > 100? 99 : k); + q[i-1] = (byte)(k > 100? 99 : (k < minBaseQual ? minBaseQual : k)); } //System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")"); } @@ -287,12 +294,12 @@ public class BAQ { // --------------------------------------------------------------------------------------------------------------- /** decode the bit encoded state array values */ - private static boolean stateIsIndel(int state) { + public static boolean stateIsIndel(int state) { return (state & 3) != 0; } /** decode the bit encoded state array values */ - private static int stateAlignedPosition(int state) { + public static int stateAlignedPosition(int state) { return state >> 2; } @@ -397,20 +404,22 @@ public class BAQ { public static class BAQCalculationResult { public byte[] refBases, rawQuals, readBases, bq; - public int refOffset; public int[] state; // todo -- optimization: use static growing arrays here - public BAQCalculationResult(SAMRecord read, byte[] ref, int refOffset) { + public BAQCalculationResult(SAMRecord read, byte[] ref) { + this(read.getBaseQualities(), read.getReadBases(), ref); + } + + public BAQCalculationResult(byte[] bases, byte[] quals, byte[] ref) { // prepares data for calculation - rawQuals = read.getBaseQualities(); - readBases = read.getReadBases(); + rawQuals = quals; + readBases = bases; // now actually prepare the data structures, and fire up the hmm bq = new byte[rawQuals.length]; state = new int[rawQuals.length]; this.refBases = ref; - this.refOffset = refOffset; } } @@ -448,14 +457,19 @@ public class BAQ { } } - // we need to bad ref by at least the bandwidth / 2 on either side - public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) { + + public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals) { // note -- assumes ref is offset from the *CLIPPED* start - BAQCalculationResult baqResult = new BAQCalculationResult(read, ref, refOffset); + BAQCalculationResult baqResult = new BAQCalculationResult(query, quals, ref); byte[] convSeq = bases2indices(baqResult.readBases); byte[] convRef = bases2indices(baqResult.refBases); - hmm_glocal(convRef, convSeq, baqResult.rawQuals, baqResult.state, baqResult.bq); + return baqResult; + } + + // we need to bad ref by at least the bandwidth / 2 on either side + public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) { + BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities()); // cap quals int readI = 0, refI = 0; @@ -474,14 +488,8 @@ public class BAQ { case D : refI += l; break; case M : for (int i = readI; i < readI + l; i++) { - boolean isIndel = stateIsIndel(baqResult.state[i]); - int pos = stateAlignedPosition(baqResult.state[i]); int expectedPos = refI - refOffset + (i - readI); - if ( isIndel || pos != expectedPos ) - // we are an indel or we don't align to our best current position - baqResult.bq[i] = 0; - else - baqResult.bq[i] = baqResult.bq[i] < baqResult.rawQuals[i] ? baqResult.bq[i] : baqResult.rawQuals[i]; + baqResult.bq[i] = capBaseByBAQ( baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos ); } readI += l; refI += l; break; @@ -491,6 +499,18 @@ public class BAQ { return baqResult; } + public byte capBaseByBAQ( byte oq, byte bq, int state, int expectedPos ) { + byte b; + boolean isIndel = stateIsIndel(state); + int pos = stateAlignedPosition(state); + if ( isIndel || pos != expectedPos ) // we are an indel or we don't align to our best current position + b = minBaseQual; // just take b = minBaseQuality + else + b = bq < oq ? bq : oq; + + return b; + } + /** * Modifies read in place so that the base quality scores are capped by the BAQ calculation. Uses the BAQ * tag if present already and alwaysRecalculate is false, otherwise fires up the HMM and does the BAQ on the fly diff --git a/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java new file mode 100644 index 000000000..499a0e210 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java @@ -0,0 +1,128 @@ +// our package +package org.broadinstitute.sting.utils.baq; + + +// the imports for unit testing. + + +import org.testng.Assert; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.testng.annotations.BeforeMethod; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; + +import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.ReferenceSequence; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMFileHeader; + +/** + * Basic unit test for GenomeLoc + */ +public class BAQUnitTest extends BaseTest { + private SAMFileHeader header; + private final int startChr = 1; + private final int numChr = 2; + private final int chrSize = 1000; + + @BeforeMethod + public void before() { + header = ArtificialSAMUtils.createArtificialSamHeader(numChr, startChr, chrSize); + } + + + @DataProvider(name = "data") + public Object[][] createData1() { + List params = new ArrayList(); + params.add(new Object[]{0, + "GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTTCTCCTGCTCCTGCGTGATCCGCTGCAG", + "GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTCCTCCTGCTCCTGCGTGATCCGCTGCAG", null, + "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGIIHGGGIHHIIHIIHIIHGICCIGEII@IGIHCG", + "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGII410..0HIIHIIHIIHGICCIGEII@IGIHCE"}); + + params.add(new Object[]{0, + "GCTTTTTCTCCT", + "GCTTTTCCTCCT", null, + "IIHGGGIHHIIH", + "DI410..0HIID"}); + +// params.add(new Object[]{"GTTTTTTG", +// "GTTCCTTG", +// "IIIIIIII", +// "III!!III"}); + + // big and complex, also does a cap from 3 to 4! + params.add(new Object[]{-3, + "AAATTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATAG", + "TTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGTAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCCACCCACCACCACGCCTGGCCTAATTTTTTTGTATTTTTAGTAGAGA", "49M1I126M1I20M1I25M", + ">IHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?B3BBC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?ACBABBBABBCA@@>AAA5;=64>???B>=6497<<;;<;>2?>BA@??A6<EHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?838BC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?ACBABBBABBCA@@>AAA5;=64>???B;86497<<;;<;>2?>BA@??A6<>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9", + "A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9"}); + + // raw base qualities are low -- but they shouldn't be capped + params.add(new Object[]{-3, + "CCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATA", + "CCACGCTTGGCAAAGTTTTCCGTACGTTTAGCCGAG", null, + "33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8", + "33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8"}); + + +// // capping test +// params.add(new Object[]{-3, +// "TTTAGGGCTAACTATGACATGAACCCCAAAA", +// "AGGGCTAACTATGACATGAACCCCA", null, +// "!CDDEEEDDDCEEEECBCA@A@.0'", +// "!%%%%&)DDDCEEEECBCA@A@.0'"}); + + return params.toArray(new Object[][]{}); + } + + // + // todo -- add cigar strings to test, and create synthetic reads + // + @Test(dataProvider = "data", enabled = true) + public void testBAQ1(int offset, String _ref, String _query, String cigar, String _fastqQuals, String _expected) { + byte[] ref = _ref.getBytes(); + byte[] query = _query.getBytes(); + byte[] fastqQuals = _fastqQuals.getBytes(); + byte[] expected = _expected.getBytes(); + if ( cigar == null ) cigar = String.format("%dM", query.length); + + BAQ baqHMM = new BAQ(1e-3, 0.1, 7, (byte)4); // matches current samtools parameters + byte[] quals = new byte[fastqQuals.length]; + for ( int i = 0; i < fastqQuals.length; i++) { + quals[i] = (byte)(fastqQuals[i] - 33); + expected[i] = (byte)(expected[i] - 33); + } + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, query, quals); + read.setCigarString(cigar); + BAQ.BAQCalculationResult result = baqHMM.calcBAQFromHMM(read, ref, offset); + + for (int i = 0; i < quals.length; i++) { + //result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i); + Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality"); + Assert.assertEquals(result.bq[i], expected[i], "Did not see the expected BAQ value"); + } + + ValidateBAQWalker.printQuals(System.out, "in-quals:", quals, false); + ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false); + } +} \ No newline at end of file