diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java index 204ad8b92..fcb8346c6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.Argument; @@ -36,6 +37,9 @@ public class ValidateBAQWalker extends ReadWalker { @Argument(doc="only operates on reads with this name",required=false) protected String readName = null; + @Argument(doc="If true, all differences are errors", required=false) + protected boolean strict = false; + @Argument(doc="prints info for each read", required=false) protected boolean printEachRead = false; @@ -58,7 +62,7 @@ public class ValidateBAQWalker extends ReadWalker { if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) { byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); - if (counter++ % 1000 == 0) out.printf("Checking read %s (%d)%n", read.getReadName(), counter); + if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter); BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader); boolean fail = false; @@ -70,31 +74,33 @@ public class ValidateBAQWalker extends ReadWalker { if ( baqFromTag[badi] != baq.bq[badi] ) { if (MathUtils.arrayMin(read.getBaseQualities()) == 0) { print = true; + fail = strict; out.printf(" different, but Q0 base detected%n"); break; } else if (readHasSoftClip(read)) { print = true; + fail = strict; out.printf(" different, but soft clip detected%n"); break; } else if (readHasDeletion(read)) { print = true; + fail = strict; out.printf(" different, but deletion detected%n"); break; } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) { - print = true; + print = fail = true; out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual()); - fail = true; break; } else { - fail = true; print = true; + print = fail = true; break; } } } } - if ( printEachRead || ( print && ( alsoPrintWarnings || fail ) ) ) { + if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) { byte[] pos = new byte[baq.bq.length]; for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i; @@ -113,6 +119,7 @@ public class ValidateBAQWalker extends ReadWalker { if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag); printQuals(" hmm quals: ", baq.bq); out.printf(" read bases : %s%n", new String(read.getReadBases())); + out.println(Utils.dupString('-', 80)); } diff --git a/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index 4f10ef151..77976cc76 100644 --- a/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -7,6 +7,8 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.Arrays; + /* The topology of the profile HMM: @@ -457,19 +459,31 @@ public class BAQ { } } - - public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals) { + public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals, int queryStart, int queryEnd ) { // note -- assumes ref is offset from the *CLIPPED* start BAQCalculationResult baqResult = new BAQCalculationResult(query, quals, ref); byte[] convSeq = bases2indices(baqResult.readBases); byte[] convRef = bases2indices(baqResult.refBases); - hmm_glocal(convRef, convSeq, baqResult.rawQuals, baqResult.state, baqResult.bq); + + //boolean clipped = queryStart > 0 || queryEnd < query.length; // we are clipping, need to handle + byte[] cseq = Arrays.copyOfRange(convSeq, queryStart, queryEnd ); + byte[] cquals = Arrays.copyOfRange(baqResult.rawQuals, queryStart, queryEnd ); + int[] cstate = Arrays.copyOfRange(baqResult.state, queryStart, queryEnd ); + byte[] cbq = Arrays.copyOfRange(baqResult.bq, queryStart, queryEnd ); + + hmm_glocal(convRef, cseq, cquals, cstate, cbq); + + System.arraycopy(cstate, 0, baqResult.state, queryStart, cstate.length); + System.arraycopy(cbq, 0, baqResult.bq, queryStart, cbq.length); + return baqResult; } // we need to bad ref by at least the bandwidth / 2 on either side public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) { - BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities()); + int queryStart = (int)(read.getAlignmentStart() - read.getUnclippedStart()); + int queryEnd = (int)(read.getReadLength() - (read.getUnclippedEnd() - read.getAlignmentEnd())); + BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd); // cap quals int readI = 0, refI = 0; @@ -493,6 +507,8 @@ public class BAQ { } readI += l; refI += l; break; + default: + throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); } } diff --git a/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java index 499a0e210..c4e0a65bb 100644 --- a/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java @@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.Utils; import java.io.File; import java.util.Arrays; @@ -22,10 +23,7 @@ import java.util.ArrayList; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; -import net.sf.samtools.SAMSequenceRecord; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.*; /** * Basic unit test for GenomeLoc @@ -35,94 +33,160 @@ public class BAQUnitTest extends BaseTest { private final int startChr = 1; private final int numChr = 2; private final int chrSize = 1000; + IndexedFastaSequenceFile fasta = null; @BeforeMethod public void before() { header = ArtificialSAMUtils.createArtificialSamHeader(numChr, startChr, chrSize); + fasta = new IndexedFastaSequenceFile(new File(hg18Reference)); + } + + private class BAQTest { + String readBases, refBases; + byte[] quals, expected; + String cigar; + int refOffset; + int pos; + + public BAQTest(String _refBases, String _readBases, String _quals, String _expected) { + this(0, -1, null, _readBases, _refBases, _quals, _expected); + } + + public BAQTest(int refOffset, String _refBases, String _readBases, String _quals, String _expected) { + this(refOffset, -1, null, _refBases, _readBases, _quals, _expected); + } + + public BAQTest(long pos, String cigar, String _readBases, String _quals, String _expected) { + this(0, pos, cigar, null, _readBases, _quals, _expected); + } + + + public BAQTest(int _refOffset, long _pos, String _cigar, String _refBases, String _readBases, String _quals, String _expected) { + refOffset = _refOffset; + pos = (int)_pos; + cigar = _cigar; + readBases = _readBases; + refBases = _refBases; + + quals = new byte[_quals.getBytes().length]; + expected = new byte[_quals.getBytes().length]; + for ( int i = 0; i < quals.length; i++) { + quals[i] = (byte)(_quals.getBytes()[i] - 33); + expected[i] = (byte)(_expected.getBytes()[i] - 33); + } + } + + public String toString() { return readBases; } + + public SAMRecord createRead() { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, pos > 0 ? pos + (refOffset > 0 ? refOffset : 0): 1, readBases.getBytes(), quals); + //if ( cigar != null ) read.setAlignmentEnd(readBases.getBytes().length + pos); + read.setCigarString( cigar == null ? String.format("%dM", quals.length) : cigar); + return read; + } } @DataProvider(name = "data") public Object[][] createData1() { - List params = new ArrayList(); - params.add(new Object[]{0, - "GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTTCTCCTGCTCCTGCGTGATCCGCTGCAG", - "GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTCCTCCTGCTCCTGCGTGATCCGCTGCAG", null, - "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGIIHGGGIHHIIHIIHIIHGICCIGEII@IGIHCG", - "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGII410..0HIIHIIHIIHGICCIGEII@IGIHCE"}); + List params = new ArrayList(); - params.add(new Object[]{0, - "GCTTTTTCTCCT", - "GCTTTTCCTCCT", null, - "IIHGGGIHHIIH", - "DI410..0HIID"}); + params.add(new BAQTest("GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTTCTCCTGCTCCTGCGTGATCCGCTGCAG", + "GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTCCTCCTGCTCCTGCGTGATCCGCTGCAG", + "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGIIHGGGIHHIIHIIHIIHGICCIGEII@IGIHCG", + "?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGII410..0HIIHIIHIIHGICCIGEII@IGIHCE")); -// params.add(new Object[]{"GTTTTTTG", -// "GTTCCTTG", -// "IIIIIIII", -// "III!!III"}); + params.add(new BAQTest("GCTTTTTCTCCTCCTG", + "GCTTTTCCTCCTCCTG", + "IIHGGGIHHIIHHIIH", + "EI410..0HIIHHIIE")); // big and complex, also does a cap from 3 to 4! - params.add(new Object[]{-3, + params.add(new BAQTest(-3, 9999810l, "49M1I126M1I20M1I25M", "AAATTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATAG", - "TTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGTAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCCACCCACCACCACGCCTGGCCTAATTTTTTTGTATTTTTAGTAGAGA", "49M1I126M1I20M1I25M", + "TTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGTAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCCACCCACCACCACGCCTGGCCTAATTTTTTTGTATTTTTAGTAGAGA", ">IHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?B3BBC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?ACBABBBABBCA@@>AAA5;=64>???B>=6497<<;;<;>2?>BA@??A6<EHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?838BC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?ACBABBBABBCA@@>AAA5;=64>???B;86497<<;;<;>2?>BA@??A6<EHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?838BC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?ACBABBBABBCA@@>AAA5;=64>???B;86497<<;;<;>2?>BA@??A6<>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9", - "A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9"}); + "A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9")); // raw base qualities are low -- but they shouldn't be capped - params.add(new Object[]{-3, + params.add(new BAQTest(-3, 9999993l, "36M", "CCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATA", - "CCACGCTTGGCAAAGTTTTCCGTACGTTTAGCCGAG", null, + "CCACGCTTGGCAAAGTTTTCCGTACGTTTAGCCGAG", "33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8", - "33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8"}); + "33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8")); + + // soft clipping + // todo soft clip testing just doesn't work right now! + +// params.add(new BAQTest(29, 10000109l, "29S190M", +// null, "GAAGGTTGAATCAAACCTTCGGTTCCAACGGATTACAGGTGTGAGCCACCGCGACCGGCCTGCTCAAGATAATTTTTAGGGCTAACTATGACATGAACCCCAAAATTCCTGTCCTCTAGATGGCAGAAACCAAGATAAAGTATCCCCACATGGCCACAAGGTTAAGCTCTTATGGACACAAAACAAGGCAGAGAAATGTCATTTGGCATTGGTTTCAGG", +// "3737088:858278273772:3<=;:?;5=9@>@?>@=<>8?>@=>>?>4=5>?=5====A==@?A@=@6@A>@A?>?AA>@?AA>A?>==?AAA@@A>=A>A=?A>AA==@A?AA?>?AA?A@@C@:?A@<;::??AA==>@@?BB=>A>A?AB=???@?BBA@?BA==?A>A?BB=A:@?ABAB>>?ABB>8A@BAIGA", +// "3737088:858278273772:3<=;:?;5=9@>@?>@=<>8?>@=>>?>4=5>?=5====A==@?A@=@6@A>@A?>?AA>@?AA>A?>==?AAA@@A>=A>A=?A>AA==@A?AA?>?AA?A@@C@:?A@<;::??AA==>@@?BB=>A>A?AB=???@?BBA@?BA==?A>A?BB=A:@?ABAB>>?ABB>8A@BAI>;")); + +// params.add(new BAQTest(30, 10000373l, "30S69M1D2M", +// null, "TGAAATCCTGCCTTATAGTTCCCCTAAACCCACGTTCTATCCCCAGATACTCCCCTCTTCATTACAGAACAACAAAGAAAGACAAATTCTTAGCATCAATG", +// "###############################=89>B;6<;96*>.1799>++66=:=:8=<-.9>><;9<':-+;*+::=;8=;;.::<:;=/2=70<=?-", +// "###############################=89>B;6<;96*>.1799>++66=:=:8=<-.9>><;9<':-+;*+::=;8=;;.::<:;=/2=7000%%")); -// // capping test -// params.add(new Object[]{-3, -// "TTTAGGGCTAACTATGACATGAACCCCAAAA", -// "AGGGCTAACTATGACATGAACCCCA", null, -// "!CDDEEEDDDCEEEECBCA@A@.0'", -// "!%%%%&)DDDCEEEECBCA@A@.0'"}); +// params.add(new BAQTest(5, 10000109l, "5S5M", +// "GAAGGTTGAA", +// null, +// "HHHHHHHHHH", +// "HHHHHHHHHE")); - return params.toArray(new Object[][]{}); +// params.add(new BAQTest(10009480l, "102M1I18M1I16M1I43M1I10M1D9M1I7M1I7M1I16M1I9M1I8M1I14M2I18M", +// "AGAGATGGGGTTTCGCCATGTTGTCCAGGCTGGTCTTGAACTCCTGACCTCAAGTGATCTGCCCACCTCGGCCTCCCAAAGTGCTGGGATTACACGTGTGAAACCACCATGCCTGGTCTCTTAATTTTTCNGATTCTAATAAAATTACATTCTATTTGCTGAAAGNGTACTTTAGAGTTGAAAGAAAAAGAAAGGNGTGGAACTTCCCCTAGTAAACAAGGAAAAACNTCCATGTTATTTATTGGACCTTAAAAATAGTGAAACATCTTAAGAAAAAAAATCAATCCTA", +// "@HI@BA7>=AA>9@==??C???@?>:?BB@BA>B?=A@@<=B?AB???@@@@@?=?A==B@7<==>=<=>???>=@@A?<=B:5?413577/675;><;==@=<>>968;6;>????:#;=?>:3072077726/6;3719;9A=9;774771#30532676??=8::97<7144448/4425#65688821515986255/5601548355551#218>96/5/8<4/.2344/914/55553)1047;:30312:4:63556565631=:62610", +// "@HI@BA7>=AA>9@==??C???@?>:?BB@BA>B?=A@@<=B?AB???@@@@@?=?A==B@7<==>=<=>???>=@@A?<=B:5?413&!7/675;><;==@=<>>96!;6;>????:#;=?>:3!72077726/6;3719;9A=9;774771#30532676??=8::&!<7144448'$!25#65687421515986255/560!548355551#218>96!5/8<4/.2344/614(%!!53)1047;:30312:4:63556565631=:62610")); + + List params2 = new ArrayList(); + for ( BAQTest x : params ) params2.add(new Object[]{x}); + return params2.toArray(new Object[][]{}); } - // - // todo -- add cigar strings to test, and create synthetic reads - // + + @Test(dataProvider = "data", enabled = true) - public void testBAQ1(int offset, String _ref, String _query, String cigar, String _fastqQuals, String _expected) { - byte[] ref = _ref.getBytes(); - byte[] query = _query.getBytes(); - byte[] fastqQuals = _fastqQuals.getBytes(); - byte[] expected = _expected.getBytes(); - if ( cigar == null ) cigar = String.format("%dM", query.length); + public void testBAQWithProvidedReference(BAQTest test) { + if ( test.refBases != null ) { + testBAQ(test, false); + } + } + @Test(dataProvider = "data", enabled = true) + public void testBAQWithCigarAndRefLookup(BAQTest test) { + if ( test.cigar != null ) { + testBAQ(test, true); + } + } + + public void testBAQ(BAQTest test, boolean lookupWithFasta) { BAQ baqHMM = new BAQ(1e-3, 0.1, 7, (byte)4); // matches current samtools parameters - byte[] quals = new byte[fastqQuals.length]; - for ( int i = 0; i < fastqQuals.length; i++) { - quals[i] = (byte)(fastqQuals[i] - 33); - expected[i] = (byte)(expected[i] - 33); - } - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, query, quals); - read.setCigarString(cigar); - BAQ.BAQCalculationResult result = baqHMM.calcBAQFromHMM(read, ref, offset); + SAMRecord read = test.createRead(); + BAQ.BAQCalculationResult result; + if ( lookupWithFasta && test.cigar != null ) + result = baqHMM.calcBAQFromHMM(read, fasta); + else + result = baqHMM.calcBAQFromHMM(read, test.refBases.getBytes(), test.refOffset); - for (int i = 0; i < quals.length; i++) { - //result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i); - Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality"); - Assert.assertEquals(result.bq[i], expected[i], "Did not see the expected BAQ value"); - } - - ValidateBAQWalker.printQuals(System.out, "in-quals:", quals, false); + System.out.println(Utils.dupString('-', 40)); + System.out.println("reads : " + new String(test.readBases)); + ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false); ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false); + for (int i = 0; i < test.quals.length; i++) { + //result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i); + Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality"); + Assert.assertEquals(result.bq[i], test.expected[i], "Did not see the expected BAQ value at " + i); + } + } } \ No newline at end of file