Better testing of BAQ. Now really handles soft clipped reads properly by doing an expensive copy operation :-( will need to be transformed to a ByteBuffer in the near future.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4807 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-12-08 17:37:00 +00:00
parent f1f01610f8
commit db55b2b0c6
3 changed files with 155 additions and 68 deletions

View File

@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.Argument;
@ -36,6 +37,9 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
@Argument(doc="only operates on reads with this name",required=false)
protected String readName = null;
@Argument(doc="If true, all differences are errors", required=false)
protected boolean strict = false;
@Argument(doc="prints info for each read", required=false)
protected boolean printEachRead = false;
@ -58,7 +62,7 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) {
byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag);
if (counter++ % 1000 == 0) out.printf("Checking read %s (%d)%n", read.getReadName(), counter);
if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter);
BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader);
boolean fail = false;
@ -70,31 +74,33 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
if ( baqFromTag[badi] != baq.bq[badi] ) {
if (MathUtils.arrayMin(read.getBaseQualities()) == 0) {
print = true;
fail = strict;
out.printf(" different, but Q0 base detected%n");
break;
}
else if (readHasSoftClip(read)) {
print = true;
fail = strict;
out.printf(" different, but soft clip detected%n");
break;
} else if (readHasDeletion(read)) {
print = true;
fail = strict;
out.printf(" different, but deletion detected%n");
break;
} else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) {
print = true;
print = fail = true;
out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual());
fail = true;
break;
} else {
fail = true; print = true;
print = fail = true;
break;
}
}
}
}
if ( printEachRead || ( print && ( alsoPrintWarnings || fail ) ) ) {
if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) {
byte[] pos = new byte[baq.bq.length];
for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i;
@ -113,6 +119,7 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag);
printQuals(" hmm quals: ", baq.bq);
out.printf(" read bases : %s%n", new String(read.getReadBases()));
out.println(Utils.dupString('-', 80));
}

View File

@ -7,6 +7,8 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
/*
The topology of the profile HMM:
@ -457,19 +459,31 @@ public class BAQ {
}
}
public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals) {
public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals, int queryStart, int queryEnd ) {
// note -- assumes ref is offset from the *CLIPPED* start
BAQCalculationResult baqResult = new BAQCalculationResult(query, quals, ref);
byte[] convSeq = bases2indices(baqResult.readBases);
byte[] convRef = bases2indices(baqResult.refBases);
hmm_glocal(convRef, convSeq, baqResult.rawQuals, baqResult.state, baqResult.bq);
//boolean clipped = queryStart > 0 || queryEnd < query.length; // we are clipping, need to handle
byte[] cseq = Arrays.copyOfRange(convSeq, queryStart, queryEnd );
byte[] cquals = Arrays.copyOfRange(baqResult.rawQuals, queryStart, queryEnd );
int[] cstate = Arrays.copyOfRange(baqResult.state, queryStart, queryEnd );
byte[] cbq = Arrays.copyOfRange(baqResult.bq, queryStart, queryEnd );
hmm_glocal(convRef, cseq, cquals, cstate, cbq);
System.arraycopy(cstate, 0, baqResult.state, queryStart, cstate.length);
System.arraycopy(cbq, 0, baqResult.bq, queryStart, cbq.length);
return baqResult;
}
// we need to bad ref by at least the bandwidth / 2 on either side
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities());
int queryStart = (int)(read.getAlignmentStart() - read.getUnclippedStart());
int queryEnd = (int)(read.getReadLength() - (read.getUnclippedEnd() - read.getAlignmentEnd()));
BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities(), queryStart, queryEnd);
// cap quals
int readI = 0, refI = 0;
@ -493,6 +507,8 @@ public class BAQ {
}
readI += l; refI += l;
break;
default:
throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
}
}

View File

@ -14,6 +14,7 @@ import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.Utils;
import java.io.File;
import java.util.Arrays;
@ -22,10 +23,7 @@ import java.util.ArrayList;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.*;
/**
* Basic unit test for GenomeLoc
@ -35,94 +33,160 @@ public class BAQUnitTest extends BaseTest {
private final int startChr = 1;
private final int numChr = 2;
private final int chrSize = 1000;
IndexedFastaSequenceFile fasta = null;
@BeforeMethod
public void before() {
header = ArtificialSAMUtils.createArtificialSamHeader(numChr, startChr, chrSize);
fasta = new IndexedFastaSequenceFile(new File(hg18Reference));
}
private class BAQTest {
String readBases, refBases;
byte[] quals, expected;
String cigar;
int refOffset;
int pos;
public BAQTest(String _refBases, String _readBases, String _quals, String _expected) {
this(0, -1, null, _readBases, _refBases, _quals, _expected);
}
public BAQTest(int refOffset, String _refBases, String _readBases, String _quals, String _expected) {
this(refOffset, -1, null, _refBases, _readBases, _quals, _expected);
}
public BAQTest(long pos, String cigar, String _readBases, String _quals, String _expected) {
this(0, pos, cigar, null, _readBases, _quals, _expected);
}
public BAQTest(int _refOffset, long _pos, String _cigar, String _refBases, String _readBases, String _quals, String _expected) {
refOffset = _refOffset;
pos = (int)_pos;
cigar = _cigar;
readBases = _readBases;
refBases = _refBases;
quals = new byte[_quals.getBytes().length];
expected = new byte[_quals.getBytes().length];
for ( int i = 0; i < quals.length; i++) {
quals[i] = (byte)(_quals.getBytes()[i] - 33);
expected[i] = (byte)(_expected.getBytes()[i] - 33);
}
}
public String toString() { return readBases; }
public SAMRecord createRead() {
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, pos > 0 ? pos + (refOffset > 0 ? refOffset : 0): 1, readBases.getBytes(), quals);
//if ( cigar != null ) read.setAlignmentEnd(readBases.getBytes().length + pos);
read.setCigarString( cigar == null ? String.format("%dM", quals.length) : cigar);
return read;
}
}
@DataProvider(name = "data")
public Object[][] createData1() {
List<Object[]> params = new ArrayList<Object[]>();
params.add(new Object[]{0,
"GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTTCTCCTGCTCCTGCGTGATCCGCTGCAG",
"GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTCCTCCTGCTCCTGCGTGATCCGCTGCAG", null,
"?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGIIHGGGIHHIIHIIHIIHGICCIGEII@IGIHCG",
"?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGII410..0HIIHIIHIIHGICCIGEII@IGIHCE"});
List<BAQTest> params = new ArrayList<BAQTest>();
params.add(new Object[]{0,
"GCTTTTTCTCCT",
"GCTTTTCCTCCT", null,
"IIHGGGIHHIIH",
"DI410..0HIID"});
params.add(new BAQTest("GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTTCTCCTGCTCCTGCGTGATCCGCTGCAG",
"GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTCCTCCTGCTCCTGCGTGATCCGCTGCAG",
"?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGIIHGGGIHHIIHIIHIIHGICCIGEII@IGIHCG",
"?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGII410..0HIIHIIHIIHGICCIGEII@IGIHCE"));
// params.add(new Object[]{"GTTTTTTG",
// "GTTCCTTG",
// "IIIIIIII",
// "III!!III"});
params.add(new BAQTest("GCTTTTTCTCCTCCTG",
"GCTTTTCCTCCTCCTG",
"IIHGGGIHHIIHHIIH",
"EI410..0HIIHHIIE"));
// big and complex, also does a cap from 3 to 4!
params.add(new Object[]{-3,
params.add(new BAQTest(-3, 9999810l, "49M1I126M1I20M1I25M",
"AAATTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATAG",
"TTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGTAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCCACCCACCACCACGCCTGGCCTAATTTTTTTGTATTTTTAGTAGAGA", "49M1I126M1I20M1I25M",
"TTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGTAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCCACCCACCACCACGCCTGGCCTAATTTTTTTGTATTTTTAGTAGAGA",
">IHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?B3BBC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?<B@BC;CBBBAB=;A>ACBABBBABBCA@@<?>>AAA<CA@AABBABCC?BB8@<@C<>5;<A5=A;>=64>???B>=6497<<;;<;>2?>BA@??A6<<A59",
">EHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?838BC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?<B@BC;CBBBAB=;A>ACBABBBABBCA@@<?>>AAA<CA@AABBABCC?BB8@<@%<>5;<A5=A;>=64>???B;86497<<;;<;>2?>BA@??A6<<A59"});
">EHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?838BC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?<B@BC;CBBBAB=;A>ACBABBBABBCA@@<?>>AAA<CA@AABBABCC?BB8@<@%<>5;<A5=A;>=64>???B;86497<<;;<;>2?>BA@??A6<<A59"));
// now changes
params.add(new Object[]{-3,
params.add(new BAQTest(-3, 9999966l, "36M",
"CCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCC",
"AGTAGCTGGGACTACAGGCACCCACCACCACGCCTG", null,
"AGTAGCTGGGACTACAGGCACCCACCACCACGCCTG",
"A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9",
"A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9"});
"A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9"));
// raw base qualities are low -- but they shouldn't be capped
params.add(new Object[]{-3,
params.add(new BAQTest(-3, 9999993l, "36M",
"CCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATA",
"CCACGCTTGGCAAAGTTTTCCGTACGTTTAGCCGAG", null,
"CCACGCTTGGCAAAGTTTTCCGTACGTTTAGCCGAG",
"33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8",
"33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8"});
"33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8"));
// soft clipping
// todo soft clip testing just doesn't work right now!
// params.add(new BAQTest(29, 10000109l, "29S190M",
// null, "GAAGGTTGAATCAAACCTTCGGTTCCAACGGATTACAGGTGTGAGCCACCGCGACCGGCCTGCTCAAGATAATTTTTAGGGCTAACTATGACATGAACCCCAAAATTCCTGTCCTCTAGATGGCAGAAACCAAGATAAAGTATCCCCACATGGCCACAAGGTTAAGCTCTTATGGACACAAAACAAGGCAGAGAAATGTCATTTGGCATTGGTTTCAGG",
// "3737088:858278273772:3<=;:?;5=9@>@?>@=<>8?>@=>>?>4=5>?=5====A==@?A@=@6@A><?B:A;:;>@A?>?AA>@?AA>A?>==?AAA@@A>=A<A>>A=?A>AA==@A?AA?>?AA?A@@C@:?A@<;::??AA==>@@?BB=<A?BA>>A>A?AB=???@?BBA@?BA==?A>A?BB=A:@?ABAB>>?ABB>8A@BAIGA",
// "3737088:858278273772:3<=;:?;5=9@>@?>@=<>8?>@=>>?>4=5>?=5====A==@?A@=@6@A><?B:A;:;>@A?>?AA>@?AA>A?>==?AAA@@A>=A<A>>A=?A>AA==@A?AA?>?AA?A@@C@:?A@<;::??AA==>@@?BB=<A?BA>>A>A?AB=???@?BBA@?BA==?A>A?BB=A:@?ABAB>>?ABB>8A@BAI>;"));
// params.add(new BAQTest(30, 10000373l, "30S69M1D2M",
// null, "TGAAATCCTGCCTTATAGTTCCCCTAAACCCACGTTCTATCCCCAGATACTCCCCTCTTCATTACAGAACAACAAAGAAAGACAAATTCTTAGCATCAATG",
// "###############################=89>B;6<;96*>.1799>++66=:=:8=<-.9>><;9<':-+;*+::=;8=;;.::<:;=/2=70<=?-",
// "###############################=89>B;6<;96*>.1799>++66=:=:8=<-.9>><;9<':-+;*+::=;8=;;.::<:;=/2=7000%%"));
// // capping test
// params.add(new Object[]{-3,
// "TTTAGGGCTAACTATGACATGAACCCCAAAA",
// "AGGGCTAACTATGACATGAACCCCA", null,
// "!CDDEEEDDDCEEEECBCA@A@.0'",
// "!%%%%&)DDDCEEEECBCA@A@.0'"});
// params.add(new BAQTest(5, 10000109l, "5S5M",
// "GAAGGTTGAA",
// null,
// "HHHHHHHHHH",
// "HHHHHHHHHE"));
return params.toArray(new Object[][]{});
// params.add(new BAQTest(10009480l, "102M1I18M1I16M1I43M1I10M1D9M1I7M1I7M1I16M1I9M1I8M1I14M2I18M",
// "AGAGATGGGGTTTCGCCATGTTGTCCAGGCTGGTCTTGAACTCCTGACCTCAAGTGATCTGCCCACCTCGGCCTCCCAAAGTGCTGGGATTACACGTGTGAAACCACCATGCCTGGTCTCTTAATTTTTCNGATTCTAATAAAATTACATTCTATTTGCTGAAAGNGTACTTTAGAGTTGAAAGAAAAAGAAAGGNGTGGAACTTCCCCTAGTAAACAAGGAAAAACNTCCATGTTATTTATTGGACCTTAAAAATAGTGAAACATCTTAAGAAAAAAAATCAATCCTA",
// "@HI@BA<?C@?CA>7>=AA>9@==??C???@?>:?BB@BA>B?=A@@<=B?AB???@@@@@?=?A==B@7<<?@>==>=<=>???>=@@A?<=B:5?413577/675;><;==@=<>>968;6;>????:#;=?>:3072077726/6;3719;9A=9;774771#30532676??=8::97<7144448/4425#65688821515986255/5601548355551#218>96/5/8<4/.2344/914/55553)1047;:30312:4:63556565631=:62610",
// "@HI@BA<?C@?CA>7>=AA>9@==??C???@?>:?BB@BA>B?=A@@<=B?AB???@@@@@?=?A==B@7<<?@>==>=<=>???>=@@A?<=B:5?413&!7/675;><;==@=<>>96!;6;>????:#;=?>:3!72077726/6;3719;9A=9;774771#30532676??=8::&!<7144448'$!25#65687421515986255/560!548355551#218>96!5/8<4/.2344/614(%!!53)1047;:30312:4:63556565631=:62610"));
List<Object[]> params2 = new ArrayList<Object[]>();
for ( BAQTest x : params ) params2.add(new Object[]{x});
return params2.toArray(new Object[][]{});
}
//
// todo -- add cigar strings to test, and create synthetic reads
//
@Test(dataProvider = "data", enabled = true)
public void testBAQ1(int offset, String _ref, String _query, String cigar, String _fastqQuals, String _expected) {
byte[] ref = _ref.getBytes();
byte[] query = _query.getBytes();
byte[] fastqQuals = _fastqQuals.getBytes();
byte[] expected = _expected.getBytes();
if ( cigar == null ) cigar = String.format("%dM", query.length);
public void testBAQWithProvidedReference(BAQTest test) {
if ( test.refBases != null ) {
testBAQ(test, false);
}
}
@Test(dataProvider = "data", enabled = true)
public void testBAQWithCigarAndRefLookup(BAQTest test) {
if ( test.cigar != null ) {
testBAQ(test, true);
}
}
public void testBAQ(BAQTest test, boolean lookupWithFasta) {
BAQ baqHMM = new BAQ(1e-3, 0.1, 7, (byte)4); // matches current samtools parameters
byte[] quals = new byte[fastqQuals.length];
for ( int i = 0; i < fastqQuals.length; i++) {
quals[i] = (byte)(fastqQuals[i] - 33);
expected[i] = (byte)(expected[i] - 33);
}
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, query, quals);
read.setCigarString(cigar);
BAQ.BAQCalculationResult result = baqHMM.calcBAQFromHMM(read, ref, offset);
SAMRecord read = test.createRead();
BAQ.BAQCalculationResult result;
if ( lookupWithFasta && test.cigar != null )
result = baqHMM.calcBAQFromHMM(read, fasta);
else
result = baqHMM.calcBAQFromHMM(read, test.refBases.getBytes(), test.refOffset);
for (int i = 0; i < quals.length; i++) {
//result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i);
Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality");
Assert.assertEquals(result.bq[i], expected[i], "Did not see the expected BAQ value");
}
ValidateBAQWalker.printQuals(System.out, "in-quals:", quals, false);
System.out.println(Utils.dupString('-', 40));
System.out.println("reads : " + new String(test.readBases));
ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false);
ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false);
for (int i = 0; i < test.quals.length; i++) {
//result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i);
Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality");
Assert.assertEquals(result.bq[i], test.expected[i], "Did not see the expected BAQ value at " + i);
}
}
}