Hidden command line option to control BAQ gap open penalty for testing by me and eric. ValidateBAQWalker has misc. useful improvements. PrintReads now adds BAQ tags on output, if requested.
BAQ has generally useful improvements. Refactor code to make it easier for BAQUnitTest to run. minBaseQuality enforced on output, as well as input now. Added BAQUnitTest that checks that the BAQ calculation is performing as expected. Still needs to be expanded significantly. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4804 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1b6bec8e6b
commit
16e1bbd380
|
|
@ -316,6 +316,9 @@ public abstract class AbstractGenomeAnalysisEngine {
|
|||
protected void initializeDataSources() {
|
||||
logger.info("Strictness is " + argCollection.strictnessLevel);
|
||||
|
||||
// TODO -- REMOVE ME
|
||||
BAQ.DEFAULT_GOP = argCollection.BAQGOP;
|
||||
|
||||
validateSuppliedReference();
|
||||
referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile);
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
|
|
@ -154,6 +155,11 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false)
|
||||
public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.NONE;
|
||||
|
||||
@Element(required = false)
|
||||
@Hidden
|
||||
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="Gap open penalty. For testing purposes only", required = false)
|
||||
public double BAQGOP = 1e-3; // todo remove me
|
||||
|
||||
/**
|
||||
* Gets the default downsampling method, returned if the user didn't specify any downsampling
|
||||
* method.
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ import java.io.PrintStream;
|
|||
* in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter
|
||||
* command line argument.
|
||||
*/
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
/** an optional argument to dump the reads out to a BAM file */
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
|
|||
BAQ baqHMM = null; // matches current samtools parameters
|
||||
|
||||
public void initialize() {
|
||||
baqHMM = new BAQ(bw, 0.1, 7, 0);
|
||||
baqHMM = new BAQ(bw, 0.1, 7, (byte)0);
|
||||
}
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
||||
|
|
@ -81,6 +81,11 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
|
|||
print = true;
|
||||
out.printf(" different, but deletion detected%n");
|
||||
break;
|
||||
} else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) {
|
||||
print = true;
|
||||
out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual());
|
||||
fail = true;
|
||||
break;
|
||||
} else {
|
||||
fail = true; print = true;
|
||||
break;
|
||||
|
|
@ -96,6 +101,7 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
|
|||
out.printf(" read length : %d%n", read.getReadLength());
|
||||
out.printf(" read start : %d%n", read.getAlignmentStart());
|
||||
out.printf(" cigar : %s%n", read.getCigarString());
|
||||
out.printf(" ref bases : %s%n", new String(baq.refBases));
|
||||
out.printf(" read bases : %s%n", new String(read.getReadBases()));
|
||||
out.printf(" ref length : %d%n", baq.refBases.length);
|
||||
out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
|
||||
|
|
@ -136,11 +142,15 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
|
|||
return false;
|
||||
}
|
||||
|
||||
private final void printQuals( String prefix, byte[] quals ) {
|
||||
public final void printQuals( String prefix, byte[] quals ) {
|
||||
printQuals(prefix, quals, false);
|
||||
}
|
||||
|
||||
private final void printQuals( String prefix, byte[] quals, boolean asInt ) {
|
||||
public final void printQuals( String prefix, byte[] quals, boolean asInt ) {
|
||||
printQuals(out, prefix, quals, asInt);
|
||||
}
|
||||
|
||||
public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
|
||||
out.print(prefix);
|
||||
for ( int i = 0; i < quals.length; i++) {
|
||||
if ( asInt ) {
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils.baq;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -64,14 +63,20 @@ public class BAQ {
|
|||
qual2prob[i] = Math.pow(10, -i/10.);
|
||||
}
|
||||
|
||||
private double cd = 1e-3; // gap open probility [1e-3]
|
||||
public static double DEFAULT_GOP = 1e-3; // todo -- make me final private
|
||||
|
||||
public double cd = -1; // gap open probility [1e-3]
|
||||
private double ce = 0.1; // gap extension probability [0.1]
|
||||
private int cb = 7; // band width [7]
|
||||
|
||||
public byte getMinBaseQual() {
|
||||
return minBaseQual;
|
||||
}
|
||||
|
||||
/**
|
||||
* Any bases with Q < MIN_BASE_QUAL are raised up to this base quality
|
||||
*/
|
||||
private int minBaseQual = 4;
|
||||
private byte minBaseQual = 4;
|
||||
|
||||
public double getGapOpenProb() {
|
||||
return cd;
|
||||
|
|
@ -88,7 +93,9 @@ public class BAQ {
|
|||
/**
|
||||
* Use defaults for everything
|
||||
*/
|
||||
public BAQ() { }
|
||||
public BAQ() {
|
||||
cd = DEFAULT_GOP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new HmmGlocal object with specified parameters
|
||||
|
|
@ -98,7 +105,7 @@ public class BAQ {
|
|||
* @param b band width
|
||||
* @param minBaseQual All bases with Q < minBaseQual are up'd to this value
|
||||
*/
|
||||
public BAQ(final double d, final double e, final int b, final int minBaseQual) {
|
||||
public BAQ(final double d, final double e, final int b, final byte minBaseQual) {
|
||||
cd = d; ce = e; cb = b; this.minBaseQual = minBaseQual;
|
||||
}
|
||||
|
||||
|
|
@ -272,7 +279,7 @@ public class BAQ {
|
|||
if (state != null) state[i-1] = max_k;
|
||||
if (q != null) {
|
||||
k = (int)(-4.343 * Math.log(1. - max) + .499);
|
||||
q[i-1] = (byte)(k > 100? 99 : k);
|
||||
q[i-1] = (byte)(k > 100? 99 : (k < minBaseQual ? minBaseQual : k));
|
||||
}
|
||||
//System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")");
|
||||
}
|
||||
|
|
@ -287,12 +294,12 @@ public class BAQ {
|
|||
// ---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/** decode the bit encoded state array values */
|
||||
private static boolean stateIsIndel(int state) {
|
||||
public static boolean stateIsIndel(int state) {
|
||||
return (state & 3) != 0;
|
||||
}
|
||||
|
||||
/** decode the bit encoded state array values */
|
||||
private static int stateAlignedPosition(int state) {
|
||||
public static int stateAlignedPosition(int state) {
|
||||
return state >> 2;
|
||||
}
|
||||
|
||||
|
|
@ -397,20 +404,22 @@ public class BAQ {
|
|||
|
||||
public static class BAQCalculationResult {
|
||||
public byte[] refBases, rawQuals, readBases, bq;
|
||||
public int refOffset;
|
||||
public int[] state;
|
||||
|
||||
// todo -- optimization: use static growing arrays here
|
||||
public BAQCalculationResult(SAMRecord read, byte[] ref, int refOffset) {
|
||||
public BAQCalculationResult(SAMRecord read, byte[] ref) {
|
||||
this(read.getBaseQualities(), read.getReadBases(), ref);
|
||||
}
|
||||
|
||||
public BAQCalculationResult(byte[] bases, byte[] quals, byte[] ref) {
|
||||
// prepares data for calculation
|
||||
rawQuals = read.getBaseQualities();
|
||||
readBases = read.getReadBases();
|
||||
rawQuals = quals;
|
||||
readBases = bases;
|
||||
|
||||
// now actually prepare the data structures, and fire up the hmm
|
||||
bq = new byte[rawQuals.length];
|
||||
state = new int[rawQuals.length];
|
||||
this.refBases = ref;
|
||||
this.refOffset = refOffset;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -448,14 +457,19 @@ public class BAQ {
|
|||
}
|
||||
}
|
||||
|
||||
// we need to bad ref by at least the bandwidth / 2 on either side
|
||||
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
|
||||
|
||||
public BAQCalculationResult calcBAQFromHMM(byte[] ref, byte[] query, byte[] quals) {
|
||||
// note -- assumes ref is offset from the *CLIPPED* start
|
||||
BAQCalculationResult baqResult = new BAQCalculationResult(read, ref, refOffset);
|
||||
BAQCalculationResult baqResult = new BAQCalculationResult(query, quals, ref);
|
||||
byte[] convSeq = bases2indices(baqResult.readBases);
|
||||
byte[] convRef = bases2indices(baqResult.refBases);
|
||||
|
||||
hmm_glocal(convRef, convSeq, baqResult.rawQuals, baqResult.state, baqResult.bq);
|
||||
return baqResult;
|
||||
}
|
||||
|
||||
// we need to bad ref by at least the bandwidth / 2 on either side
|
||||
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
|
||||
BAQCalculationResult baqResult = calcBAQFromHMM(ref, read.getReadBases(), read.getBaseQualities());
|
||||
|
||||
// cap quals
|
||||
int readI = 0, refI = 0;
|
||||
|
|
@ -474,14 +488,8 @@ public class BAQ {
|
|||
case D : refI += l; break;
|
||||
case M :
|
||||
for (int i = readI; i < readI + l; i++) {
|
||||
boolean isIndel = stateIsIndel(baqResult.state[i]);
|
||||
int pos = stateAlignedPosition(baqResult.state[i]);
|
||||
int expectedPos = refI - refOffset + (i - readI);
|
||||
if ( isIndel || pos != expectedPos )
|
||||
// we are an indel or we don't align to our best current position
|
||||
baqResult.bq[i] = 0;
|
||||
else
|
||||
baqResult.bq[i] = baqResult.bq[i] < baqResult.rawQuals[i] ? baqResult.bq[i] : baqResult.rawQuals[i];
|
||||
baqResult.bq[i] = capBaseByBAQ( baqResult.rawQuals[i], baqResult.bq[i], baqResult.state[i], expectedPos );
|
||||
}
|
||||
readI += l; refI += l;
|
||||
break;
|
||||
|
|
@ -491,6 +499,18 @@ public class BAQ {
|
|||
return baqResult;
|
||||
}
|
||||
|
||||
public byte capBaseByBAQ( byte oq, byte bq, int state, int expectedPos ) {
|
||||
byte b;
|
||||
boolean isIndel = stateIsIndel(state);
|
||||
int pos = stateAlignedPosition(state);
|
||||
if ( isIndel || pos != expectedPos ) // we are an indel or we don't align to our best current position
|
||||
b = minBaseQual; // just take b = minBaseQuality
|
||||
else
|
||||
b = bq < oq ? bq : oq;
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
/**
|
||||
* Modifies read in place so that the base quality scores are capped by the BAQ calculation. Uses the BAQ
|
||||
* tag if present already and alwaysRecalculate is false, otherwise fires up the HMM and does the BAQ on the fly
|
||||
|
|
|
|||
|
|
@ -0,0 +1,128 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.utils.baq;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
|
||||
/**
|
||||
* Basic unit test for GenomeLoc
|
||||
*/
|
||||
public class BAQUnitTest extends BaseTest {
|
||||
private SAMFileHeader header;
|
||||
private final int startChr = 1;
|
||||
private final int numChr = 2;
|
||||
private final int chrSize = 1000;
|
||||
|
||||
@BeforeMethod
|
||||
public void before() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(numChr, startChr, chrSize);
|
||||
}
|
||||
|
||||
|
||||
@DataProvider(name = "data")
|
||||
public Object[][] createData1() {
|
||||
List<Object[]> params = new ArrayList<Object[]>();
|
||||
params.add(new Object[]{0,
|
||||
"GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTTCTCCTGCTCCTGCGTGATCCGCTGCAG",
|
||||
"GCTGCTCCTGGTACTGCTGGATGAGGGCCTCGATGAAGCTAAGCTTTTCCTCCTGCTCCTGCGTGATCCGCTGCAG", null,
|
||||
"?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGIIHGGGIHHIIHIIHIIHGICCIGEII@IGIHCG",
|
||||
"?BACCBDDDFFBCFFHHFIHFEIFHIGHHGHBFEIFGIIGEGII410..0HIIHIIHIIHGICCIGEII@IGIHCE"});
|
||||
|
||||
params.add(new Object[]{0,
|
||||
"GCTTTTTCTCCT",
|
||||
"GCTTTTCCTCCT", null,
|
||||
"IIHGGGIHHIIH",
|
||||
"DI410..0HIID"});
|
||||
|
||||
// params.add(new Object[]{"GTTTTTTG",
|
||||
// "GTTCCTTG",
|
||||
// "IIIIIIII",
|
||||
// "III!!III"});
|
||||
|
||||
// big and complex, also does a cap from 3 to 4!
|
||||
params.add(new Object[]{-3,
|
||||
"AAATTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATAG",
|
||||
"TTCAAGATTTCAAAGGCTCTTAACTGCTCAAGATAATTTTTTTTTTTTGTAGACAGAGTCTTGCTGTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCCACCCACCACCACGCCTGGCCTAATTTTTTTGTATTTTTAGTAGAGA", "49M1I126M1I20M1I25M",
|
||||
">IHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?B3BBC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?<B@BC;CBBBAB=;A>ACBABBBABBCA@@<?>>AAA<CA@AABBABCC?BB8@<@C<>5;<A5=A;>=64>???B>=6497<<;;<;>2?>BA@??A6<<A59",
|
||||
">EHFECEBDBBCBCABABAADBD?AABBACEABABC?>?B>@A@@>A?838BC?CBDBAABBBBBAABAABBABDACCCBCDAACBCBABBB:ABDBACBBDCCCCABCDCCBCC@@;?<B@BC;CBBBAB=;A>ACBABBBABBCA@@<?>>AAA<CA@AABBABCC?BB8@<@%<>5;<A5=A;>=64>???B;86497<<;;<;>2?>BA@??A6<<A59"});
|
||||
|
||||
// now changes
|
||||
params.add(new Object[]{-3,
|
||||
"CCGAGTAGCTGGGACTACAGGCACCCACCACCACGCCTGGCC",
|
||||
"AGTAGCTGGGACTACAGGCACCCACCACCACGCCTG", null,
|
||||
"A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9",
|
||||
"A?>>@>AA?@@>A?>A@?>@>>?=>?'>?=>7=?A9"});
|
||||
|
||||
// raw base qualities are low -- but they shouldn't be capped
|
||||
params.add(new Object[]{-3,
|
||||
"CCACCACGCCTGGCCAATTTTTTTGTATTTTTAGTAGAGATA",
|
||||
"CCACGCTTGGCAAAGTTTTCCGTACGTTTAGCCGAG", null,
|
||||
"33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8",
|
||||
"33'/(7+270&4),(&&-)$&,%7$',-/61(,6?8"});
|
||||
|
||||
|
||||
// // capping test
|
||||
// params.add(new Object[]{-3,
|
||||
// "TTTAGGGCTAACTATGACATGAACCCCAAAA",
|
||||
// "AGGGCTAACTATGACATGAACCCCA", null,
|
||||
// "!CDDEEEDDDCEEEECBCA@A@.0'",
|
||||
// "!%%%%&)DDDCEEEECBCA@A@.0'"});
|
||||
|
||||
return params.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
//
|
||||
// todo -- add cigar strings to test, and create synthetic reads
|
||||
//
|
||||
@Test(dataProvider = "data", enabled = true)
|
||||
public void testBAQ1(int offset, String _ref, String _query, String cigar, String _fastqQuals, String _expected) {
|
||||
byte[] ref = _ref.getBytes();
|
||||
byte[] query = _query.getBytes();
|
||||
byte[] fastqQuals = _fastqQuals.getBytes();
|
||||
byte[] expected = _expected.getBytes();
|
||||
if ( cigar == null ) cigar = String.format("%dM", query.length);
|
||||
|
||||
BAQ baqHMM = new BAQ(1e-3, 0.1, 7, (byte)4); // matches current samtools parameters
|
||||
byte[] quals = new byte[fastqQuals.length];
|
||||
for ( int i = 0; i < fastqQuals.length; i++) {
|
||||
quals[i] = (byte)(fastqQuals[i] - 33);
|
||||
expected[i] = (byte)(expected[i] - 33);
|
||||
}
|
||||
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, query, quals);
|
||||
read.setCigarString(cigar);
|
||||
BAQ.BAQCalculationResult result = baqHMM.calcBAQFromHMM(read, ref, offset);
|
||||
|
||||
for (int i = 0; i < quals.length; i++) {
|
||||
//result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i);
|
||||
Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality");
|
||||
Assert.assertEquals(result.bq[i], expected[i], "Did not see the expected BAQ value");
|
||||
}
|
||||
|
||||
ValidateBAQWalker.printQuals(System.out, "in-quals:", quals, false);
|
||||
ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue