Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2012-04-19 11:32:32 -04:00
commit 1ea4e48a27
8 changed files with 335 additions and 23 deletions

View File

@ -66,18 +66,18 @@ public class CycleCovariate implements StandardCovariate {
// Discrete cycle platforms
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
final short init;
final short readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? (short) -1 : 1;
final short increment;
if (!read.getReadNegativeStrandFlag()) {
init = 1;
increment = 1;
short cycle;
if (read.getReadNegativeStrandFlag()) {
cycle = (short) (read.getReadLength() * readOrderFactor);
increment = (short) (-1 * readOrderFactor);
}
else {
init = (short) read.getReadLength();
increment = -1;
cycle = readOrderFactor;
increment = readOrderFactor;
}
short cycle = init;
for (int i = 0; i < read.getReadLength(); i++) {
cycles[i] = BitSetUtils.bitSetFrom(cycle);
cycle += increment;

View File

@ -102,7 +102,7 @@ public class RecalDatum extends Datum {
@Override
public String toString() {
return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()));
return String.format("%d,%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()), (byte) Math.floor(getEstimatedQReported()));
}

View File

@ -143,9 +143,18 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
public byte DELETIONS_DEFAULT_QUALITY = 45;
/**
* Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality
*/
@Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false)
public byte LOW_QUAL_TAIL = 2;
/**
* BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options.
* This parameter tells BQSR the number of levels of quantization to use to build the quantization table.
*/
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
public int QUANTIZING_LEVELS = 16;
@Hidden
@ -155,8 +164,11 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
public int QUANTIZING_LEVELS = 16;
@Argument(fullName = "keep_intermediate_files", shortName = "k", required = false, doc ="does not remove the temporary csv file created to generate the plots")
public boolean KEEP_INTERMEDIATE_FILES = false;
@Hidden
@Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering")
public boolean NO_PLOTS = false;
public GATKReportTable generateReportTable() {
GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run");
@ -176,6 +188,8 @@ public class RecalibrationArgumentCollection {
argumentsTable.set("default_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM);
argumentsTable.set("force_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM);
argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
argumentsTable.set("keep_intermediate_files", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES);
argumentsTable.set("no_plots", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS);
return argumentsTable;
}

View File

@ -279,6 +279,12 @@ public class RecalibrationReport {
else if (primaryKey.equals("quantizing_levels"))
RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value);
else if (primaryKey.equals("keep_intermediate_files"))
RAC.KEEP_INTERMEDIATE_FILES = Boolean.parseBoolean((String) value);
else if (primaryKey.equals("no_plots"))
RAC.NO_PLOTS = Boolean.parseBoolean((String) value);
}
return RAC;

View File

@ -109,4 +109,10 @@ public class RecalDatum extends RecalDatumOptimized {
private double qualToErrorProb( final double qual ) {
return Math.pow(10.0, qual / -10.0);
}
@Override
public String toString() {
return String.format("%d,%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()), (byte) Math.floor(getEstimatedQReported()));
}
}

View File

@ -65,6 +65,19 @@ public class BaseRecalibration {
quantizationInfo.quantizeQualityScores(quantizationLevels);
}
/**
* This constructor only exists for testing purposes.
*
* @param quantizationInfo
* @param keysAndTablesMap
* @param requestedCovariates
*/
protected BaseRecalibration(QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, ArrayList<Covariate> requestedCovariates) {
this.quantizationInfo = quantizationInfo;
this.keysAndTablesMap = keysAndTablesMap;
this.requestedCovariates = requestedCovariates;
}
/**
* Recalibrates the base qualities of a read
*
@ -110,7 +123,7 @@ public class BaseRecalibration {
* @param errorModel the event type
* @return A recalibrated quality score as a byte
*/
private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
protected byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
final String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";

View File

@ -26,8 +26,9 @@ public class CycleCovariateUnitTest {
@Test(enabled = true)
public void testSimpleCycles() {
short readLength = 10;
short readLength = 10;
GATKSAMRecord read = ReadUtils.createRandomRead(readLength);
read.setReadPairedFlag(true);
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
read.getReadGroup().setPlatform("illumina");
@ -38,6 +39,13 @@ public class CycleCovariateUnitTest {
values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), readLength, (short) -1);
read.setSecondOfPairFlag(true);
values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), (short) -readLength, (short) 1);
read.setReadNegativeStrandFlag(false);
values = covariate.getValues(read);
verifyCovariateArray(values.getMismatches(), (short) -1, (short) -1);
}
private void verifyCovariateArray(BitSet[] values, short init, short increment) {

View File

@ -1,13 +1,17 @@
package org.broadinstitute.sting.utils.recalibration;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.gatk.walkers.bqsr.*;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.File;
import java.util.*;
/**
* Unit tests for on-the-fly recalibration.
@ -17,13 +21,274 @@ import java.io.File;
*/
public class BaseRecalibrationUnitTest {
@Test(enabled=false)
public void testReadingReport() {
File csv = new File("public/testdata/exampleGATKREPORT.grp");
BaseRecalibration baseRecalibration = new BaseRecalibration(csv, -1);
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA));
baseRecalibration.recalibrateRead(read);
System.out.println("Success");
private org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager dataManager;
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap;
private BQSRKeyManager rgKeyManager;
private BQSRKeyManager qsKeyManager;
private BQSRKeyManager cvKeyManager;
private ReadGroupCovariate rgCovariate;
private QualityScoreCovariate qsCovariate;
private ContextCovariate cxCovariate;
private CycleCovariate cyCovariate;
private GATKSAMRecord read = ReadUtils.createRandomRead(10000);
private BaseRecalibration baseRecalibration;
private ReadCovariates readCovariates;
@BeforeClass
public void init() {
GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg");
rg.setPlatform("illumina");
read.setReadGroup(rg);
byte[] quals = new byte[read.getReadLength()];
for (int i = 0; i < read.getReadLength(); i++)
quals[i] = 20;
read.setBaseQualities(quals);
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
List<Covariate> requiredCovariates = new ArrayList<Covariate>();
List<Covariate> optionalCovariates = new ArrayList<Covariate>();
ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>();
dataManager = new org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager(true, 4);
keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
rgCovariate = new ReadGroupCovariate();
rgCovariate.initialize(RAC);
requiredCovariates.add(rgCovariate);
rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
keysAndTablesMap.put(rgKeyManager, new HashMap<BitSet, RecalDatum>());
qsCovariate = new QualityScoreCovariate();
qsCovariate.initialize(RAC);
requiredCovariates.add(qsCovariate);
qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
keysAndTablesMap.put(qsKeyManager, new HashMap<BitSet, RecalDatum>());
cxCovariate = new ContextCovariate();
cxCovariate.initialize(RAC);
optionalCovariates.add(cxCovariate);
cyCovariate = new CycleCovariate();
cyCovariate.initialize(RAC);
optionalCovariates.add(cyCovariate);
cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
keysAndTablesMap.put(cvKeyManager, new HashMap<BitSet, RecalDatum>());
for (Covariate cov : requiredCovariates)
requestedCovariates.add(cov);
for (Covariate cov : optionalCovariates)
requestedCovariates.add(cov);
readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates);
for (int i=0; i<read.getReadLength(); i++) {
BitSet[] bitKeys = readCovariates.getMismatchesKeySet(i);
Object[] objKey = buildObjectKey(bitKeys);
Random random = new Random();
int nObservations = random.nextInt(10000);
int nErrors = random.nextInt(10);
double estimatedQReported = 30;
double empiricalQuality = calcEmpiricalQual(nObservations, nErrors);
org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum oldDatum = new org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
dataManager.addToAllTables(objKey, oldDatum, QualityUtils.MIN_USABLE_Q_SCORE);
RecalDatum newDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> mapEntry : keysAndTablesMap.entrySet()) {
List<BitSet> keys = mapEntry.getKey().bitSetsFromAllKeys(bitKeys, EventType.BASE_SUBSTITUTION);
for (BitSet key : keys)
updateCovariateWithKeySet(mapEntry.getValue(), key, newDatum);
}
}
dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_QUAL_SCORE);
List<Byte> quantizedQuals = new ArrayList<Byte>();
List<Long> qualCounts = new ArrayList<Long>();
for (byte i = 0; i <= QualityUtils.MAX_QUAL_SCORE; i++) {
quantizedQuals.add(i);
qualCounts.add(1L);
}
QuantizationInfo quantizationInfo = new QuantizationInfo(quantizedQuals, qualCounts);
quantizationInfo.noQuantization();
baseRecalibration = new BaseRecalibration(quantizationInfo, keysAndTablesMap, requestedCovariates);
}
@Test(enabled=true)
public void testGoldStandardComparison() {
debugTables();
for (int i = 0; i < read.getReadLength(); i++) {
BitSet [] bitKey = readCovariates.getKeySet(i, EventType.BASE_SUBSTITUTION);
Object [] objKey = buildObjectKey(bitKey);
byte v2 = baseRecalibration.performSequentialQualityCalculation(bitKey, EventType.BASE_SUBSTITUTION);
byte v1 = goldStandardSequentialCalculation(objKey);
Assert.assertEquals(v2, v1);
}
}
private Object[] buildObjectKey(BitSet[] bitKey) {
Object[] key = new Object[bitKey.length];
key[0] = rgCovariate.keyFromBitSet(bitKey[0]);
key[1] = qsCovariate.keyFromBitSet(bitKey[1]);
key[2] = cxCovariate.keyFromBitSet(bitKey[2]);
key[3] = cyCovariate.keyFromBitSet(bitKey[3]);
return key;
}
private void debugTables() {
System.out.println("\nV1 Table\n");
System.out.println("ReadGroup Table:");
NestedHashMap nestedTable = dataManager.getCollapsedTable(0);
printNestedHashMap(nestedTable.data, "");
System.out.println("\nQualityScore Table:");
nestedTable = dataManager.getCollapsedTable(1);
printNestedHashMap(nestedTable.data, "");
System.out.println("\nCovariates Table:");
nestedTable = dataManager.getCollapsedTable(2);
printNestedHashMap(nestedTable.data, "");
nestedTable = dataManager.getCollapsedTable(3);
printNestedHashMap(nestedTable.data, "");
int i = 0;
System.out.println("\nV2 Table\n");
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> mapEntry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = mapEntry.getKey();
Map<BitSet, RecalDatum> table = mapEntry.getValue();
switch(i++) {
case 0 :
System.out.println("ReadGroup Table:");
break;
case 1 :
System.out.println("QualityScore Table:");
break;
case 2 :
System.out.println("Covariates Table:");
break;
}
for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) {
BitSet key = entry.getKey();
RecalDatum datum = entry.getValue();
List<Object> keySet = keyManager.keySetFrom(key);
System.out.println(String.format("%s => %s", Utils.join(",", keySet), datum));
}
System.out.println();
}
}
private static void printNestedHashMap(Map<Object,Object> table, String output) {
for (Object key : table.keySet()) {
String ret = "";
if (output.isEmpty())
ret = "" + key;
else
ret = output + "," + key;
Object next = table.get(key);
if (next instanceof org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum)
System.out.println(ret + " => " + next);
else
printNestedHashMap((Map<Object, Object>) next, "" + ret);
}
}
private void updateCovariateWithKeySet(final Map<BitSet, RecalDatum> recalTable, final BitSet hashKey, final RecalDatum datum) {
RecalDatum previousDatum = recalTable.get(hashKey); // using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
if (previousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
recalTable.put(hashKey, datum.copy());
else
previousDatum.combine(datum); // add one to the number of observations and potentially one to the number of mismatches
}
/**
* Implements a serial recalibration of the reads using the combinational table.
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
*
* Given the full recalibration table, we perform the following preprocessing steps:
*
* - calculate the global quality score shift across all data [DeltaQ]
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
* - The final shift equation is:
*
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
*
* @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte
*/
private byte goldStandardSequentialCalculation(final Object... key) {
final byte qualFromRead = (byte) Integer.parseInt(key[1].toString());
final Object[] readGroupCollapsedKey = new Object[1];
final Object[] qualityScoreCollapsedKey = new Object[2];
final Object[] covariateCollapsedKey = new Object[3];
// The global quality shift (over the read group only)
readGroupCollapsedKey[0] = key[0];
final org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum globalRecalDatum = ((org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey));
double globalDeltaQ = 0.0;
if (globalRecalDatum != null) {
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
}
// The shift in quality between reported and empirical
qualityScoreCollapsedKey[0] = key[0];
qualityScoreCollapsedKey[1] = key[1];
final org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum qReportedRecalDatum = ((org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey));
double deltaQReported = 0.0;
if (qReportedRecalDatum != null) {
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
}
// The shift in quality due to each covariate by itself in turn
double deltaQCovariates = 0.0;
double deltaQCovariateEmpirical;
covariateCollapsedKey[0] = key[0];
covariateCollapsedKey[1] = key[1];
for (int iii = 2; iii < key.length; iii++) {
covariateCollapsedKey[2] = key[iii]; // The given covariate
final org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum covariateRecalDatum = ((org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey));
if (covariateRecalDatum != null) {
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
}
}
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_QUAL_SCORE);
// Verbose printouts used to validate with old recalibrator
//if(key.contains(null)) {
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
//}
//else {
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
//}
//return newQualityByte;
}
public static double calcEmpiricalQual(final int observations, final int errors) {
final int smoothing = 1;
final double doubleMismatches = (double) (errors + smoothing);
final double doubleObservations = (double) ( observations + smoothing );
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
return Math.min(QualityUtils.MAX_QUAL_SCORE, empiricalQual);
}
}