Not sure why IntelliJ didn't add this for commit like the other dirs

This commit is contained in:
Eric Banks 2013-01-07 18:11:07 -05:00
parent 3787ee6de7
commit 9e6c2afb28
6 changed files with 1354 additions and 0 deletions

View File

@ -0,0 +1,86 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.commandline.Gatherer;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.List;
/**
* User: carneiro
* Date: 3/29/11
*/
public class BQSRGatherer extends Gatherer {

    private static final String EMPTY_INPUT_LIST = "list of input files is empty";
    private static final String MISSING_OUTPUT_FILE = "missing output file name";

    /**
     * Merges the recalibration reports produced by scattered BQSR jobs into a single report.
     *
     * Each input file is parsed as a {@link RecalibrationReport}; the first report is used as the
     * accumulator and every subsequent report is folded into it via {@code combine}. After merging,
     * quantized qualities are recomputed on the combined tables, optional recalibration plots are
     * generated, and the merged report is written to {@code output}.
     *
     * @param inputs the scattered recalibration report files to merge; must contain at least one file
     * @param output the file to write the merged recalibration report to
     * @throws UserException.MissingArgument if {@code output} cannot be opened for writing
     * @throws ReviewedStingException       if {@code inputs} is empty
     */
    @Override
    public void gather(List<File> inputs, File output) {
        final PrintStream outputFile;
        try {
            outputFile = new PrintStream(output);
        } catch (FileNotFoundException e) {
            throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
        }

        try {
            // Fold all input reports into a single combined report.
            RecalibrationReport generalReport = null;
            for (final File input : inputs) {
                final RecalibrationReport inputReport = new RecalibrationReport(input);
                if (generalReport == null)
                    generalReport = inputReport;
                else
                    generalReport.combine(inputReport);
            }
            if (generalReport == null)
                throw new ReviewedStingException(EMPTY_INPUT_LIST);

            generalReport.calculateQuantizedQualities();

            final RecalibrationArgumentCollection RAC = generalReport.getRAC();
            if (RAC.RECAL_PDF_FILE != null) {
                // Plot generation reads the table file path from RAC, so point it at the merged output.
                RAC.RECAL_TABLE_FILE = output;
                if (RAC.existingRecalibrationReport != null) {
                    // A prior recalibration report exists: plot before/after comparison.
                    final RecalibrationReport originalReport = new RecalibrationReport(RAC.existingRecalibrationReport);
                    RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates());
                } else {
                    RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates());
                }
            }

            generalReport.output(outputFile);
        } finally {
            outputFile.close(); // don't leak the stream if parsing/combining/plotting throws
        }
    }
}

View File

@ -0,0 +1,528 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMFileHeader;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.variant.utils.BaseUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.recalibration.*;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
*
* <p>
* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
* only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
* of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
* reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical
* probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
* The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score).
* <p>
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
*
* <p>
*
* <h2>Input</h2>
* <p>
* The input read data whose base quality scores need to be assessed.
* <p>
* A database of known polymorphic sites to skip over.
* </p>
*
* <h2>Output</h2>
* <p>
* A GATK Report file with many tables:
* <ol>
* <li>The list of arguments</li>
* <li>The quantized qualities table</li>
* <li>The recalibration table by read group</li>
* <li>The recalibration table by quality score</li>
* <li>The recalibration table for all the optional covariates</li>
* </ol>
*
* The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T BaseRecalibrator \
* -I my_reads.bam \
* -R resources/Homo_sapiens_assembly18.fasta \
* -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
* -knownSites another/optional/setOfSitesToMask.vcf \
* -o recal_data.grp
* </pre>
*/
@DocumentedGATKFeature(groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class})
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
@PartitionBy(PartitionType.READ)
public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSchedulable {
    /**
     * all the command line arguments for BQSR and its covariates
     */
    @ArgumentCollection
    private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();

    /**
     * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency
     * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag
     * to safely access only one table. There may be some CPU cost, but as long as the table is really big
     * there should be relatively little CPU costs.
     */
    @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false)
    public boolean lowMemoryMode = false;

    @Advanced
    @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
    public double BAQGOP = BAQ.DEFAULT_GOP;

    /**
     * an object that keeps track of the information necessary for quality score quantization
     */
    private QuantizationInfo quantizationInfo;

    /**
     * list to hold the all the covariate objects that were requested (required + standard + experimental)
     */
    private Covariate[] requestedCovariates;

    private RecalibrationEngine recalibrationEngine;

    // bases with a quality below this threshold are skipped (taken from the engine-wide argument)
    private int minimumQToUse;

    private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.";

    private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
    private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
    private final static byte NO_BAQ_UNCERTAINTY = (byte)'@';

    /**
     * Parse the -cov arguments and create a list of covariates to be used here
     * Based on the covariates' estimates for initial capacity allocate the data hashmap
     */
    public void initialize() {
        baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty

        if (RAC.FORCE_PLATFORM != null)
            RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;

        if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified
            throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION);

        if (RAC.LIST_ONLY) {
            // -ls mode: just print the available covariates and quit
            RecalUtils.listAvailableCovariates(logger);
            System.exit(0);
        }
        RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table

        Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates
        ArrayList<Covariate> requiredCovariates = covariates.getFirst();
        ArrayList<Covariate> optionalCovariates = covariates.getSecond();

        // flatten required + optional into a single array, required covariates first
        requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()];
        int covariateIndex = 0;
        for (final Covariate covariate : requiredCovariates)
            requestedCovariates[covariateIndex++] = covariate;
        for (final Covariate covariate : optionalCovariates)
            requestedCovariates[covariateIndex++] = covariate;

        logger.info("The covariates being used here: ");
        for (Covariate cov : requestedCovariates) { // list all the covariates being used
            logger.info("\t" + cov.getClass().getSimpleName());
            cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
        }

        try {
            RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE);
        } catch (IOException e) {
            throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e);
        }

        initializeRecalibrationEngine();
        minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
        referenceReader = getToolkit().getReferenceDataSource().getReference();
    }

    /**
     * Initialize the recalibration engine
     */
    private void initializeRecalibrationEngine() {
        // size the engine's tables by the total number of read groups across all input BAM headers
        int numReadGroups = 0;
        for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() )
            numReadGroups += header.getReadGroups().size();
        recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode);
    }

    /**
     * @return true if the base quality at offset is below the preserve-qscores threshold
     */
    private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) {
        return read.getBaseQualities()[offset] < minimumQToUse;
    }

    /**
     * For each read at this locus get the various covariate values and increment that location in the map based on
     * whether or not the base matches the reference at this particular location
     */
    public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) {
        final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) );
        if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it

        RecalUtils.parsePlatformForRead(read, RAC);
        if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls
            return 0L; // skip this read completely
        }

        final int[] isSNP = calculateIsSNP(read, ref, originalRead);
        final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
        final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
        final int nErrors = nEvents(isSNP, isInsertion, isDeletion);

        // note for efficiency regions we don't compute the BAQ array unless we actually have
        // some error to marginalize over. For ILMN data ~85% of reads have no error
        final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read);

        if( baqArray != null ) { // some reads just can't be BAQ'ed
            final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates);
            final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases
            final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray);
            final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray);
            final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray);

            // aggregate all of the info into our info object, and update the data
            final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors);
            recalibrationEngine.updateDataForRead(info);
            return 1L;
        } else {
            return 0L;
        }
    }

    /**
     * Compute the number of mutational events across all hasEvent vectors
     *
     * Simply the sum of entries in hasEvents
     *
     * @param hasEvents a vector a vectors of 0 (no event) and 1 (has event)
     * @return the total number of events across all hasEvent arrays
     */
    private int nEvents(final int[]... hasEvents) {
        int n = 0;
        for ( final int[] hasEvent : hasEvents ) {
            n += MathUtils.sum(hasEvent);
        }
        return n;
    }

    /**
     * Build a per-base skip mask for the read: a base is skipped if it is a non-regular base,
     * low quality, overlaps a known variation site, or is a bad SOLiD offset.
     */
    protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
        final byte[] bases = read.getReadBases();
        final boolean[] skip = new boolean[bases.length];
        final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites));
        for( int iii = 0; iii < bases.length; iii++ ) {
            skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii);
        }
        return skip;
    }

    /**
     * @return true if this is a SOLiD read whose color space is inconsistent at offset and SOLiD recalibration is enabled
     */
    protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) {
        return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset);
    }

    /**
     * Mark, per read offset, whether the base overlaps one of the known variation features.
     * Feature coordinates are projected onto read coordinates; features falling off either
     * end of the read are clamped to the read boundaries.
     */
    protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List<Feature> features ) {
        final int readLength = read.getReadBases().length;
        final boolean[] knownSites = new boolean[readLength];
        Arrays.fill(knownSites, false);
        for( final Feature f : features ) {
            int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
            if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
                featureStartOnRead = 0;
            }

            int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true);
            if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
                featureEndOnRead = readLength;
            }

            if( featureStartOnRead > readLength ) {
                featureStartOnRead = featureEndOnRead = readLength;
            }

            Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true);
        }
        return knownSites;
    }

    // BUGBUG: can be merged with calculateIsIndel
    /**
     * Walk the read's cigar and mark, per read offset, whether the base mismatches the reference (1) or not (0).
     * Soft-clipped and inserted bases are always 0 because the ReferenceContext has no bases for them.
     */
    protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) {
        final byte[] readBases = read.getReadBases();
        // shift the reference window to account for the hard-clipping applied to the original read
        final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd());
        final int[] snp = new int[readBases.length];
        int readPos = 0;
        int refPos = 0;
        for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
            final int elementLength = ce.getLength();
            switch (ce.getOperator()) {
                case M:
                case EQ:
                case X:
                    for( int iii = 0; iii < elementLength; iii++ ) {
                        snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 );
                        readPos++;
                        refPos++;
                    }
                    break;
                case D:
                case N:
                    refPos += elementLength;
                    break;
                case I:
                case S: // ReferenceContext doesn't have the soft clipped bases!
                    readPos += elementLength;
                    break;
                case H:
                case P:
                    break;
                default:
                    throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
            }
        }
        return snp;
    }

    /**
     * Walk the read's cigar and mark, per read offset, whether the base is adjacent to an indel event
     * of the requested type (1) or not (0). The event is attributed to the base preceding the indel in
     * read order, so the marked offset depends on the read's strand.
     */
    protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) {
        final byte[] readBases = read.getReadBases();
        final int[] indel = new int[readBases.length];
        Arrays.fill(indel, 0);
        int readPos = 0;
        for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
            final int elementLength = ce.getLength();
            switch (ce.getOperator()) {
                case M:
                case EQ:
                case X:
                case S:
                {
                    readPos += elementLength;
                    break;
                }
                case D:
                {
                    // attribute the deletion to the preceding base in read order
                    final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) );
                    indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 );
                    break;
                }
                case I:
                {
                    final boolean forwardStrandRead = !read.getReadNegativeStrandFlag();
                    if( forwardStrandRead ) {
                        indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
                    }
                    readPos += elementLength; // skip over the inserted bases
                    if( !forwardStrandRead ) {
                        indel[(readPos < indel.length ? readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
                    }
                    break;
                }
                case N:
                case H:
                case P:
                    break;
                default:
                    throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
            }
        }
        return indel;
    }

    /**
     * Distribute the integer error events across BAQ uncertainty blocks as fractional errors.
     * Positions with no BAQ uncertainty keep their integer event count; runs of uncertain
     * positions share the block's total error count evenly (see calculateAndStoreErrorsInBlock).
     */
    protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) {
        if(errorArray.length != baqArray.length ) {
            throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
        }

        final int BLOCK_START_UNSET = -1;

        final double[] fractionalErrors = new double[baqArray.length];
        Arrays.fill(fractionalErrors, 0.0);
        boolean inBlock = false;
        int blockStartIndex = BLOCK_START_UNSET;
        int iii; // declared outside the loop because the final value is needed to close a trailing block
        for( iii = 0; iii < fractionalErrors.length; iii++ ) {
            if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) {
                if( !inBlock ) {
                    fractionalErrors[iii] = (double) errorArray[iii];
                } else {
                    calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors);
                    inBlock = false; // reset state variables
                    blockStartIndex = BLOCK_START_UNSET; // reset state variables
                }
            } else {
                inBlock = true;
                if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; }
            }
        }
        if( inBlock ) {
            calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors);
        }
        if( fractionalErrors.length != errorArray.length ) {
            throw new ReviewedStingException("Output array length mismatch detected. Malformed read?");
        }
        return fractionalErrors;
    }

    /**
     * Spread the total error count of the block [blockStartIndex-1, iii] evenly over its positions.
     */
    private static void calculateAndStoreErrorsInBlock( final int iii,
                                                        final int blockStartIndex,
                                                        final int[] errorArray,
                                                        final double[] fractionalErrors ) {
        int totalErrors = 0;
        for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) {
            totalErrors += errorArray[jjj];
        }
        for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) {
            fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1));
        }
    }

    /**
     * Create a BAQ style array that indicates no alignment uncertainty
     * @param read the read for which we want a BAQ array
     * @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values
     * // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above
     */
    private byte[] flatBAQArray(final GATKSAMRecord read) {
        final byte[] baq = new byte[read.getReadLength()];
        Arrays.fill(baq, NO_BAQ_UNCERTAINTY);
        return baq;
    }

    /**
     * Compute an actual BAQ array for read, based on its quals and the reference sequence
     * @param read the read to BAQ
     * @return a non-null BAQ tag array for read
     */
    private byte[] calculateBAQArray( final GATKSAMRecord read ) {
        baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG);
        return BAQ.getBAQTag(read);
    }

    /**
     * Initialize the reduce step by returning 0L
     *
     * @return returns 0L
     */
    public Long reduceInit() {
        return 0L;
    }

    /**
     * The Reduce method doesn't do anything for this walker.
     *
     * @param mapped Result of the map. This value is immediately ignored.
     * @param sum The summing CountedData used to output the CSV data
     * @return returns The sum used to output the CSV data
     */
    public Long reduce(Long mapped, Long sum) {
        sum += mapped;
        return sum;
    }

    /**
     * Finalize the recalibration tables, quantize the quality scores, write the report, and
     * optionally generate the recalibration plots.
     */
    @Override
    public void onTraversalDone(Long result) {
        recalibrationEngine.finalizeData();

        logger.info("Calculating quantized quality scores...");
        quantizeQualityScores();

        logger.info("Writing recalibration report...");
        generateReport();
        logger.info("...done!");

        if ( RAC.RECAL_PDF_FILE != null ) {
            logger.info("Generating recalibration plots...");
            generatePlots();
        }

        logger.info("Processed: " + result + " reads");
    }

    /**
     * @return the finalized recalibration tables from the engine
     */
    private RecalibrationTables getRecalibrationTable() {
        return recalibrationEngine.getFinalRecalibrationTables();
    }

    /**
     * Generate the recalibration plots, comparing against a prior recalibration file if one was given.
     */
    private void generatePlots() {
        File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE;
        if (recalFile != null) {
            RecalibrationReport report = new RecalibrationReport(recalFile);
            RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), getRecalibrationTable(), requestedCovariates);
        }
        else
            RecalUtils.generateRecalibrationPlot(RAC, getRecalibrationTable(), requestedCovariates);
    }

    /**
     * go through the quality score table and use the # observations and the empirical quality score
     * to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to
     * generate a quantization map (recalibrated_qual -> quantized_qual)
     */
    private void quantizeQualityScores() {
        quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS);
    }

    /**
     * Write the recalibration report (argument table, quantization table, and recal tables).
     */
    private void generateReport() {
        RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
    }
}

View File

@ -0,0 +1,163 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 12/18/12
* Time: 3:50 PM
*
* TODO -- merge in ReadCovariates?
*/
public final class ReadRecalibrationInfo {
    private final GATKSAMRecord read;
    private final int length;
    private final ReadCovariates covariates;
    private final boolean[] skips;
    private final byte[] baseQuals, insertionQuals, deletionQuals;
    private final double[] snpErrors, insertionErrors, deletionErrors;

    /**
     * Bundle up everything the recalibration engine needs to know about one read:
     * the read itself, its covariate keys, the per-base skip mask, and the fractional
     * error arrays for each event type. All arrays must be the same length as the
     * read's base qualities.
     *
     * @param read            the read being recalibrated; not null
     * @param covariates      per-offset covariate keys for the read; not null
     * @param skips           per-offset skip mask; not null, same length as the read
     * @param snpErrors       per-offset fractional SNP errors; not null, same length as the read
     * @param insertionErrors per-offset fractional insertion errors; not null, same length as the read
     * @param deletionErrors  per-offset fractional deletion errors; not null, same length as the read
     * @throws IllegalArgumentException if any argument is null or any array length disagrees with the read length
     */
    public ReadRecalibrationInfo(final GATKSAMRecord read,
                                 final ReadCovariates covariates,
                                 final boolean[] skips,
                                 final double[] snpErrors,
                                 final double[] insertionErrors,
                                 final double[] deletionErrors) {
        if ( read == null ) throw new IllegalArgumentException("read cannot be null");
        if ( covariates == null ) throw new IllegalArgumentException("covariates cannot be null");
        if ( skips == null ) throw new IllegalArgumentException("skips cannot be null");
        if ( snpErrors == null ) throw new IllegalArgumentException("snpErrors cannot be null");
        if ( insertionErrors == null ) throw new IllegalArgumentException("insertionErrors cannot be null");
        if ( deletionErrors == null ) throw new IllegalArgumentException("deletionErrors cannot be null");

        this.read = read;
        this.baseQuals = read.getBaseQualities();
        this.length = baseQuals.length;
        this.covariates = covariates;
        this.skips = skips;
        this.insertionQuals = read.getExistingBaseInsertionQualities();
        this.deletionQuals = read.getExistingBaseDeletionQualities();
        this.snpErrors = snpErrors;
        this.insertionErrors = insertionErrors;
        this.deletionErrors = deletionErrors;

        // each message reports the length of the array it actually checks (previously all reported snpErrors.length)
        if ( skips.length != length ) throw new IllegalArgumentException("skips.length " + skips.length + " != length " + length);
        if ( snpErrors.length != length ) throw new IllegalArgumentException("snpErrors.length " + snpErrors.length + " != length " + length);
        if ( insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + insertionErrors.length + " != length " + length);
        if ( deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + deletionErrors.length + " != length " + length);
    }

    /**
     * Get the qual score for event type at offset
     *
     * @param eventType the type of event we want the qual for
     * @param offset the offset into this read for the qual
     * @return a valid quality score for event at offset
     */
    @Requires("validOffset(offset)")
    @Ensures("validQual(result)")
    public byte getQual(final EventType eventType, final int offset) {
        switch ( eventType ) {
            case BASE_SUBSTITUTION: return baseQuals[offset];
            // note optimization here -- if we don't have ins/del quals we just return the default byte directly
            case BASE_INSERTION: return insertionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : insertionQuals[offset];
            case BASE_DELETION: return deletionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : deletionQuals[offset];
            default: throw new IllegalStateException("Unknown event type " + eventType);
        }
    }

    /**
     * Get the error fraction for event type at offset
     *
     * The error fraction is a value between 0 and 1 that indicates how much certainty we have
     * in the error occurring at offset. A value of 1 means that the error definitely occurs at this
     * site, a value of 0.0 means it definitely doesn't happen here. 0.5 means that half the weight
     * of the error belongs here
     *
     * @param eventType the type of event we want the qual for
     * @param offset the offset into this read for the qual
     * @return a fractional weight for an error at this offset
     */
    @Requires("validOffset(offset)")
    @Ensures("result >= 0.0 && result <= 1.0")
    public double getErrorFraction(final EventType eventType, final int offset) {
        switch ( eventType ) {
            case BASE_SUBSTITUTION: return snpErrors[offset];
            case BASE_INSERTION: return insertionErrors[offset];
            case BASE_DELETION: return deletionErrors[offset];
            default: throw new IllegalStateException("Unknown event type " + eventType);
        }
    }

    /**
     * Get the read involved in this recalibration info
     * @return a non-null GATKSAMRecord
     */
    @Ensures("result != null")
    public GATKSAMRecord getRead() {
        return read;
    }

    /**
     * Should offset in this read be skipped (because it's covered by a known variation site?)
     * @param offset a valid offset into this info
     * @return true if offset should be skipped, false otherwise
     */
    @Requires("validOffset(offset)")
    public boolean skip(final int offset) {
        return skips[offset];
    }

    /**
     * Get the ReadCovariates object carrying the mapping from offsets -> covariate key sets
     * @return a non-null ReadCovariates object
     */
    @Ensures("result != null")
    public ReadCovariates getCovariatesValues() {
        return covariates;
    }

    /**
     * Ensures an offset is valid. Used in contracts
     * @param offset a proposed offset
     * @return true if offset is valid w.r.t. the data in this object, false otherwise
     */
    private boolean validOffset(final int offset) {
        return offset >= 0 && offset < baseQuals.length;
    }

    /**
     * Ensures a returned qual is in the legal [0, MAX_QUAL_SCORE] range. Used in contracts
     */
    private boolean validQual(final byte result) {
        return result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE;
    }
}

View File

@ -0,0 +1,251 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
import java.io.File;
import java.io.PrintStream;
import java.util.Collections;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 27, 2009
*
* A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
*/
public class RecalibrationArgumentCollection {

    /**
     * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
     * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
     * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
     * Please note however that the statistics reported by the tool will not accurately reflect those sites skipped by the -XL argument.
     */
    @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
    public List<RodBinding<Feature>> knownSites = Collections.emptyList();

    /**
     * After the header, data records occur one per line until the end of the file. The first several items on a line are the
     * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
     * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
     * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out.
     */
    @Gather(BQSRGatherer.class)
    @Output(doc = "The output recalibration table file to create", required = true)
    public File RECAL_TABLE_FILE = null;

    // Stream opened on RECAL_TABLE_FILE by the engine; not a command-line argument itself.
    public PrintStream RECAL_TABLE;

    /**
     * If not provided, then no plots will be generated (useful for queue scatter/gathering).
     * However, we *highly* recommend that users generate these plots whenever possible for QC checking.
     */
    @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false)
    public File RECAL_PDF_FILE = null;

    /**
     * If not provided, then a temporary file is created and then deleted upon completion.
     * For advanced users only.
     */
    @Advanced
    @Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false)
    public File RECAL_CSV_FILE = null;

    /**
     * Note that the --list argument requires a fully resolved and correct command-line to work.
     */
    @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
    public boolean LIST_ONLY = false;

    /**
     * Note that the ReadGroup and QualityScore covariates are required and do not need to be specified.
     * Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default.
     * Use the --list argument to see the available covariates.
     */
    @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false)
    public String[] COVARIATES = null;

    /**
     * The Cycle and Context covariates are standard and are included by default unless this argument is provided.
     * Note that the ReadGroup and QualityScore covariates are required and cannot be excluded.
     */
    @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false)
    public boolean DO_NOT_USE_STANDARD_COVARIATES = false;

    /**
     * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
     */
    @Advanced
    @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
    public boolean RUN_WITHOUT_DBSNP = false;

    /**
     * CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
     * reads which have had the reference inserted because of color space inconsistencies.
     */
    @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
    public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO;

    /**
     * CountCovariates and TableRecalibration accept a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
     * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
     * their color space tag can not be recalibrated.
     */
    @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
    public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;

    /**
     * The context covariate will use a context of this size to calculate its covariate value for base mismatches
     */
    @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
    public int MISMATCHES_CONTEXT_SIZE = 2;

    /**
     * The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions
     */
    @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
    public int INDELS_CONTEXT_SIZE = 3;

    /**
     * The cycle covariate will generate an error if it encounters a cycle greater than this value.
     * This argument is ignored if the Cycle covariate is not used.
     */
    @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
    public int MAXIMUM_CYCLE_VALUE = 500;

    /**
     * A default base quality to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
     */
    @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
    public byte MISMATCHES_DEFAULT_QUALITY = -1;

    /**
     * A default base quality to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on)
     */
    @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
    public byte INSERTIONS_DEFAULT_QUALITY = 45;

    /**
     * A default base quality to use as a prior (reported quality) in the deletion covariate model. This parameter is used for all reads without deletion quality scores for each base. (default is on)
     */
    @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
    public byte DELETIONS_DEFAULT_QUALITY = 45;

    /**
     * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality
     */
    @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false)
    public byte LOW_QUAL_TAIL = 2;

    /**
     * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options.
     * This parameter tells BQSR the number of levels of quantization to use to build the quantization table.
     */
    @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
    public int QUANTIZING_LEVELS = 16;

    /**
     * The tag name for the binary tag covariate (if using it)
     */
    @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it")
    public String BINARY_TAG_NAME = null;

    /**
     * Whether GATK report tables should have rows in sorted order, starting from leftmost column.
     */
    @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false)
    public Boolean SORT_BY_ALL_COLUMNS = false;

    /////////////////////////////
    // Debugging-only Arguments
    /////////////////////////////

    @Hidden
    @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
    public String DEFAULT_PLATFORM = null;

    @Hidden
    @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
    public String FORCE_PLATFORM = null;

    @Hidden
    @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
    public PrintStream RECAL_TABLE_UPDATE_LOG = null;

    // Set externally when an on-disk recalibration report is being reused; null when starting fresh.
    public File existingRecalibrationReport = null;

    /**
     * Builds the "Arguments" GATK report table recording the argument values used for this run.
     *
     * @param covariateNames comma-separated list of the covariate names in effect for this run
     * @return a two-column (argument name / value) report table; never null
     */
    public GATKReportTable generateReportTable(final String covariateNames) {
        final GATKReportTable argumentsTable;
        if (SORT_BY_ALL_COLUMNS) {
            argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN);
        } else {
            argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2);
        }
        argumentsTable.addColumn("Argument");
        argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);

        // One row per recorded argument; addArgumentRow() factors out the repeated addRowID/set pair.
        // NOTE(review): deletions_default_quality and maximum_cycle_value are not recorded here — confirm that is intentional.
        addArgumentRow(argumentsTable, "covariate", covariateNames);
        addArgumentRow(argumentsTable, "no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES);
        addArgumentRow(argumentsTable, "run_without_dbsnp", RUN_WITHOUT_DBSNP);
        addArgumentRow(argumentsTable, "solid_recal_mode", SOLID_RECAL_MODE);
        addArgumentRow(argumentsTable, "solid_nocall_strategy", SOLID_NOCALL_STRATEGY);
        addArgumentRow(argumentsTable, "mismatches_context_size", MISMATCHES_CONTEXT_SIZE);
        addArgumentRow(argumentsTable, "indels_context_size", INDELS_CONTEXT_SIZE);
        addArgumentRow(argumentsTable, "mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY);
        addArgumentRow(argumentsTable, "insertions_default_quality", INSERTIONS_DEFAULT_QUALITY);
        addArgumentRow(argumentsTable, "low_quality_tail", LOW_QUAL_TAIL);
        addArgumentRow(argumentsTable, "default_platform", DEFAULT_PLATFORM);
        addArgumentRow(argumentsTable, "force_platform", FORCE_PLATFORM);
        addArgumentRow(argumentsTable, "quantizing_levels", QUANTIZING_LEVELS);
        // Null-valued file/string arguments are rendered as the literal string "null", matching prior output.
        addArgumentRow(argumentsTable, "recalibration_report", existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath());
        addArgumentRow(argumentsTable, "plot_pdf_file", RECAL_PDF_FILE == null ? "null" : RECAL_PDF_FILE.getAbsolutePath());
        addArgumentRow(argumentsTable, "binary_tag_name", BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME);
        return argumentsTable;
    }

    /** Adds a single (argument name, value) row to the given report table. */
    private static void addArgumentRow(final GATKReportTable table, final String argument, final Object value) {
        table.addRowID(argument, true);
        table.set(argument, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, value);
    }
}

View File

@ -0,0 +1,216 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
import org.broadinstitute.sting.utils.recalibration.*;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
import java.util.LinkedList;
import java.util.List;
/**
 * Accumulates base quality recalibration statistics (observation counts and error
 * fractions keyed by covariate values) across reads, using one recalibration table
 * per thread (or a single shared table in low-memory mode) that are merged into a
 * final table when finalizeData() is called.
 */
public class RecalibrationEngine {
    // Covariates used to key the tables; order matters (see constructor javadoc).
    final protected Covariate[] covariates;
    // Number of read groups, used to size the recalibration tables.
    final private int numReadGroups;
    // Optional stream for logging updates to the nested hash map in the tables; may be null.
    final private PrintStream maybeLogStream;
    // When true, all threads share the first table created instead of one table per thread.
    final private boolean lowMemoryMode;

    /**
     * Has finalizeData() been called?
     */
    private boolean finalized = false;

    /**
     * The final (merged, etc) recalibration tables, suitable for downstream analysis.
     */
    private RecalibrationTables finalRecalibrationTables = null;

    // Every table ever handed out by threadLocalTables, kept so they can all be merged at the end.
    private final List<RecalibrationTables> recalibrationTablesList = new LinkedList<RecalibrationTables>();

    private final ThreadLocal<RecalibrationTables> threadLocalTables = new ThreadLocal<RecalibrationTables>() {
        // Creates a fresh table and records it for the final merge; synchronized because
        // multiple threads may initialize their thread-local value concurrently.
        private synchronized RecalibrationTables makeAndCaptureTable() {
            final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream);
            recalibrationTablesList.add(newTable);
            return newTable;
        }

        @Override
        protected synchronized RecalibrationTables initialValue() {
            if ( lowMemoryMode ) {
                // Low-memory mode: every thread shares the single first table.
                return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0);
            } else {
                return makeAndCaptureTable();
            }
        }
    };

    /**
     * Get a recalibration table suitable for updating the underlying RecalDatums
     *
     * May return a thread-local version, or a single version, depending on the initialization
     * arguments of this instance.
     *
     * @return updated tables
     */
    protected RecalibrationTables getUpdatableRecalibrationTables() {
        return threadLocalTables.get();
    }

    /**
     * Initialize the recalibration engine
     *
     * Called once before any calls to updateDataForRead are made. The engine should prepare itself
     * to handle any number of updateDataForRead calls containing ReadRecalibrationInfo containing
     * keys for each of the covariates provided.
     *
     * The engine should collect match and mismatch data into the recalibrationTables data.
     *
     * @param covariates an array of the covariates we'll be using in this engine, order matters
     * @param numReadGroups the number of read groups we should use for the recalibration tables
     * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables
     * @param enableLowMemoryMode if true, all threads share a single recalibration table rather than one per thread
     */
    public RecalibrationEngine(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream, final boolean enableLowMemoryMode) {
        if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null");
        if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups);

        // defensive copy so later mutation of the caller's array cannot affect us
        this.covariates = covariates.clone();
        this.numReadGroups = numReadGroups;
        this.maybeLogStream = maybeLogStream;
        this.lowMemoryMode = enableLowMemoryMode;
    }

    /**
     * Update the recalibration statistics using the information in recalInfo
     * @param recalInfo data structure holding information about the recalibration values for a single read
     */
    @Requires("recalInfo != null")
    public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) {
        final GATKSAMRecord read = recalInfo.getRead();
        final ReadCovariates readCovariates = recalInfo.getCovariatesValues();
        final RecalibrationTables tables = getUpdatableRecalibrationTables();
        final NestedIntegerArray<RecalDatum> qualityScoreTable = tables.getQualityScoreTable();

        for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
            // offsets flagged by recalInfo (e.g. to be excluded from recalibration) are skipped entirely
            if( ! recalInfo.skip(offset) ) {

                for (final EventType eventType : EventType.values()) {
                    final int[] keys = readCovariates.getKeySet(offset, eventType);
                    final int eventIndex = eventType.ordinal();
                    final byte qual = recalInfo.getQual(eventType, offset);
                    final double isError = recalInfo.getErrorFraction(eventType, offset);

                    // keys[0] and keys[1] index the quality score table directly
                    RecalUtils.incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex);

                    // remaining covariates (from index 2 on) each have their own table;
                    // a negative key means this covariate produced no value at this offset
                    for (int i = 2; i < covariates.length; i++) {
                        if (keys[i] < 0)
                            continue;

                        RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
                    }
                }
            }
        }
    }

    /**
     * Finalize, if appropriate, all derived data in recalibrationTables.
     *
     * Called once after all calls to updateDataForRead have been issued.
     *
     * Assumes that all of the principal tables (by quality score) have been completely updated,
     * and walks over this data to create summary data tables like by read group table.
     */
    public void finalizeData() {
        if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called");

        // merge all of the thread-local tables
        finalRecalibrationTables = mergeThreadLocalRecalibrationTables();

        final NestedIntegerArray<RecalDatum> byReadGroupTable = finalRecalibrationTables.getReadGroupTable();
        final NestedIntegerArray<RecalDatum> byQualTable = finalRecalibrationTables.getQualityScoreTable();

        // iterate over all values in the qual table, rolling each datum up into the by-read-group table
        for ( NestedIntegerArray.Leaf<RecalDatum> leaf : byQualTable.getAllLeaves() ) {
            final int rgKey = leaf.keys[0];
            final int eventIndex = leaf.keys[2];
            final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex);
            final RecalDatum qualDatum = leaf.value;

            if ( rgDatum == null ) {
                // create a copy of qualDatum, and initialize byReadGroup table with it
                byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex);
            } else {
                // combine the qual datum with the existing datum in the byReadGroup table
                rgDatum.combine(qualDatum);
            }
        }

        finalized = true;
    }

    /**
     * Merge all of the thread local recalibration tables into a single one.
     *
     * Reuses one of the recalibration tables to hold the merged table, so this function can only be
     * called once in the engine.
     *
     * @return the merged recalibration table
     */
    @Requires("! finalized")
    private RecalibrationTables mergeThreadLocalRecalibrationTables() {
        if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty");

        RecalibrationTables merged = null;
        for ( final RecalibrationTables table : recalibrationTablesList ) {
            if ( merged == null )
                // fast path -- if there's only one table, just make it the merged one
                merged = table;
            else {
                // NOTE: combine() mutates 'merged' in place, accumulating into the first table
                merged.combine(table);
            }
        }

        return merged;
    }

    /**
     * Get the final recalibration tables, after finalizeData() has been called
     *
     * This returns the finalized recalibration table collected by this engine.
     *
     * It is an error to call this function before finalizeData has been called
     *
     * @return the finalized recalibration table collected by this engine
     */
    public RecalibrationTables getFinalRecalibrationTables() {
        if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called");
        return finalRecalibrationTables;
    }
}

View File

@ -0,0 +1,110 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.*;
import java.io.*;
/**
*/
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
@PartitionBy(PartitionType.READ)
public class RecalibrationPerformance extends RodWalker<Integer, Integer> implements NanoSchedulable {
@Output(doc="Write output to this file", required = true)
public PrintStream out;
@Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file")
public File RECAL_FILE = null;
public void initialize() {
out.println("Cycle\tQrep\tQemp\tIsJoint\tObservations\tErrors");
final GATKReport report = new GATKReport(RECAL_FILE);
final GATKReportTable table = report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
for ( int row = 0; row < table.getNumRows(); row++ ) {
final int nObservations = (int)asDouble(table.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
final int nErrors = (int)Math.round(asDouble(table.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)));
final double empiricalQuality = asDouble(table.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME));
final byte QReported = Byte.parseByte((String) table.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME));
final double jointEstimateQemp = RecalDatum.bayesianEstimateOfEmpiricalQuality(nObservations, nErrors, QReported);
//if ( Math.abs((int)(jointEstimateQemp - empiricalQuality)) > 1 )
// System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f, point Qemp = %f, joint Qemp = %f", estimatedQReported, nObservations, nErrors, empiricalQuality, jointEstimateQemp));
if ( table.get(row, RecalUtils.COVARIATE_NAME_COLUMN_NAME).equals("Cycle") &&
table.get(row, RecalUtils.EVENT_TYPE_COLUMN_NAME).equals("M") &&
table.get(row, RecalUtils.READGROUP_COLUMN_NAME).equals("20FUKAAXX100202.6") &&
(QReported == 6 || QReported == 10 || QReported == 20 || QReported == 30 || QReported == 45) ) {
out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, Math.round(empiricalQuality), "False", (int)nObservations, (int)nErrors));
out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, (int)jointEstimateQemp, "True", (int)nObservations, (int)nErrors));
}
}
}
@Override
public boolean isDone() {
return true;
}
private double asDouble(final Object o) {
if ( o instanceof Double )
return (Double)o;
else if ( o instanceof Integer )
return (Integer)o;
else if ( o instanceof Long )
return (Long)o;
else
throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass());
}
@Override
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; }
@Override
public Integer reduceInit() { return 0; }
@Override
public Integer reduce(Integer counter, Integer sum) { return 0; }
@Override
public void onTraversalDone(Integer sum) {}
}