diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java new file mode 100755 index 000000000..dbb628135 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.commandline.Gatherer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.RecalUtils; +import org.broadinstitute.sting.utils.recalibration.RecalibrationReport; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.List; + +/** + * User: carneiro + * Date: 3/29/11 + */ + + +public class BQSRGatherer extends Gatherer { + + private static final String EMPTY_INPUT_LIST = "list of inputs files is empty"; + private static final String MISSING_OUTPUT_FILE = "missing output file name"; + + @Override + public void gather(List inputs, File output) { + final PrintStream outputFile; + try { + outputFile = new PrintStream(output); + } catch(FileNotFoundException e) { + throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); + } + + RecalibrationReport generalReport = null; + for (File input : inputs) { + final RecalibrationReport inputReport = new RecalibrationReport(input); + if (generalReport == null) + generalReport = inputReport; + else + generalReport.combine(inputReport); + } + if (generalReport == null) + throw new ReviewedStingException(EMPTY_INPUT_LIST); + + generalReport.calculateQuantizedQualities(); + + RecalibrationArgumentCollection RAC = generalReport.getRAC(); + if ( RAC.RECAL_PDF_FILE != null ) { + RAC.RECAL_TABLE_FILE = output; + if ( RAC.existingRecalibrationReport != null ) { + final RecalibrationReport originalReport = new RecalibrationReport(RAC.existingRecalibrationReport); + RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates()); + } + else { + RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates()); + } + } + + generalReport.output(outputFile); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java new file mode 100755 index 000000000..3b0cb07d5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.CigarElement; +import net.sf.samtools.SAMFileHeader; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.recalibration.*; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context). + * + *

+ * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating + * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative + * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group, + * reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical + * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. + * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score). + *

+ * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified. + * + *

+ * + *

Input

+ *

+ * The input read data whose base quality scores need to be assessed. + *

+ * A database of known polymorphic sites to skip over. + *

+ * + *

Output

+ *

+ * A GATK Report file with many tables: + *

    + *
  1. The list of arguments
  2. + *
  3. The quantized qualities table
  4. + *
  5. The recalibration table by read group
  6. + *
  7. The recalibration table by quality score
  8. + *
  9. The recalibration table for all the optional covariates
  10. + *
+ * + * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table. + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -T BaseRecalibrator \
+ *   -I my_reads.bam \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
+ *   -knownSites another/optional/setOfSitesToMask.vcf \
+ *   -o recal_data.grp
+ * 
+ */ + +@DocumentedGATKFeature(groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class}) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) +@PartitionBy(PartitionType.READ) +public class BaseRecalibrator extends ReadWalker implements NanoSchedulable { + /** + * all the command line arguments for BQSR and it's covariates + */ + @ArgumentCollection + private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + /** + * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency + * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag + * to safely access only one table. There may be some CPU cost, but as long as the table is really big + * there should be relatively little CPU costs. + */ + @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) + public boolean lowMemoryMode = false; + + @Advanced + @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) + public double BAQGOP = BAQ.DEFAULT_GOP; + + /** + * an object that keeps track of the information necessary for quality score quantization + */ + private QuantizationInfo quantizationInfo; + + /** + * list to hold the all the covariate objects that were requested (required + standard + experimental) + */ + private Covariate[] requestedCovariates; + + private RecalibrationEngine recalibrationEngine; + + private int minimumQToUse; + + private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."; + + private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector + private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation + private final static byte NO_BAQ_UNCERTAINTY = (byte)'@'; + + /** + * Parse the -cov arguments and create a list of covariates to be used here + * Based on the covariates' estimates for initial capacity allocate the data hashmap + */ + public void initialize() { + baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty + + if (RAC.FORCE_PLATFORM != null) + RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; + + if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified + throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION); + + if (RAC.LIST_ONLY) { + RecalUtils.listAvailableCovariates(logger); + System.exit(0); + } + RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table + + Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates + ArrayList requiredCovariates = covariates.getFirst(); + ArrayList optionalCovariates = covariates.getSecond(); + + requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; + int covariateIndex = 0; + for (final Covariate covariate : requiredCovariates) + requestedCovariates[covariateIndex++] = covariate; + for (final Covariate covariate : optionalCovariates) + requestedCovariates[covariateIndex++] = covariate; + + logger.info("The covariates being used here: "); + for (Covariate cov : requestedCovariates) { // list all the covariates being used + logger.info("\t" + cov.getClass().getSimpleName()); + cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + } + + try { + RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e); + } + + initializeRecalibrationEngine(); + minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; + referenceReader = getToolkit().getReferenceDataSource().getReference(); + } + + /** + * Initialize the recalibration engine + */ + private void initializeRecalibrationEngine() { + int numReadGroups = 0; + for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) + numReadGroups += header.getReadGroups().size(); + + recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode); + } + + private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) { + return read.getBaseQualities()[offset] < minimumQToUse; + } + + /** + * For each read at this locus get the various covariate values and increment that location in the map based on + * whether or not the base matches the reference at this particular location + */ + public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) { + + final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) ); + if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it + + RecalUtils.parsePlatformForRead(read, RAC); + if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls + return 0L; // skip this read completely + } + + final int[] isSNP = calculateIsSNP(read, ref, originalRead); + final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); + final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); + final int nErrors = nEvents(isSNP, isInsertion, isDeletion); + + // note for efficiency regions we don't compute the BAQ array unless we actually have + // some error to marginalize over. For ILMN data ~85% of reads have no error + final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read); + + if( baqArray != null ) { // some reads just can't be BAQ'ed + final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates); + final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases + final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray); + final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray); + final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray); + + // aggregate all of the info into our info object, and update the data + final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors); + recalibrationEngine.updateDataForRead(info); + return 1L; + } else { + return 0L; + } + } + + /** + * Compute the number of mutational events across all hasEvent vectors + * + * Simply the sum of entries in hasEvents + * + * @param hasEvents a vector a vectors of 0 (no event) and 1 (has event) + * @return the total number of events across all hasEvent arrays + */ + private int nEvents(final int[]... hasEvents) { + int n = 0; + for ( final int[] hasEvent : hasEvents ) { + n += MathUtils.sum(hasEvent); + } + return n; + } + + protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) { + final byte[] bases = read.getReadBases(); + final boolean[] skip = new boolean[bases.length]; + final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites)); + for( int iii = 0; iii < bases.length; iii++ ) { + skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii); + } + return skip; + } + + protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) { + return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset); + } + + protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) { + final int readLength = read.getReadBases().length; + final boolean[] knownSites = new boolean[readLength]; + Arrays.fill(knownSites, false); + for( final Feature f : features ) { + int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? + if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureStartOnRead = 0; + } + + int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true); + if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureEndOnRead = readLength; + } + + if( featureStartOnRead > readLength ) { + featureStartOnRead = featureEndOnRead = readLength; + } + + Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true); + } + return knownSites; + } + + // BUGBUG: can be merged with calculateIsIndel + protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) { + final byte[] readBases = read.getReadBases(); + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd()); + final int[] snp = new int[readBases.length]; + int readPos = 0; + int refPos = 0; + for ( final CigarElement ce : read.getCigar().getCigarElements() ) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + for( int iii = 0; iii < elementLength; iii++ ) { + snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 ); + readPos++; + refPos++; + } + break; + case D: + case N: + refPos += elementLength; + break; + case I: + case S: // ReferenceContext doesn't have the soft clipped bases! + readPos += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return snp; + } + + protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) { + final byte[] readBases = read.getReadBases(); + final int[] indel = new int[readBases.length]; + Arrays.fill(indel, 0); + int readPos = 0; + for ( final CigarElement ce : read.getCigar().getCigarElements() ) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + case S: + { + readPos += elementLength; + break; + } + case D: + { + final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) ); + indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 ); + break; + } + case I: + { + final boolean forwardStrandRead = !read.getReadNegativeStrandFlag(); + if( forwardStrandRead ) { + indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 ); + } + for (int iii = 0; iii < elementLength; iii++) { + readPos++; + } + if( !forwardStrandRead ) { + indel[(readPos < indel.length ? readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 ); + } + break; + } + case N: + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return indel; + } + + protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) { + if(errorArray.length != baqArray.length ) { + throw new ReviewedStingException("Array length mismatch detected. Malformed read?"); + } + + final int BLOCK_START_UNSET = -1; + + final double[] fractionalErrors = new double[baqArray.length]; + Arrays.fill(fractionalErrors, 0.0); + boolean inBlock = false; + int blockStartIndex = BLOCK_START_UNSET; + int iii; + for( iii = 0; iii < fractionalErrors.length; iii++ ) { + if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) { + if( !inBlock ) { + fractionalErrors[iii] = (double) errorArray[iii]; + } else { + calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors); + inBlock = false; // reset state variables + blockStartIndex = BLOCK_START_UNSET; // reset state variables + } + } else { + inBlock = true; + if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; } + } + } + if( inBlock ) { + calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors); + } + if( fractionalErrors.length != errorArray.length ) { + throw new ReviewedStingException("Output array length mismatch detected. Malformed read?"); + } + return fractionalErrors; + } + + private static void calculateAndStoreErrorsInBlock( final int iii, + final int blockStartIndex, + final int[] errorArray, + final double[] fractionalErrors ) { + int totalErrors = 0; + for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) { + totalErrors += errorArray[jjj]; + } + for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) { + fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1)); + } + } + + /** + * Create a BAQ style array that indicates no alignment uncertainty + * @param read the read for which we want a BAQ array + * @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values + * // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above + */ + private byte[] flatBAQArray(final GATKSAMRecord read) { + final byte[] baq = new byte[read.getReadLength()]; + Arrays.fill(baq, NO_BAQ_UNCERTAINTY); + return baq; + } + + /** + * Compute an actual BAQ array for read, based on its quals and the reference sequence + * @param read the read to BAQ + * @return a non-null BAQ tag array for read + */ + private byte[] calculateBAQArray( final GATKSAMRecord read ) { + baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG); + return BAQ.getBAQTag(read); + } + + /** + * Initialize the reduce step by returning 0L + * + * @return returns 0L + */ + public Long reduceInit() { + return 0L; + } + + /** + * The Reduce method doesn't do anything for this walker. + * + * @param mapped Result of the map. This value is immediately ignored. + * @param sum The summing CountedData used to output the CSV data + * @return returns The sum used to output the CSV data + */ + public Long reduce(Long mapped, Long sum) { + sum += mapped; + return sum; + } + + @Override + public void onTraversalDone(Long result) { + recalibrationEngine.finalizeData(); + + logger.info("Calculating quantized quality scores..."); + quantizeQualityScores(); + + logger.info("Writing recalibration report..."); + generateReport(); + logger.info("...done!"); + + if ( RAC.RECAL_PDF_FILE != null ) { + logger.info("Generating recalibration plots..."); + generatePlots(); + } + + logger.info("Processed: " + result + " reads"); + } + + private RecalibrationTables getRecalibrationTable() { + return recalibrationEngine.getFinalRecalibrationTables(); + } + + private void generatePlots() { + File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE; + if (recalFile != null) { + RecalibrationReport report = new RecalibrationReport(recalFile); + RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), getRecalibrationTable(), requestedCovariates); + } + else + RecalUtils.generateRecalibrationPlot(RAC, getRecalibrationTable(), requestedCovariates); + } + + /** + * go through the quality score table and use the # observations and the empirical quality score + * to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to + * generate a quantization map (recalibrated_qual -> quantized_qual) + */ + private void quantizeQualityScores() { + quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS); + } + + private void generateReport() { + RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java new file mode 100644 index 000000000..b884b89db --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 12/18/12 + * Time: 3:50 PM + * + * TODO -- merge in ReadCovariates? + */ +public final class ReadRecalibrationInfo { + private final GATKSAMRecord read; + private final int length; + private final ReadCovariates covariates; + private final boolean[] skips; + private final byte[] baseQuals, insertionQuals, deletionQuals; + private final double[] snpErrors, insertionErrors, deletionErrors; + + public ReadRecalibrationInfo(final GATKSAMRecord read, + final ReadCovariates covariates, + final boolean[] skips, + final double[] snpErrors, + final double[] insertionErrors, + final double[] deletionErrors) { + if ( read == null ) throw new IllegalArgumentException("read cannot be null"); + if ( covariates == null ) throw new IllegalArgumentException("covariates cannot be null"); + if ( skips == null ) throw new IllegalArgumentException("skips cannot be null"); + if ( snpErrors == null ) throw new IllegalArgumentException("snpErrors cannot be null"); + if ( insertionErrors == null ) throw new IllegalArgumentException("insertionErrors cannot be null"); + if ( deletionErrors == null ) throw new IllegalArgumentException("deletionErrors cannot be null"); + + this.read = read; + this.baseQuals = read.getBaseQualities(); + this.length = baseQuals.length; + this.covariates = covariates; + this.skips = skips; + this.insertionQuals = read.getExistingBaseInsertionQualities(); + this.deletionQuals = read.getExistingBaseDeletionQualities(); + this.snpErrors = snpErrors; + this.insertionErrors = insertionErrors; + this.deletionErrors = deletionErrors; + + if ( skips.length != length ) throw new IllegalArgumentException("skips.length " + snpErrors.length + " != length " + length); + if ( snpErrors.length != length ) throw new IllegalArgumentException("snpErrors.length " + snpErrors.length + " != length " + length); + if ( insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length); + if ( deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length); + } + + /** + * Get the qual score for event type at offset + * + * @param eventType the type of event we want the qual for + * @param offset the offset into this read for the qual + * @return a valid quality score for event at offset + */ + @Requires("validOffset(offset)") + @Ensures("validQual(result)") + public byte getQual(final EventType eventType, final int offset) { + switch ( eventType ) { + case BASE_SUBSTITUTION: return baseQuals[offset]; + // note optimization here -- if we don't have ins/del quals we just return the default byte directly + case BASE_INSERTION: return insertionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : insertionQuals[offset]; + case BASE_DELETION: return deletionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : deletionQuals[offset]; + default: throw new IllegalStateException("Unknown event type " + eventType); + } + } + + /** + * Get the error fraction for event type at offset + * + * The error fraction is a value between 0 and 1 that indicates how much certainty we have + * in the error occurring at offset. A value of 1 means that the error definitely occurs at this + * site, a value of 0.0 means it definitely doesn't happen here. 0.5 means that half the weight + * of the error belongs here + * + * @param eventType the type of event we want the qual for + * @param offset the offset into this read for the qual + * @return a fractional weight for an error at this offset + */ + @Requires("validOffset(offset)") + @Ensures("result >= 0.0 && result <= 1.0") + public double getErrorFraction(final EventType eventType, final int offset) { + switch ( eventType ) { + case BASE_SUBSTITUTION: return snpErrors[offset]; + case BASE_INSERTION: return insertionErrors[offset]; + case BASE_DELETION: return deletionErrors[offset]; + default: throw new IllegalStateException("Unknown event type " + eventType); + } + } + + /** + * Get the read involved in this recalibration info + * @return a non-null GATKSAMRecord + */ + @Ensures("result != null") + public GATKSAMRecord getRead() { + return read; + } + + /** + * Should offset in this read be skipped (because it's covered by a known variation site?) + * @param offset a valid offset into this info + * @return true if offset should be skipped, false otherwise + */ + @Requires("validOffset(offset)") + public boolean skip(final int offset) { + return skips[offset]; + } + + /** + * Get the ReadCovariates object carrying the mapping from offsets -> covariate key sets + * @return a non-null ReadCovariates object + */ + @Ensures("result != null") + public ReadCovariates getCovariatesValues() { + return covariates; + } + + /** + * Ensures an offset is valid. Used in contracts + * @param offset a proposed offset + * @return true if offset is valid w.r.t. the data in this object, false otherwise + */ + private boolean validOffset(final int offset) { + return offset >= 0 && offset < baseQuals.length; + } + + private boolean validQual(final byte result) { + return result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java new file mode 100755 index 000000000..622413b18 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.recalibration.RecalUtils; + +import java.io.File; +import java.io.PrintStream; +import java.util.Collections; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 27, 2009 + * + * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker. + * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated. + */ + +public class RecalibrationArgumentCollection { + + /** + * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference, + * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.) + * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites. + * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument. + */ + @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false) + public List> knownSites = Collections.emptyList(); + + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out. + */ + @Gather(BQSRGatherer.class) + @Output(doc = "The output recalibration table file to create", required = true) + public File RECAL_TABLE_FILE = null; + public PrintStream RECAL_TABLE; + + /** + * If not provided, then no plots will be generated (useful for queue scatter/gathering). + * However, we *highly* recommend that users generate these plots whenever possible for QC checking. + */ + @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false) + public File RECAL_PDF_FILE = null; + + /** + * If not provided, then a temporary file is created and then deleted upon completion. + * For advanced users only. + */ + @Advanced + @Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false) + public File RECAL_CSV_FILE = null; + + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. + */ + @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) + public boolean LIST_ONLY = false; + + /** + * Note that the ReadGroup and QualityScore covariates are required and do not need to be specified. + * Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default. + * Use the --list argument to see the available covariates. + */ + @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false) + public String[] COVARIATES = null; + + /* + * The Cycle and Context covariates are standard and are included by default unless this argument is provided. + * Note that the ReadGroup and QualityScore covariates are required and cannot be excluded. + */ + @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) + public boolean DO_NOT_USE_STANDARD_COVARIATES = false; + + /** + * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. + */ + @Advanced + @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") + public boolean RUN_WITHOUT_DBSNP = false; + + /** + * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the + * reads which have had the reference inserted because of color space inconsistencies. + */ + @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") + public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO; + + /** + * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles + * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in + * their color space tag can not be recalibrated. + */ + @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false) + public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + + /** + * The context covariate will use a context of this size to calculate it's covariate value for base mismatches + */ + @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false) + public int MISMATCHES_CONTEXT_SIZE = 2; + + /** + * The context covariate will use a context of this size to calculate it's covariate value for base insertions and deletions + */ + @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false) + public int INDELS_CONTEXT_SIZE = 3; + + /** + * The cycle covariate will generate an error if it encounters a cycle greater than this value. + * This argument is ignored if the Cycle covariate is not used. + */ + @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false) + public int MAXIMUM_CYCLE_VALUE = 500; + + /** + * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off) + */ + @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false) + public byte MISMATCHES_DEFAULT_QUALITY = -1; + + /** + * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on) + */ + @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false) + public byte INSERTIONS_DEFAULT_QUALITY = 45; + + /** + * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off) + */ + @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) + public byte DELETIONS_DEFAULT_QUALITY = 45; + + /** + * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality + */ + @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) + public byte LOW_QUAL_TAIL = 2; + + /** + * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. + * This parameter tells BQSR the number of levels of quantization to use to build the quantization table. + */ + @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") + public int QUANTIZING_LEVELS = 16; + + /** + * The tag name for the binary tag covariate (if using it) + */ + @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") + public String BINARY_TAG_NAME = null; + + /* + * whether GATK report tables should have rows in sorted order, starting from leftmost column + */ + @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false) + public Boolean SORT_BY_ALL_COLUMNS = false; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + + @Hidden + @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") + public String DEFAULT_PLATFORM = null; + + @Hidden + @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") + public String FORCE_PLATFORM = null; + + @Hidden + @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only") + public PrintStream RECAL_TABLE_UPDATE_LOG = null; + + public File existingRecalibrationReport = null; + + public GATKReportTable generateReportTable(final String covariateNames) { + GATKReportTable argumentsTable; + if(SORT_BY_ALL_COLUMNS) { + argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); + } else { + argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); + } + argumentsTable.addColumn("Argument"); + argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); + argumentsTable.addRowID("covariate", true); + argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames); + argumentsTable.addRowID("no_standard_covs", true); + argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES); + argumentsTable.addRowID("run_without_dbsnp", true); + argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP); + argumentsTable.addRowID("solid_recal_mode", true); + argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE); + argumentsTable.addRowID("solid_nocall_strategy", true); + argumentsTable.set("solid_nocall_strategy", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); + argumentsTable.addRowID("mismatches_context_size", true); + argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); + argumentsTable.addRowID("indels_context_size", true); + argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE); + argumentsTable.addRowID("mismatches_default_quality", true); + argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); + argumentsTable.addRowID("insertions_default_quality", true); + argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY); + argumentsTable.addRowID("low_quality_tail", true); + argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL); + argumentsTable.addRowID("default_platform", true); + argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); + argumentsTable.addRowID("force_platform", true); + argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); + argumentsTable.addRowID("quantizing_levels", true); + argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); + argumentsTable.addRowID("recalibration_report", true); + argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath()); + argumentsTable.addRowID("plot_pdf_file", true); + argumentsTable.set("plot_pdf_file", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RECAL_PDF_FILE == null ? "null" : RECAL_PDF_FILE.getAbsolutePath()); + argumentsTable.addRowID("binary_tag_name", true); + argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); + return argumentsTable; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java new file mode 100644 index 000000000..ca9fb4bca --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import org.broadinstitute.sting.utils.recalibration.*; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; +import java.util.LinkedList; +import java.util.List; + +public class RecalibrationEngine { + final protected Covariate[] covariates; + final private int numReadGroups; + final private PrintStream maybeLogStream; + final private boolean lowMemoryMode; + + /** + * Has finalizeData() been called? + */ + private boolean finalized = false; + + /** + * The final (merged, etc) recalibration tables, suitable for downstream analysis. + */ + private RecalibrationTables finalRecalibrationTables = null; + + private final List recalibrationTablesList = new LinkedList(); + + private final ThreadLocal threadLocalTables = new ThreadLocal() { + private synchronized RecalibrationTables makeAndCaptureTable() { + final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); + recalibrationTablesList.add(newTable); + return newTable; + } + + @Override + protected synchronized RecalibrationTables initialValue() { + if ( lowMemoryMode ) { + return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0); + } else { + return makeAndCaptureTable(); + } + } + }; + + /** + * Get a recalibration table suitable for updating the underlying RecalDatums + * + * May return a thread-local version, or a single version, depending on the initialization + * arguments of this instance. + * + * @return updated tables + */ + protected RecalibrationTables getUpdatableRecalibrationTables() { + return threadLocalTables.get(); + } + + /** + * Initialize the recalibration engine + * + * Called once before any calls to updateDataForRead are made. The engine should prepare itself + * to handle any number of updateDataForRead calls containing ReadRecalibrationInfo containing + * keys for each of the covariates provided. + * + * The engine should collect match and mismatch data into the recalibrationTables data. + * + * @param covariates an array of the covariates we'll be using in this engine, order matters + * @param numReadGroups the number of read groups we should use for the recalibration tables + * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables + */ + public RecalibrationEngine(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream, final boolean enableLowMemoryMode) { + if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); + if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); + + this.covariates = covariates.clone(); + this.numReadGroups = numReadGroups; + this.maybeLogStream = maybeLogStream; + this.lowMemoryMode = enableLowMemoryMode; + } + + /** + * Update the recalibration statistics using the information in recalInfo + * @param recalInfo data structure holding information about the recalibration values for a single read + */ + @Requires("recalInfo != null") + public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { + final GATKSAMRecord read = recalInfo.getRead(); + final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); + final RecalibrationTables tables = getUpdatableRecalibrationTables(); + final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); + + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { + if( ! recalInfo.skip(offset) ) { + + for (final EventType eventType : EventType.values()) { + final int[] keys = readCovariates.getKeySet(offset, eventType); + final int eventIndex = eventType.ordinal(); + final byte qual = recalInfo.getQual(eventType, offset); + final double isError = recalInfo.getErrorFraction(eventType, offset); + + RecalUtils.incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); + + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + + RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + } + } + } + } + } + + + /** + * Finalize, if appropriate, all derived data in recalibrationTables. + * + * Called once after all calls to updateDataForRead have been issued. + * + * Assumes that all of the principal tables (by quality score) have been completely updated, + * and walks over this data to create summary data tables like by read group table. + */ + public void finalizeData() { + if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); + + // merge all of the thread-local tables + finalRecalibrationTables = mergeThreadLocalRecalibrationTables(); + + final NestedIntegerArray byReadGroupTable = finalRecalibrationTables.getReadGroupTable(); + final NestedIntegerArray byQualTable = finalRecalibrationTables.getQualityScoreTable(); + + // iterate over all values in the qual table + for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { + final int rgKey = leaf.keys[0]; + final int eventIndex = leaf.keys[2]; + final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex); + final RecalDatum qualDatum = leaf.value; + + if ( rgDatum == null ) { + // create a copy of qualDatum, and initialize byReadGroup table with it + byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex); + } else { + // combine the qual datum with the existing datum in the byReadGroup table + rgDatum.combine(qualDatum); + } + } + + finalized = true; + } + + /** + * Merge all of the thread local recalibration tables into a single one. + * + * Reuses one of the recalibration tables to hold the merged table, so this function can only be + * called once in the engine. + * + * @return the merged recalibration table + */ + @Requires("! finalized") + private RecalibrationTables mergeThreadLocalRecalibrationTables() { + if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); + + RecalibrationTables merged = null; + for ( final RecalibrationTables table : recalibrationTablesList ) { + if ( merged == null ) + // fast path -- if there's only only one table, so just make it the merged one + merged = table; + else { + merged.combine(table); + } + } + + return merged; + } + + /** + * Get the final recalibration tables, after finalizeData() has been called + * + * This returns the finalized recalibration table collected by this engine. + * + * It is an error to call this function before finalizeData has been called + * + * @return the finalized recalibration table collected by this engine + */ + public RecalibrationTables getFinalRecalibrationTables() { + if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); + return finalRecalibrationTables; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java new file mode 100755 index 000000000..dc7c27db0 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.recalibration.*; + +import java.io.*; + +/** + */ + +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) +@PartitionBy(PartitionType.READ) +public class RecalibrationPerformance extends RodWalker implements NanoSchedulable { + + @Output(doc="Write output to this file", required = true) + public PrintStream out; + + @Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file") + public File RECAL_FILE = null; + + public void initialize() { + out.println("Cycle\tQrep\tQemp\tIsJoint\tObservations\tErrors"); + + final GATKReport report = new GATKReport(RECAL_FILE); + final GATKReportTable table = report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE); + for ( int row = 0; row < table.getNumRows(); row++ ) { + + final int nObservations = (int)asDouble(table.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); + final int nErrors = (int)Math.round(asDouble(table.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME))); + final double empiricalQuality = asDouble(table.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); + + final byte QReported = Byte.parseByte((String) table.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); + + final double jointEstimateQemp = RecalDatum.bayesianEstimateOfEmpiricalQuality(nObservations, nErrors, QReported); + + //if ( Math.abs((int)(jointEstimateQemp - empiricalQuality)) > 1 ) + // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f, point Qemp = %f, joint Qemp = %f", estimatedQReported, nObservations, nErrors, empiricalQuality, jointEstimateQemp)); + + if ( table.get(row, RecalUtils.COVARIATE_NAME_COLUMN_NAME).equals("Cycle") && + table.get(row, RecalUtils.EVENT_TYPE_COLUMN_NAME).equals("M") && + table.get(row, RecalUtils.READGROUP_COLUMN_NAME).equals("20FUKAAXX100202.6") && + (QReported == 6 || QReported == 10 || QReported == 20 || QReported == 30 || QReported == 45) ) { + out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, Math.round(empiricalQuality), "False", (int)nObservations, (int)nErrors)); + out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, (int)jointEstimateQemp, "True", (int)nObservations, (int)nErrors)); + } + } + + } + + @Override + public boolean isDone() { + return true; + } + + private double asDouble(final Object o) { + if ( o instanceof Double ) + return (Double)o; + else if ( o instanceof Integer ) + return (Integer)o; + else if ( o instanceof Long ) + return (Long)o; + else + throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass()); + } + + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; } + + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(Integer counter, Integer sum) { return 0; } + + @Override + public void onTraversalDone(Integer sum) {} +} \ No newline at end of file