diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
new file mode 100755
index 000000000..dbb628135
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2011 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import org.broadinstitute.sting.commandline.Gatherer;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.recalibration.RecalUtils;
+import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.util.List;
+
+/**
+ * User: carneiro
+ * Date: 3/29/11
+ */
+
+
+public class BQSRGatherer extends Gatherer {
+
+ private static final String EMPTY_INPUT_LIST = "list of inputs files is empty";
+ private static final String MISSING_OUTPUT_FILE = "missing output file name";
+
+ @Override
+ public void gather(List inputs, File output) {
+ final PrintStream outputFile;
+ try {
+ outputFile = new PrintStream(output);
+ } catch(FileNotFoundException e) {
+ throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
+ }
+
+ RecalibrationReport generalReport = null;
+ for (File input : inputs) {
+ final RecalibrationReport inputReport = new RecalibrationReport(input);
+ if (generalReport == null)
+ generalReport = inputReport;
+ else
+ generalReport.combine(inputReport);
+ }
+ if (generalReport == null)
+ throw new ReviewedStingException(EMPTY_INPUT_LIST);
+
+ generalReport.calculateQuantizedQualities();
+
+ RecalibrationArgumentCollection RAC = generalReport.getRAC();
+ if ( RAC.RECAL_PDF_FILE != null ) {
+ RAC.RECAL_TABLE_FILE = output;
+ if ( RAC.existingRecalibrationReport != null ) {
+ final RecalibrationReport originalReport = new RecalibrationReport(RAC.existingRecalibrationReport);
+ RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates());
+ }
+ else {
+ RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates());
+ }
+ }
+
+ generalReport.output(outputFile);
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
new file mode 100755
index 000000000..3b0cb07d5
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import net.sf.picard.reference.IndexedFastaSequenceFile;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.SAMFileHeader;
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.commandline.Advanced;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.ArgumentCollection;
+import org.broadinstitute.sting.gatk.CommandLineGATK;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.filters.*;
+import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.variant.utils.BaseUtils;
+import org.broadinstitute.sting.utils.baq.BAQ;
+import org.broadinstitute.sting.utils.clipping.ReadClipper;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.recalibration.*;
+import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
+ *
+ *
+ * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
+ * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
+ * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
+ * reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical
+ * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
+ * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score).
+ *
+ * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
+ *
+ *
+ *
+ *
Input
+ *
+ * The input read data whose base quality scores need to be assessed.
+ *
+ * A database of known polymorphic sites to skip over.
+ *
+ *
+ * Output
+ *
+ * A GATK Report file with many tables:
+ *
+ * - The list of arguments
+ * - The quantized qualities table
+ * - The recalibration table by read group
+ * - The recalibration table by quality score
+ * - The recalibration table for all the optional covariates
+ *
+ *
+ * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ * -T BaseRecalibrator \
+ * -I my_reads.bam \
+ * -R resources/Homo_sapiens_assembly18.fasta \
+ * -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
+ * -knownSites another/optional/setOfSitesToMask.vcf \
+ * -o recal_data.grp
+ *
+ */
+
+@DocumentedGATKFeature(groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class})
+@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
+@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
+@PartitionBy(PartitionType.READ)
+public class BaseRecalibrator extends ReadWalker implements NanoSchedulable {
+ /**
+ * all the command line arguments for BQSR and it's covariates
+ */
+ @ArgumentCollection
+ private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
+
+ /**
+ * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency
+ * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag
+ * to safely access only one table. There may be some CPU cost, but as long as the table is really big
+ * there should be relatively little CPU costs.
+ */
+ @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false)
+ public boolean lowMemoryMode = false;
+
+ @Advanced
+ @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
+ public double BAQGOP = BAQ.DEFAULT_GOP;
+
+ /**
+ * an object that keeps track of the information necessary for quality score quantization
+ */
+ private QuantizationInfo quantizationInfo;
+
+ /**
+ * list to hold the all the covariate objects that were requested (required + standard + experimental)
+ */
+ private Covariate[] requestedCovariates;
+
+ private RecalibrationEngine recalibrationEngine;
+
+ private int minimumQToUse;
+
+ private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.";
+
+ private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
+ private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
+ private final static byte NO_BAQ_UNCERTAINTY = (byte)'@';
+
+ /**
+ * Parse the -cov arguments and create a list of covariates to be used here
+ * Based on the covariates' estimates for initial capacity allocate the data hashmap
+ */
+ public void initialize() {
+ baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty
+
+ if (RAC.FORCE_PLATFORM != null)
+ RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
+
+ if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified
+ throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION);
+
+ if (RAC.LIST_ONLY) {
+ RecalUtils.listAvailableCovariates(logger);
+ System.exit(0);
+ }
+ RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table
+
+ Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates
+ ArrayList requiredCovariates = covariates.getFirst();
+ ArrayList optionalCovariates = covariates.getSecond();
+
+ requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()];
+ int covariateIndex = 0;
+ for (final Covariate covariate : requiredCovariates)
+ requestedCovariates[covariateIndex++] = covariate;
+ for (final Covariate covariate : optionalCovariates)
+ requestedCovariates[covariateIndex++] = covariate;
+
+ logger.info("The covariates being used here: ");
+ for (Covariate cov : requestedCovariates) { // list all the covariates being used
+ logger.info("\t" + cov.getClass().getSimpleName());
+ cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
+ }
+
+ try {
+ RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE);
+ } catch (IOException e) {
+ throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e);
+ }
+
+ initializeRecalibrationEngine();
+ minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
+ referenceReader = getToolkit().getReferenceDataSource().getReference();
+ }
+
+ /**
+ * Initialize the recalibration engine
+ */
+ private void initializeRecalibrationEngine() {
+ int numReadGroups = 0;
+ for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() )
+ numReadGroups += header.getReadGroups().size();
+
+ recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode);
+ }
+
+ private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) {
+ return read.getBaseQualities()[offset] < minimumQToUse;
+ }
+
+ /**
+ * For each read at this locus get the various covariate values and increment that location in the map based on
+ * whether or not the base matches the reference at this particular location
+ */
+ public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) {
+
+ final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) );
+ if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it
+
+ RecalUtils.parsePlatformForRead(read, RAC);
+ if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls
+ return 0L; // skip this read completely
+ }
+
+ final int[] isSNP = calculateIsSNP(read, ref, originalRead);
+ final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
+ final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
+ final int nErrors = nEvents(isSNP, isInsertion, isDeletion);
+
+ // note for efficiency regions we don't compute the BAQ array unless we actually have
+ // some error to marginalize over. For ILMN data ~85% of reads have no error
+ final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read);
+
+ if( baqArray != null ) { // some reads just can't be BAQ'ed
+ final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates);
+ final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases
+ final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray);
+ final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray);
+ final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray);
+
+ // aggregate all of the info into our info object, and update the data
+ final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors);
+ recalibrationEngine.updateDataForRead(info);
+ return 1L;
+ } else {
+ return 0L;
+ }
+ }
+
+ /**
+ * Compute the number of mutational events across all hasEvent vectors
+ *
+ * Simply the sum of entries in hasEvents
+ *
+ * @param hasEvents a vector a vectors of 0 (no event) and 1 (has event)
+ * @return the total number of events across all hasEvent arrays
+ */
+ private int nEvents(final int[]... hasEvents) {
+ int n = 0;
+ for ( final int[] hasEvent : hasEvents ) {
+ n += MathUtils.sum(hasEvent);
+ }
+ return n;
+ }
+
+ protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
+ final byte[] bases = read.getReadBases();
+ final boolean[] skip = new boolean[bases.length];
+ final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites));
+ for( int iii = 0; iii < bases.length; iii++ ) {
+ skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii);
+ }
+ return skip;
+ }
+
+ protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) {
+ return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset);
+ }
+
+ protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) {
+ final int readLength = read.getReadBases().length;
+ final boolean[] knownSites = new boolean[readLength];
+ Arrays.fill(knownSites, false);
+ for( final Feature f : features ) {
+ int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
+ if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
+ featureStartOnRead = 0;
+ }
+
+ int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true);
+ if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
+ featureEndOnRead = readLength;
+ }
+
+ if( featureStartOnRead > readLength ) {
+ featureStartOnRead = featureEndOnRead = readLength;
+ }
+
+ Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true);
+ }
+ return knownSites;
+ }
+
+ // BUGBUG: can be merged with calculateIsIndel
+ protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) {
+ final byte[] readBases = read.getReadBases();
+ final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd());
+ final int[] snp = new int[readBases.length];
+ int readPos = 0;
+ int refPos = 0;
+ for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
+ final int elementLength = ce.getLength();
+ switch (ce.getOperator()) {
+ case M:
+ case EQ:
+ case X:
+ for( int iii = 0; iii < elementLength; iii++ ) {
+ snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 );
+ readPos++;
+ refPos++;
+ }
+ break;
+ case D:
+ case N:
+ refPos += elementLength;
+ break;
+ case I:
+ case S: // ReferenceContext doesn't have the soft clipped bases!
+ readPos += elementLength;
+ break;
+ case H:
+ case P:
+ break;
+ default:
+ throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
+ }
+ }
+ return snp;
+ }
+
+ protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) {
+ final byte[] readBases = read.getReadBases();
+ final int[] indel = new int[readBases.length];
+ Arrays.fill(indel, 0);
+ int readPos = 0;
+ for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
+ final int elementLength = ce.getLength();
+ switch (ce.getOperator()) {
+ case M:
+ case EQ:
+ case X:
+ case S:
+ {
+ readPos += elementLength;
+ break;
+ }
+ case D:
+ {
+ final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) );
+ indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 );
+ break;
+ }
+ case I:
+ {
+ final boolean forwardStrandRead = !read.getReadNegativeStrandFlag();
+ if( forwardStrandRead ) {
+ indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
+ }
+ for (int iii = 0; iii < elementLength; iii++) {
+ readPos++;
+ }
+ if( !forwardStrandRead ) {
+ indel[(readPos < indel.length ? readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
+ }
+ break;
+ }
+ case N:
+ case H:
+ case P:
+ break;
+ default:
+ throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
+ }
+ }
+ return indel;
+ }
+
+ protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) {
+ if(errorArray.length != baqArray.length ) {
+ throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
+ }
+
+ final int BLOCK_START_UNSET = -1;
+
+ final double[] fractionalErrors = new double[baqArray.length];
+ Arrays.fill(fractionalErrors, 0.0);
+ boolean inBlock = false;
+ int blockStartIndex = BLOCK_START_UNSET;
+ int iii;
+ for( iii = 0; iii < fractionalErrors.length; iii++ ) {
+ if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) {
+ if( !inBlock ) {
+ fractionalErrors[iii] = (double) errorArray[iii];
+ } else {
+ calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors);
+ inBlock = false; // reset state variables
+ blockStartIndex = BLOCK_START_UNSET; // reset state variables
+ }
+ } else {
+ inBlock = true;
+ if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; }
+ }
+ }
+ if( inBlock ) {
+ calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors);
+ }
+ if( fractionalErrors.length != errorArray.length ) {
+ throw new ReviewedStingException("Output array length mismatch detected. Malformed read?");
+ }
+ return fractionalErrors;
+ }
+
+ private static void calculateAndStoreErrorsInBlock( final int iii,
+ final int blockStartIndex,
+ final int[] errorArray,
+ final double[] fractionalErrors ) {
+ int totalErrors = 0;
+ for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) {
+ totalErrors += errorArray[jjj];
+ }
+ for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) {
+ fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1));
+ }
+ }
+
+ /**
+ * Create a BAQ style array that indicates no alignment uncertainty
+ * @param read the read for which we want a BAQ array
+ * @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values
+ * // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above
+ */
+ private byte[] flatBAQArray(final GATKSAMRecord read) {
+ final byte[] baq = new byte[read.getReadLength()];
+ Arrays.fill(baq, NO_BAQ_UNCERTAINTY);
+ return baq;
+ }
+
+ /**
+ * Compute an actual BAQ array for read, based on its quals and the reference sequence
+ * @param read the read to BAQ
+ * @return a non-null BAQ tag array for read
+ */
+ private byte[] calculateBAQArray( final GATKSAMRecord read ) {
+ baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG);
+ return BAQ.getBAQTag(read);
+ }
+
+ /**
+ * Initialize the reduce step by returning 0L
+ *
+ * @return returns 0L
+ */
+ public Long reduceInit() {
+ return 0L;
+ }
+
+ /**
+ * The Reduce method doesn't do anything for this walker.
+ *
+ * @param mapped Result of the map. This value is immediately ignored.
+ * @param sum The summing CountedData used to output the CSV data
+ * @return returns The sum used to output the CSV data
+ */
+ public Long reduce(Long mapped, Long sum) {
+ sum += mapped;
+ return sum;
+ }
+
+ @Override
+ public void onTraversalDone(Long result) {
+ recalibrationEngine.finalizeData();
+
+ logger.info("Calculating quantized quality scores...");
+ quantizeQualityScores();
+
+ logger.info("Writing recalibration report...");
+ generateReport();
+ logger.info("...done!");
+
+ if ( RAC.RECAL_PDF_FILE != null ) {
+ logger.info("Generating recalibration plots...");
+ generatePlots();
+ }
+
+ logger.info("Processed: " + result + " reads");
+ }
+
+ private RecalibrationTables getRecalibrationTable() {
+ return recalibrationEngine.getFinalRecalibrationTables();
+ }
+
+ private void generatePlots() {
+ File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE;
+ if (recalFile != null) {
+ RecalibrationReport report = new RecalibrationReport(recalFile);
+ RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), getRecalibrationTable(), requestedCovariates);
+ }
+ else
+ RecalUtils.generateRecalibrationPlot(RAC, getRecalibrationTable(), requestedCovariates);
+ }
+
+ /**
+ * go through the quality score table and use the # observations and the empirical quality score
+ * to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to
+ * generate a quantization map (recalibrated_qual -> quantized_qual)
+ */
+ private void quantizeQualityScores() {
+ quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS);
+ }
+
+ private void generateReport() {
+ RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
+ }
+}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java
new file mode 100644
index 000000000..b884b89db
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.recalibration.EventType;
+import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: depristo
+ * Date: 12/18/12
+ * Time: 3:50 PM
+ *
+ * TODO -- merge in ReadCovariates?
+ */
+public final class ReadRecalibrationInfo {
+ private final GATKSAMRecord read;
+ private final int length;
+ private final ReadCovariates covariates;
+ private final boolean[] skips;
+ private final byte[] baseQuals, insertionQuals, deletionQuals;
+ private final double[] snpErrors, insertionErrors, deletionErrors;
+
+ public ReadRecalibrationInfo(final GATKSAMRecord read,
+ final ReadCovariates covariates,
+ final boolean[] skips,
+ final double[] snpErrors,
+ final double[] insertionErrors,
+ final double[] deletionErrors) {
+ if ( read == null ) throw new IllegalArgumentException("read cannot be null");
+ if ( covariates == null ) throw new IllegalArgumentException("covariates cannot be null");
+ if ( skips == null ) throw new IllegalArgumentException("skips cannot be null");
+ if ( snpErrors == null ) throw new IllegalArgumentException("snpErrors cannot be null");
+ if ( insertionErrors == null ) throw new IllegalArgumentException("insertionErrors cannot be null");
+ if ( deletionErrors == null ) throw new IllegalArgumentException("deletionErrors cannot be null");
+
+ this.read = read;
+ this.baseQuals = read.getBaseQualities();
+ this.length = baseQuals.length;
+ this.covariates = covariates;
+ this.skips = skips;
+ this.insertionQuals = read.getExistingBaseInsertionQualities();
+ this.deletionQuals = read.getExistingBaseDeletionQualities();
+ this.snpErrors = snpErrors;
+ this.insertionErrors = insertionErrors;
+ this.deletionErrors = deletionErrors;
+
+ if ( skips.length != length ) throw new IllegalArgumentException("skips.length " + snpErrors.length + " != length " + length);
+ if ( snpErrors.length != length ) throw new IllegalArgumentException("snpErrors.length " + snpErrors.length + " != length " + length);
+ if ( insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length);
+ if ( deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length);
+ }
+
+ /**
+ * Get the qual score for event type at offset
+ *
+ * @param eventType the type of event we want the qual for
+ * @param offset the offset into this read for the qual
+ * @return a valid quality score for event at offset
+ */
+ @Requires("validOffset(offset)")
+ @Ensures("validQual(result)")
+ public byte getQual(final EventType eventType, final int offset) {
+ switch ( eventType ) {
+ case BASE_SUBSTITUTION: return baseQuals[offset];
+ // note optimization here -- if we don't have ins/del quals we just return the default byte directly
+ case BASE_INSERTION: return insertionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : insertionQuals[offset];
+ case BASE_DELETION: return deletionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : deletionQuals[offset];
+ default: throw new IllegalStateException("Unknown event type " + eventType);
+ }
+ }
+
+ /**
+ * Get the error fraction for event type at offset
+ *
+ * The error fraction is a value between 0 and 1 that indicates how much certainty we have
+ * in the error occurring at offset. A value of 1 means that the error definitely occurs at this
+ * site, a value of 0.0 means it definitely doesn't happen here. 0.5 means that half the weight
+ * of the error belongs here
+ *
+ * @param eventType the type of event we want the qual for
+ * @param offset the offset into this read for the qual
+ * @return a fractional weight for an error at this offset
+ */
+ @Requires("validOffset(offset)")
+ @Ensures("result >= 0.0 && result <= 1.0")
+ public double getErrorFraction(final EventType eventType, final int offset) {
+ switch ( eventType ) {
+ case BASE_SUBSTITUTION: return snpErrors[offset];
+ case BASE_INSERTION: return insertionErrors[offset];
+ case BASE_DELETION: return deletionErrors[offset];
+ default: throw new IllegalStateException("Unknown event type " + eventType);
+ }
+ }
+
+ /**
+ * Get the read involved in this recalibration info
+ * @return a non-null GATKSAMRecord
+ */
+ @Ensures("result != null")
+ public GATKSAMRecord getRead() {
+ return read;
+ }
+
+ /**
+ * Should offset in this read be skipped (because it's covered by a known variation site?)
+ * @param offset a valid offset into this info
+ * @return true if offset should be skipped, false otherwise
+ */
+ @Requires("validOffset(offset)")
+ public boolean skip(final int offset) {
+ return skips[offset];
+ }
+
+ /**
+ * Get the ReadCovariates object carrying the mapping from offsets -> covariate key sets
+ * @return a non-null ReadCovariates object
+ */
+ @Ensures("result != null")
+ public ReadCovariates getCovariatesValues() {
+ return covariates;
+ }
+
+ /**
+ * Ensures an offset is valid. Used in contracts
+ * @param offset a proposed offset
+ * @return true if offset is valid w.r.t. the data in this object, false otherwise
+ */
+ private boolean validOffset(final int offset) {
+ return offset >= 0 && offset < baseQuals.length;
+ }
+
+ private boolean validQual(final byte result) {
+ return result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
new file mode 100755
index 000000000..622413b18
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.report.GATKReportTable;
+import org.broadinstitute.sting.utils.recalibration.RecalUtils;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: rpoplin
+ * Date: Nov 27, 2009
+ *
+ * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
+ * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
+ */
+
+public class RecalibrationArgumentCollection {
+
+ /**
+ * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
+ * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
+ * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
+ * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
+ */
+ @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
+ public List> knownSites = Collections.emptyList();
+
+ /**
+ * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+ * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+ * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
+ * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out.
+ */
+ @Gather(BQSRGatherer.class)
+ @Output(doc = "The output recalibration table file to create", required = true)
+ public File RECAL_TABLE_FILE = null;
+ public PrintStream RECAL_TABLE;
+
+ /**
+ * If not provided, then no plots will be generated (useful for queue scatter/gathering).
+ * However, we *highly* recommend that users generate these plots whenever possible for QC checking.
+ */
+ @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false)
+ public File RECAL_PDF_FILE = null;
+
+ /**
+ * If not provided, then a temporary file is created and then deleted upon completion.
+ * For advanced users only.
+ */
+ @Advanced
+ @Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false)
+ public File RECAL_CSV_FILE = null;
+
+ /**
+ * Note that the --list argument requires a fully resolved and correct command-line to work.
+ */
+ @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
+ public boolean LIST_ONLY = false;
+
+ /**
+ * Note that the ReadGroup and QualityScore covariates are required and do not need to be specified.
+ * Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default.
+ * Use the --list argument to see the available covariates.
+ */
+ @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false)
+ public String[] COVARIATES = null;
+
+ /*
+ * The Cycle and Context covariates are standard and are included by default unless this argument is provided.
+ * Note that the ReadGroup and QualityScore covariates are required and cannot be excluded.
+ */
+ @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false)
+ public boolean DO_NOT_USE_STANDARD_COVARIATES = false;
+
+ /**
+ * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
+ */
+ @Advanced
+ @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
+ public boolean RUN_WITHOUT_DBSNP = false;
+
+ /**
+ * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the
+ * reads which have had the reference inserted because of color space inconsistencies.
+ */
+ @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
+ public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO;
+
+ /**
+ * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles
+ * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
+ * their color space tag can not be recalibrated.
+ */
+ @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
+ public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
+
+ /**
+ * The context covariate will use a context of this size to calculate it's covariate value for base mismatches
+ */
+ @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
+ public int MISMATCHES_CONTEXT_SIZE = 2;
+
+ /**
+ * The context covariate will use a context of this size to calculate it's covariate value for base insertions and deletions
+ */
+ @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
+ public int INDELS_CONTEXT_SIZE = 3;
+
+ /**
+ * The cycle covariate will generate an error if it encounters a cycle greater than this value.
+ * This argument is ignored if the Cycle covariate is not used.
+ */
+ @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
+ public int MAXIMUM_CYCLE_VALUE = 500;
+
+ /**
+ * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
+ */
+ @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
+ public byte MISMATCHES_DEFAULT_QUALITY = -1;
+
+ /**
+ * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on)
+ */
+ @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
+ public byte INSERTIONS_DEFAULT_QUALITY = 45;
+
+ /**
+ * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
+ */
+ @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
+ public byte DELETIONS_DEFAULT_QUALITY = 45;
+
+ /**
+ * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality
+ */
+ @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false)
+ public byte LOW_QUAL_TAIL = 2;
+
+ /**
+ * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options.
+ * This parameter tells BQSR the number of levels of quantization to use to build the quantization table.
+ */
+ @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
+ public int QUANTIZING_LEVELS = 16;
+
+ /**
+ * The tag name for the binary tag covariate (if using it)
+ */
+ @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it")
+ public String BINARY_TAG_NAME = null;
+
+ /*
+ * whether GATK report tables should have rows in sorted order, starting from leftmost column
+ */
+ @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false)
+ public Boolean SORT_BY_ALL_COLUMNS = false;
+
+ /////////////////////////////
+ // Debugging-only Arguments
+ /////////////////////////////
+
+ @Hidden
+ @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
+ public String DEFAULT_PLATFORM = null;
+
+ @Hidden
+ @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
+ public String FORCE_PLATFORM = null;
+
+ @Hidden
+ @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
+ public PrintStream RECAL_TABLE_UPDATE_LOG = null;
+
+ public File existingRecalibrationReport = null;
+
+ public GATKReportTable generateReportTable(final String covariateNames) {
+ GATKReportTable argumentsTable;
+ if(SORT_BY_ALL_COLUMNS) {
+ argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN);
+ } else {
+ argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2);
+ }
+ argumentsTable.addColumn("Argument");
+ argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
+ argumentsTable.addRowID("covariate", true);
+ argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames);
+ argumentsTable.addRowID("no_standard_covs", true);
+ argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES);
+ argumentsTable.addRowID("run_without_dbsnp", true);
+ argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP);
+ argumentsTable.addRowID("solid_recal_mode", true);
+ argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE);
+ argumentsTable.addRowID("solid_nocall_strategy", true);
+ argumentsTable.set("solid_nocall_strategy", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY);
+ argumentsTable.addRowID("mismatches_context_size", true);
+ argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE);
+ argumentsTable.addRowID("indels_context_size", true);
+ argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE);
+ argumentsTable.addRowID("mismatches_default_quality", true);
+ argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY);
+ argumentsTable.addRowID("insertions_default_quality", true);
+ argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY);
+ argumentsTable.addRowID("low_quality_tail", true);
+ argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL);
+ argumentsTable.addRowID("default_platform", true);
+ argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM);
+ argumentsTable.addRowID("force_platform", true);
+ argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM);
+ argumentsTable.addRowID("quantizing_levels", true);
+ argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
+ argumentsTable.addRowID("recalibration_report", true);
+ argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath());
+ argumentsTable.addRowID("plot_pdf_file", true);
+ argumentsTable.set("plot_pdf_file", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RECAL_PDF_FILE == null ? "null" : RECAL_PDF_FILE.getAbsolutePath());
+ argumentsTable.addRowID("binary_tag_name", true);
+ argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME);
+ return argumentsTable;
+ }
+
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
new file mode 100644
index 000000000..ca9fb4bca
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import com.google.java.contract.Requires;
+import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
+import org.broadinstitute.sting.utils.recalibration.*;
+import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.io.PrintStream;
+import java.util.LinkedList;
+import java.util.List;
+
+public class RecalibrationEngine {
+ final protected Covariate[] covariates;
+ final private int numReadGroups;
+ final private PrintStream maybeLogStream;
+ final private boolean lowMemoryMode;
+
+ /**
+ * Has finalizeData() been called?
+ */
+ private boolean finalized = false;
+
+ /**
+ * The final (merged, etc) recalibration tables, suitable for downstream analysis.
+ */
+ private RecalibrationTables finalRecalibrationTables = null;
+
+ private final List recalibrationTablesList = new LinkedList();
+
+ private final ThreadLocal threadLocalTables = new ThreadLocal() {
+ private synchronized RecalibrationTables makeAndCaptureTable() {
+ final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream);
+ recalibrationTablesList.add(newTable);
+ return newTable;
+ }
+
+ @Override
+ protected synchronized RecalibrationTables initialValue() {
+ if ( lowMemoryMode ) {
+ return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0);
+ } else {
+ return makeAndCaptureTable();
+ }
+ }
+ };
+
+ /**
+ * Get a recalibration table suitable for updating the underlying RecalDatums
+ *
+ * May return a thread-local version, or a single version, depending on the initialization
+ * arguments of this instance.
+ *
+ * @return updated tables
+ */
+ protected RecalibrationTables getUpdatableRecalibrationTables() {
+ return threadLocalTables.get();
+ }
+
+ /**
+ * Initialize the recalibration engine
+ *
+ * Called once before any calls to updateDataForRead are made. The engine should prepare itself
+ * to handle any number of updateDataForRead calls containing ReadRecalibrationInfo containing
+ * keys for each of the covariates provided.
+ *
+ * The engine should collect match and mismatch data into the recalibrationTables data.
+ *
+ * @param covariates an array of the covariates we'll be using in this engine, order matters
+ * @param numReadGroups the number of read groups we should use for the recalibration tables
+ * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables
+ */
+ public RecalibrationEngine(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream, final boolean enableLowMemoryMode) {
+ if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null");
+ if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups);
+
+ this.covariates = covariates.clone();
+ this.numReadGroups = numReadGroups;
+ this.maybeLogStream = maybeLogStream;
+ this.lowMemoryMode = enableLowMemoryMode;
+ }
+
+ /**
+ * Update the recalibration statistics using the information in recalInfo
+ * @param recalInfo data structure holding information about the recalibration values for a single read
+ */
+ @Requires("recalInfo != null")
+ public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) {
+ final GATKSAMRecord read = recalInfo.getRead();
+ final ReadCovariates readCovariates = recalInfo.getCovariatesValues();
+ final RecalibrationTables tables = getUpdatableRecalibrationTables();
+ final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable();
+
+ for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
+ if( ! recalInfo.skip(offset) ) {
+
+ for (final EventType eventType : EventType.values()) {
+ final int[] keys = readCovariates.getKeySet(offset, eventType);
+ final int eventIndex = eventType.ordinal();
+ final byte qual = recalInfo.getQual(eventType, offset);
+ final double isError = recalInfo.getErrorFraction(eventType, offset);
+
+ RecalUtils.incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex);
+
+ for (int i = 2; i < covariates.length; i++) {
+ if (keys[i] < 0)
+ continue;
+
+ RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
+ }
+ }
+ }
+ }
+ }
+
+
+ /**
+ * Finalize, if appropriate, all derived data in recalibrationTables.
+ *
+ * Called once after all calls to updateDataForRead have been issued.
+ *
+ * Assumes that all of the principal tables (by quality score) have been completely updated,
+ * and walks over this data to create summary data tables like by read group table.
+ */
+ public void finalizeData() {
+ if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called");
+
+ // merge all of the thread-local tables
+ finalRecalibrationTables = mergeThreadLocalRecalibrationTables();
+
+ final NestedIntegerArray byReadGroupTable = finalRecalibrationTables.getReadGroupTable();
+ final NestedIntegerArray byQualTable = finalRecalibrationTables.getQualityScoreTable();
+
+ // iterate over all values in the qual table
+ for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) {
+ final int rgKey = leaf.keys[0];
+ final int eventIndex = leaf.keys[2];
+ final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex);
+ final RecalDatum qualDatum = leaf.value;
+
+ if ( rgDatum == null ) {
+ // create a copy of qualDatum, and initialize byReadGroup table with it
+ byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex);
+ } else {
+ // combine the qual datum with the existing datum in the byReadGroup table
+ rgDatum.combine(qualDatum);
+ }
+ }
+
+ finalized = true;
+ }
+
+ /**
+ * Merge all of the thread local recalibration tables into a single one.
+ *
+ * Reuses one of the recalibration tables to hold the merged table, so this function can only be
+ * called once in the engine.
+ *
+ * @return the merged recalibration table
+ */
+ @Requires("! finalized")
+ private RecalibrationTables mergeThreadLocalRecalibrationTables() {
+ if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty");
+
+ RecalibrationTables merged = null;
+ for ( final RecalibrationTables table : recalibrationTablesList ) {
+ if ( merged == null )
+ // fast path -- if there's only only one table, so just make it the merged one
+ merged = table;
+ else {
+ merged.combine(table);
+ }
+ }
+
+ return merged;
+ }
+
+ /**
+ * Get the final recalibration tables, after finalizeData() has been called
+ *
+ * This returns the finalized recalibration table collected by this engine.
+ *
+ * It is an error to call this function before finalizeData has been called
+ *
+ * @return the finalized recalibration table collected by this engine
+ */
+ public RecalibrationTables getFinalRecalibrationTables() {
+ if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called");
+ return finalRecalibrationTables;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
new file mode 100755
index 000000000..dc7c27db0
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationPerformance.java
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.filters.*;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.report.GATKReport;
+import org.broadinstitute.sting.gatk.report.GATKReportTable;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.recalibration.*;
+
+import java.io.*;
+
+/**
+ */
+
+@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
+@PartitionBy(PartitionType.READ)
+public class RecalibrationPerformance extends RodWalker implements NanoSchedulable {
+
+ @Output(doc="Write output to this file", required = true)
+ public PrintStream out;
+
+ @Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file")
+ public File RECAL_FILE = null;
+
+ public void initialize() {
+ out.println("Cycle\tQrep\tQemp\tIsJoint\tObservations\tErrors");
+
+ final GATKReport report = new GATKReport(RECAL_FILE);
+ final GATKReportTable table = report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
+ for ( int row = 0; row < table.getNumRows(); row++ ) {
+
+ final int nObservations = (int)asDouble(table.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
+ final int nErrors = (int)Math.round(asDouble(table.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)));
+ final double empiricalQuality = asDouble(table.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME));
+
+ final byte QReported = Byte.parseByte((String) table.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME));
+
+ final double jointEstimateQemp = RecalDatum.bayesianEstimateOfEmpiricalQuality(nObservations, nErrors, QReported);
+
+ //if ( Math.abs((int)(jointEstimateQemp - empiricalQuality)) > 1 )
+ // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f, point Qemp = %f, joint Qemp = %f", estimatedQReported, nObservations, nErrors, empiricalQuality, jointEstimateQemp));
+
+ if ( table.get(row, RecalUtils.COVARIATE_NAME_COLUMN_NAME).equals("Cycle") &&
+ table.get(row, RecalUtils.EVENT_TYPE_COLUMN_NAME).equals("M") &&
+ table.get(row, RecalUtils.READGROUP_COLUMN_NAME).equals("20FUKAAXX100202.6") &&
+ (QReported == 6 || QReported == 10 || QReported == 20 || QReported == 30 || QReported == 45) ) {
+ out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, Math.round(empiricalQuality), "False", (int)nObservations, (int)nErrors));
+ out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, (int)jointEstimateQemp, "True", (int)nObservations, (int)nErrors));
+ }
+ }
+
+ }
+
+ @Override
+ public boolean isDone() {
+ return true;
+ }
+
+ private double asDouble(final Object o) {
+ if ( o instanceof Double )
+ return (Double)o;
+ else if ( o instanceof Integer )
+ return (Integer)o;
+ else if ( o instanceof Long )
+ return (Long)o;
+ else
+ throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass());
+ }
+
+ @Override
+ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; }
+
+ @Override
+ public Integer reduceInit() { return 0; }
+
+ @Override
+ public Integer reduce(Integer counter, Integer sum) { return 0; }
+
+ @Override
+ public void onTraversalDone(Integer sum) {}
+}
\ No newline at end of file