Not sure why IntelliJ didn't add this for commit like the other dirs
This commit is contained in:
parent
3787ee6de7
commit
9e6c2afb28
|
|
@ -0,0 +1,86 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Gatherer;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
* Date: 3/29/11
|
||||
*/
|
||||
|
||||
|
||||
public class BQSRGatherer extends Gatherer {
|
||||
|
||||
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty";
|
||||
private static final String MISSING_OUTPUT_FILE = "missing output file name";
|
||||
|
||||
@Override
|
||||
public void gather(List<File> inputs, File output) {
|
||||
final PrintStream outputFile;
|
||||
try {
|
||||
outputFile = new PrintStream(output);
|
||||
} catch(FileNotFoundException e) {
|
||||
throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
|
||||
}
|
||||
|
||||
RecalibrationReport generalReport = null;
|
||||
for (File input : inputs) {
|
||||
final RecalibrationReport inputReport = new RecalibrationReport(input);
|
||||
if (generalReport == null)
|
||||
generalReport = inputReport;
|
||||
else
|
||||
generalReport.combine(inputReport);
|
||||
}
|
||||
if (generalReport == null)
|
||||
throw new ReviewedStingException(EMPTY_INPUT_LIST);
|
||||
|
||||
generalReport.calculateQuantizedQualities();
|
||||
|
||||
RecalibrationArgumentCollection RAC = generalReport.getRAC();
|
||||
if ( RAC.RECAL_PDF_FILE != null ) {
|
||||
RAC.RECAL_TABLE_FILE = output;
|
||||
if ( RAC.existingRecalibrationReport != null ) {
|
||||
final RecalibrationReport originalReport = new RecalibrationReport(RAC.existingRecalibrationReport);
|
||||
RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates());
|
||||
}
|
||||
else {
|
||||
RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates());
|
||||
}
|
||||
}
|
||||
|
||||
generalReport.output(outputFile);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,528 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.variant.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.recalibration.*;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
|
||||
*
|
||||
* <p>
|
||||
* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
|
||||
* only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
|
||||
* of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
|
||||
* reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical
|
||||
* probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
|
||||
* The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score).
|
||||
* <p>
|
||||
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
|
||||
*
|
||||
* <p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The input read data whose base quality scores need to be assessed.
|
||||
* <p>
|
||||
* A database of known polymorphic sites to skip over.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A GATK Report file with many tables:
|
||||
* <ol>
|
||||
* <li>The list of arguments</li>
|
||||
* <li>The quantized qualities table</li>
|
||||
* <li>The recalibration table by read group</li>
|
||||
* <li>The recalibration table by quality score</li>
|
||||
* <li>The recalibration table for all the optional covariates</li>
|
||||
* </ol>
|
||||
*
|
||||
* The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -T BaseRecalibrator \
|
||||
* -I my_reads.bam \
|
||||
* -R resources/Homo_sapiens_assembly18.fasta \
|
||||
* -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
|
||||
* -knownSites another/optional/setOfSitesToMask.vcf \
|
||||
* -o recal_data.grp
|
||||
* </pre>
|
||||
*/
|
||||
|
||||
@DocumentedGATKFeature(groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class})
|
||||
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
|
||||
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
|
||||
@PartitionBy(PartitionType.READ)
|
||||
public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSchedulable {
|
||||
/**
|
||||
* all the command line arguments for BQSR and its covariates
|
||||
*/
|
||||
@ArgumentCollection
|
||||
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
/**
|
||||
* When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency
|
||||
* purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag
|
||||
* to safely access only one table. There may be some CPU cost, but as long as the table is really big
|
||||
* there should be relatively little CPU cost.
|
||||
*/
|
||||
@Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false)
|
||||
public boolean lowMemoryMode = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
|
||||
public double BAQGOP = BAQ.DEFAULT_GOP;
|
||||
|
||||
/**
|
||||
* an object that keeps track of the information necessary for quality score quantization
|
||||
*/
|
||||
private QuantizationInfo quantizationInfo;
|
||||
|
||||
/**
|
||||
* list to hold the all the covariate objects that were requested (required + standard + experimental)
|
||||
*/
|
||||
private Covariate[] requestedCovariates;
|
||||
|
||||
private RecalibrationEngine recalibrationEngine;
|
||||
|
||||
private int minimumQToUse;
|
||||
|
||||
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.";
|
||||
|
||||
private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
|
||||
private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
|
||||
private final static byte NO_BAQ_UNCERTAINTY = (byte)'@';
|
||||
|
||||
/**
 * Parse the -cov arguments and create a list of covariates to be used here
 * Based on the covariates' estimates for initial capacity allocate the data hashmap
 *
 * Runs once before traversal: validates arguments, builds the covariate array
 * (required covariates first, then optional ones), opens the output table stream,
 * and constructs the recalibration engine.
 */
public void initialize() {
    baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty

    // a forced platform overrides the default used when a read group has none
    if (RAC.FORCE_PLATFORM != null)
        RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;

    if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified
        throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION);

    // --list mode: print the available covariates and exit the JVM immediately
    if (RAC.LIST_ONLY) {
        RecalUtils.listAvailableCovariates(logger);
        System.exit(0);
    }
    RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table

    Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates
    ArrayList<Covariate> requiredCovariates = covariates.getFirst();
    ArrayList<Covariate> optionalCovariates = covariates.getSecond();

    // flatten into one array: required covariates first, then the optional ones
    requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()];
    int covariateIndex = 0;
    for (final Covariate covariate : requiredCovariates)
        requestedCovariates[covariateIndex++] = covariate;
    for (final Covariate covariate : optionalCovariates)
        requestedCovariates[covariateIndex++] = covariate;

    logger.info("The covariates being used here: ");
    for (Covariate cov : requestedCovariates) { // list all the covariates being used
        logger.info("\t" + cov.getClass().getSimpleName());
        cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
    }

    // open the recalibration table output stream up front so a bad path fails fast
    try {
        RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE);
    } catch (IOException e) {
        throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e);
    }

    initializeRecalibrationEngine();
    minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
    referenceReader = getToolkit().getReferenceDataSource().getReference();
}
|
||||
|
||||
/**
|
||||
* Initialize the recalibration engine
|
||||
*/
|
||||
private void initializeRecalibrationEngine() {
|
||||
int numReadGroups = 0;
|
||||
for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() )
|
||||
numReadGroups += header.getReadGroups().size();
|
||||
|
||||
recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode);
|
||||
}
|
||||
|
||||
private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) {
|
||||
return read.getBaseQualities()[offset] < minimumQToUse;
|
||||
}
|
||||
|
||||
/**
 * For each read at this locus get the various covariate values and increment that location in the map based on
 * whether or not the base matches the reference at this particular location
 *
 * @param ref the reference context overlapping the (original) read
 * @param originalRead the read as it came from the BAM, before clipping
 * @param metaDataTracker tracker providing the known-sites features overlapping this read
 * @return 1L if the read contributed to the recalibration tables, 0L if it was skipped
 */
public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) {

    // remove adaptor sequence and soft clips before computing any per-base arrays
    final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) );
    if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it

    RecalUtils.parsePlatformForRead(read, RAC);
    if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls
        return 0L; // skip this read completely
    }

    // per-base 0/1 event flags for each event type
    final int[] isSNP = calculateIsSNP(read, ref, originalRead);
    final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
    final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
    final int nErrors = nEvents(isSNP, isInsertion, isDeletion);

    // note for efficiency regions we don't compute the BAQ array unless we actually have
    // some error to marginalize over. For ILMN data ~85% of reads have no error
    final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read);

    if( baqArray != null ) { // some reads just can't be BAQ'ed
        final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates);
        final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases
        final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray);
        final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray);
        final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray);

        // aggregate all of the info into our info object, and update the data
        final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors);
        recalibrationEngine.updateDataForRead(info);
        return 1L;
    } else {
        return 0L;
    }
}
|
||||
|
||||
/**
|
||||
* Compute the number of mutational events across all hasEvent vectors
|
||||
*
|
||||
* Simply the sum of entries in hasEvents
|
||||
*
|
||||
* @param hasEvents a vector a vectors of 0 (no event) and 1 (has event)
|
||||
* @return the total number of events across all hasEvent arrays
|
||||
*/
|
||||
private int nEvents(final int[]... hasEvents) {
|
||||
int n = 0;
|
||||
for ( final int[] hasEvent : hasEvents ) {
|
||||
n += MathUtils.sum(hasEvent);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
|
||||
final byte[] bases = read.getReadBases();
|
||||
final boolean[] skip = new boolean[bases.length];
|
||||
final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites));
|
||||
for( int iii = 0; iii < bases.length; iii++ ) {
|
||||
skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii);
|
||||
}
|
||||
return skip;
|
||||
}
|
||||
|
||||
protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) {
|
||||
return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset);
|
||||
}
|
||||
|
||||
protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List<Feature> features ) {
|
||||
final int readLength = read.getReadBases().length;
|
||||
final boolean[] knownSites = new boolean[readLength];
|
||||
Arrays.fill(knownSites, false);
|
||||
for( final Feature f : features ) {
|
||||
int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
|
||||
if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
featureStartOnRead = 0;
|
||||
}
|
||||
|
||||
int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true);
|
||||
if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
|
||||
featureEndOnRead = readLength;
|
||||
}
|
||||
|
||||
if( featureStartOnRead > readLength ) {
|
||||
featureStartOnRead = featureEndOnRead = readLength;
|
||||
}
|
||||
|
||||
Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true);
|
||||
}
|
||||
return knownSites;
|
||||
}
|
||||
|
||||
// BUGBUG: can be merged with calculateIsIndel
/**
 * Compute, for each base of the clipped read, whether it mismatches the reference.
 *
 * Walks the cigar, advancing read/reference positions in lockstep for aligned
 * elements and independently for indels/skips.
 *
 * @param read the clipped read
 * @param ref the reference context of the ORIGINAL read (hence the offset arithmetic below)
 * @param originalRead the unclipped read, used to shift ref coordinates to the clipped read
 * @return one int per read base: 1 for a reference mismatch, 0 for a match
 */
protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) {
    final byte[] readBases = read.getReadBases();
    // slice the reference window of the original read down to the clipped read's span
    final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd());
    final int[] snp = new int[readBases.length];
    int readPos = 0;
    int refPos = 0;
    for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
        final int elementLength = ce.getLength();
        switch (ce.getOperator()) {
            case M:
            case EQ:
            case X:
                // aligned bases: compare read vs reference one base at a time
                for( int iii = 0; iii < elementLength; iii++ ) {
                    snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 );
                    readPos++;
                    refPos++;
                }
                break;
            case D:
            case N:
                // deletion/skip: reference advances, read does not
                refPos += elementLength;
                break;
            case I:
            case S: // ReferenceContext doesn't have the soft clipped bases!
                readPos += elementLength;
                break;
            case H:
            case P:
                // hard clips and padding consume neither read nor reference
                break;
            default:
                throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
        }
    }
    return snp;
}
|
||||
|
||||
protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) {
|
||||
final byte[] readBases = read.getReadBases();
|
||||
final int[] indel = new int[readBases.length];
|
||||
Arrays.fill(indel, 0);
|
||||
int readPos = 0;
|
||||
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
|
||||
final int elementLength = ce.getLength();
|
||||
switch (ce.getOperator()) {
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
case S:
|
||||
{
|
||||
readPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case D:
|
||||
{
|
||||
final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) );
|
||||
indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 );
|
||||
break;
|
||||
}
|
||||
case I:
|
||||
{
|
||||
final boolean forwardStrandRead = !read.getReadNegativeStrandFlag();
|
||||
if( forwardStrandRead ) {
|
||||
indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
|
||||
}
|
||||
for (int iii = 0; iii < elementLength; iii++) {
|
||||
readPos++;
|
||||
}
|
||||
if( !forwardStrandRead ) {
|
||||
indel[(readPos < indel.length ? readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
|
||||
}
|
||||
break;
|
||||
}
|
||||
case N:
|
||||
case H:
|
||||
case P:
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
|
||||
}
|
||||
}
|
||||
return indel;
|
||||
}
|
||||
|
||||
protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) {
|
||||
if(errorArray.length != baqArray.length ) {
|
||||
throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
|
||||
}
|
||||
|
||||
final int BLOCK_START_UNSET = -1;
|
||||
|
||||
final double[] fractionalErrors = new double[baqArray.length];
|
||||
Arrays.fill(fractionalErrors, 0.0);
|
||||
boolean inBlock = false;
|
||||
int blockStartIndex = BLOCK_START_UNSET;
|
||||
int iii;
|
||||
for( iii = 0; iii < fractionalErrors.length; iii++ ) {
|
||||
if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) {
|
||||
if( !inBlock ) {
|
||||
fractionalErrors[iii] = (double) errorArray[iii];
|
||||
} else {
|
||||
calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors);
|
||||
inBlock = false; // reset state variables
|
||||
blockStartIndex = BLOCK_START_UNSET; // reset state variables
|
||||
}
|
||||
} else {
|
||||
inBlock = true;
|
||||
if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; }
|
||||
}
|
||||
}
|
||||
if( inBlock ) {
|
||||
calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors);
|
||||
}
|
||||
if( fractionalErrors.length != errorArray.length ) {
|
||||
throw new ReviewedStingException("Output array length mismatch detected. Malformed read?");
|
||||
}
|
||||
return fractionalErrors;
|
||||
}
|
||||
|
||||
private static void calculateAndStoreErrorsInBlock( final int iii,
|
||||
final int blockStartIndex,
|
||||
final int[] errorArray,
|
||||
final double[] fractionalErrors ) {
|
||||
int totalErrors = 0;
|
||||
for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) {
|
||||
totalErrors += errorArray[jjj];
|
||||
}
|
||||
for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) {
|
||||
fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a BAQ style array that indicates no alignment uncertainty
|
||||
* @param read the read for which we want a BAQ array
|
||||
* @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values
|
||||
* // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above
|
||||
*/
|
||||
private byte[] flatBAQArray(final GATKSAMRecord read) {
|
||||
final byte[] baq = new byte[read.getReadLength()];
|
||||
Arrays.fill(baq, NO_BAQ_UNCERTAINTY);
|
||||
return baq;
|
||||
}
|
||||
|
||||
/**
 * Compute an actual BAQ array for read, based on its quals and the reference sequence
 *
 * Note: despite the original doc claiming a non-null return, the caller (map) null-checks
 * this result — BAQ.getBAQTag can apparently return null when the read can't be BAQ'ed.
 *
 * @param read the read to BAQ
 * @return the BAQ tag array for read, or null when no BAQ tag could be produced
 */
private byte[] calculateBAQArray( final GATKSAMRecord read ) {
    baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG);
    return BAQ.getBAQTag(read);
}
|
||||
|
||||
/**
|
||||
* Initialize the reduce step by returning 0L
|
||||
*
|
||||
* @return returns 0L
|
||||
*/
|
||||
public Long reduceInit() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
/**
|
||||
* The Reduce method doesn't do anything for this walker.
|
||||
*
|
||||
* @param mapped Result of the map. This value is immediately ignored.
|
||||
* @param sum The summing CountedData used to output the CSV data
|
||||
* @return returns The sum used to output the CSV data
|
||||
*/
|
||||
public Long reduce(Long mapped, Long sum) {
|
||||
sum += mapped;
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
 * After traversal: finalize the recalibration tables, compute the quality quantization
 * map, write the GATK report, and optionally generate the recalibration plots.
 *
 * @param result the total number of reads processed (from reduce)
 */
@Override
public void onTraversalDone(Long result) {
    recalibrationEngine.finalizeData();

    logger.info("Calculating quantized quality scores...");
    quantizeQualityScores();

    logger.info("Writing recalibration report...");
    generateReport();
    logger.info("...done!");

    // plots are only produced when the user asked for a PDF output file
    if ( RAC.RECAL_PDF_FILE != null ) {
        logger.info("Generating recalibration plots...");
        generatePlots();
    }

    logger.info("Processed: " + result + " reads");
}
|
||||
|
||||
private RecalibrationTables getRecalibrationTable() {
|
||||
return recalibrationEngine.getFinalRecalibrationTables();
|
||||
}
|
||||
|
||||
private void generatePlots() {
|
||||
File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE;
|
||||
if (recalFile != null) {
|
||||
RecalibrationReport report = new RecalibrationReport(recalFile);
|
||||
RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), getRecalibrationTable(), requestedCovariates);
|
||||
}
|
||||
else
|
||||
RecalUtils.generateRecalibrationPlot(RAC, getRecalibrationTable(), requestedCovariates);
|
||||
}
|
||||
|
||||
/**
 * go through the quality score table and use the # observations and the empirical quality score
 * to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to
 * generate a quantization map (recalibrated_qual -> quantized_qual)
 *
 * The result is stored in the quantizationInfo field for use by generateReport().
 */
private void quantizeQualityScores() {
    quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS);
}
|
||||
|
||||
/**
 * Write the arguments, quantization map, and recalibration tables as a GATK report
 * to the configured output stream.
 */
private void generateReport() {
    RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
}
|
||||
}
|
||||
|
|
@ -0,0 +1,163 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 12/18/12
|
||||
* Time: 3:50 PM
|
||||
*
|
||||
* TODO -- merge in ReadCovariates?
|
||||
*/
|
||||
public final class ReadRecalibrationInfo {
|
||||
private final GATKSAMRecord read;
|
||||
private final int length;
|
||||
private final ReadCovariates covariates;
|
||||
private final boolean[] skips;
|
||||
private final byte[] baseQuals, insertionQuals, deletionQuals;
|
||||
private final double[] snpErrors, insertionErrors, deletionErrors;
|
||||
|
||||
/**
 * Capture everything needed to update the recalibration tables for a single read:
 * the read itself, its covariate values, the per-base skip flags, and the per-base
 * fractional error weights for each event type.
 *
 * All per-base arrays must have exactly the read's base-quality length.
 *
 * Fix vs original: the length-mismatch messages for skips, insertionErrors, and
 * deletionErrors all reported snpErrors.length instead of the offending array's length.
 *
 * @param read the read; not null
 * @param covariates the precomputed covariate values for this read; not null
 * @param skips per-base skip flags; not null, same length as the read
 * @param snpErrors per-base fractional SNP error weights; not null, same length as the read
 * @param insertionErrors per-base fractional insertion error weights; not null, same length as the read
 * @param deletionErrors per-base fractional deletion error weights; not null, same length as the read
 * @throws IllegalArgumentException if any argument is null or any array length mismatches
 */
public ReadRecalibrationInfo(final GATKSAMRecord read,
                             final ReadCovariates covariates,
                             final boolean[] skips,
                             final double[] snpErrors,
                             final double[] insertionErrors,
                             final double[] deletionErrors) {
    if ( read == null ) throw new IllegalArgumentException("read cannot be null");
    if ( covariates == null ) throw new IllegalArgumentException("covariates cannot be null");
    if ( skips == null ) throw new IllegalArgumentException("skips cannot be null");
    if ( snpErrors == null ) throw new IllegalArgumentException("snpErrors cannot be null");
    if ( insertionErrors == null ) throw new IllegalArgumentException("insertionErrors cannot be null");
    if ( deletionErrors == null ) throw new IllegalArgumentException("deletionErrors cannot be null");

    this.read = read;
    this.baseQuals = read.getBaseQualities();
    this.length = baseQuals.length;
    this.covariates = covariates;
    this.skips = skips;
    // insertion/deletion quals may legitimately be absent; getQual falls back to a default
    this.insertionQuals = read.getExistingBaseInsertionQualities();
    this.deletionQuals = read.getExistingBaseDeletionQualities();
    this.snpErrors = snpErrors;
    this.insertionErrors = insertionErrors;
    this.deletionErrors = deletionErrors;

    // fix: each message now reports the length of the array that actually mismatched
    if ( skips.length != length ) throw new IllegalArgumentException("skips.length " + skips.length + " != length " + length);
    if ( snpErrors.length != length ) throw new IllegalArgumentException("snpErrors.length " + snpErrors.length + " != length " + length);
    if ( insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + insertionErrors.length + " != length " + length);
    if ( deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + deletionErrors.length + " != length " + length);
}
|
||||
|
||||
/**
|
||||
* Get the qual score for event type at offset
|
||||
*
|
||||
* @param eventType the type of event we want the qual for
|
||||
* @param offset the offset into this read for the qual
|
||||
* @return a valid quality score for event at offset
|
||||
*/
|
||||
@Requires("validOffset(offset)")
|
||||
@Ensures("validQual(result)")
|
||||
public byte getQual(final EventType eventType, final int offset) {
|
||||
switch ( eventType ) {
|
||||
case BASE_SUBSTITUTION: return baseQuals[offset];
|
||||
// note optimization here -- if we don't have ins/del quals we just return the default byte directly
|
||||
case BASE_INSERTION: return insertionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : insertionQuals[offset];
|
||||
case BASE_DELETION: return deletionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : deletionQuals[offset];
|
||||
default: throw new IllegalStateException("Unknown event type " + eventType);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the error fraction for event type at offset
|
||||
*
|
||||
* The error fraction is a value between 0 and 1 that indicates how much certainty we have
|
||||
* in the error occurring at offset. A value of 1 means that the error definitely occurs at this
|
||||
* site, a value of 0.0 means it definitely doesn't happen here. 0.5 means that half the weight
|
||||
* of the error belongs here
|
||||
*
|
||||
* @param eventType the type of event we want the qual for
|
||||
* @param offset the offset into this read for the qual
|
||||
* @return a fractional weight for an error at this offset
|
||||
*/
|
||||
@Requires("validOffset(offset)")
|
||||
@Ensures("result >= 0.0 && result <= 1.0")
|
||||
public double getErrorFraction(final EventType eventType, final int offset) {
|
||||
switch ( eventType ) {
|
||||
case BASE_SUBSTITUTION: return snpErrors[offset];
|
||||
case BASE_INSERTION: return insertionErrors[offset];
|
||||
case BASE_DELETION: return deletionErrors[offset];
|
||||
default: throw new IllegalStateException("Unknown event type " + eventType);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Get the read involved in this recalibration info.
 *
 * @return a non-null GATKSAMRecord (the read supplied at construction)
 */
@Ensures("result != null")
public GATKSAMRecord getRead() {
    return read;
}
|
||||
|
||||
/**
 * Should offset in this read be skipped (because it's covered by a known variation site)?
 *
 * Simply reads the precomputed skips array supplied at construction.
 *
 * @param offset a valid offset into this info
 * @return true if offset should be skipped, false otherwise
 */
@Requires("validOffset(offset)")
public boolean skip(final int offset) {
    return skips[offset];
}
|
||||
|
||||
/**
 * Get the ReadCovariates object carrying the mapping from offsets -> covariate key sets.
 *
 * @return a non-null ReadCovariates object (the one supplied at construction)
 */
@Ensures("result != null")
public ReadCovariates getCovariatesValues() {
    return covariates;
}
|
||||
|
||||
/**
 * Ensures an offset is valid. Used in contracts.
 *
 * An offset is valid iff it indexes into the read's base quality array,
 * i.e. 0 <= offset < baseQuals.length.
 *
 * @param offset a proposed offset
 * @return true if offset is valid w.r.t. the data in this object, false otherwise
 */
private boolean validOffset(final int offset) {
    return offset >= 0 && offset < baseQuals.length;
}
|
||||
|
||||
/**
 * Ensures a quality score is valid. Used in contracts.
 *
 * Valid quals lie in [0, QualityUtils.MAX_QUAL_SCORE]; since result is a byte,
 * negative values (invalid) are representable and rejected here.
 *
 * @param result a proposed quality score
 * @return true if result is within the legal quality score range
 */
private boolean validQual(final byte result) {
    return result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE;
}
|
||||
}
|
||||
|
|
@ -0,0 +1,251 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 27, 2009
|
||||
*
|
||||
* A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
|
||||
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
|
||||
*/
|
||||
|
||||
public class RecalibrationArgumentCollection {
|
||||
|
||||
/**
|
||||
* This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
|
||||
* so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
|
||||
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
|
||||
* Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
|
||||
*/
|
||||
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
|
||||
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
|
||||
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
|
||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out.
|
||||
*/
|
||||
@Gather(BQSRGatherer.class)
|
||||
@Output(doc = "The output recalibration table file to create", required = true)
|
||||
public File RECAL_TABLE_FILE = null;
|
||||
public PrintStream RECAL_TABLE;
|
||||
|
||||
/**
|
||||
* If not provided, then no plots will be generated (useful for queue scatter/gathering).
|
||||
* However, we *highly* recommend that users generate these plots whenever possible for QC checking.
|
||||
*/
|
||||
@Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false)
|
||||
public File RECAL_PDF_FILE = null;
|
||||
|
||||
/**
|
||||
* If not provided, then a temporary file is created and then deleted upon completion.
|
||||
* For advanced users only.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false)
|
||||
public File RECAL_CSV_FILE = null;
|
||||
|
||||
/**
|
||||
* Note that the --list argument requires a fully resolved and correct command-line to work.
|
||||
*/
|
||||
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
|
||||
public boolean LIST_ONLY = false;
|
||||
|
||||
/**
|
||||
* Note that the ReadGroup and QualityScore covariates are required and do not need to be specified.
|
||||
* Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default.
|
||||
* Use the --list argument to see the available covariates.
|
||||
*/
|
||||
@Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false)
|
||||
public String[] COVARIATES = null;
|
||||
|
||||
/*
|
||||
* The Cycle and Context covariates are standard and are included by default unless this argument is provided.
|
||||
* Note that the ReadGroup and QualityScore covariates are required and cannot be excluded.
|
||||
*/
|
||||
@Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false)
|
||||
public boolean DO_NOT_USE_STANDARD_COVARIATES = false;
|
||||
|
||||
/**
|
||||
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
public boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
/**
|
||||
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
||||
* reads which have had the reference inserted because of color space inconsistencies.
|
||||
*/
|
||||
@Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
|
||||
public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO;
|
||||
|
||||
/**
|
||||
* CountCovariates and TableRecalibration accept a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
|
||||
* no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
|
||||
* their color space tag can not be recalibrated.
|
||||
*/
|
||||
@Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
|
||||
public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
|
||||
|
||||
/**
|
||||
* The context covariate will use a context of this size to calculate it's covariate value for base mismatches
|
||||
*/
|
||||
@Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
|
||||
public int MISMATCHES_CONTEXT_SIZE = 2;
|
||||
|
||||
/**
|
||||
* The context covariate will use a context of this size to calculate it's covariate value for base insertions and deletions
|
||||
*/
|
||||
@Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
|
||||
public int INDELS_CONTEXT_SIZE = 3;
|
||||
|
||||
/**
|
||||
* The cycle covariate will generate an error if it encounters a cycle greater than this value.
|
||||
* This argument is ignored if the Cycle covariate is not used.
|
||||
*/
|
||||
@Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
|
||||
public int MAXIMUM_CYCLE_VALUE = 500;
|
||||
|
||||
/**
|
||||
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
|
||||
*/
|
||||
@Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
|
||||
public byte MISMATCHES_DEFAULT_QUALITY = -1;
|
||||
|
||||
/**
|
||||
* A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on)
|
||||
*/
|
||||
@Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
|
||||
public byte INSERTIONS_DEFAULT_QUALITY = 45;
|
||||
|
||||
/**
|
||||
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
|
||||
*/
|
||||
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
|
||||
public byte DELETIONS_DEFAULT_QUALITY = 45;
|
||||
|
||||
/**
|
||||
* Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality
|
||||
*/
|
||||
@Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false)
|
||||
public byte LOW_QUAL_TAIL = 2;
|
||||
|
||||
/**
|
||||
* BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options.
|
||||
* This parameter tells BQSR the number of levels of quantization to use to build the quantization table.
|
||||
*/
|
||||
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
|
||||
public int QUANTIZING_LEVELS = 16;
|
||||
|
||||
/**
|
||||
* The tag name for the binary tag covariate (if using it)
|
||||
*/
|
||||
@Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it")
|
||||
public String BINARY_TAG_NAME = null;
|
||||
|
||||
/*
|
||||
* whether GATK report tables should have rows in sorted order, starting from leftmost column
|
||||
*/
|
||||
@Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false)
|
||||
public Boolean SORT_BY_ALL_COLUMNS = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String DEFAULT_PLATFORM = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
|
||||
@Hidden
|
||||
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
|
||||
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
|
||||
|
||||
public File existingRecalibrationReport = null;
|
||||
|
||||
public GATKReportTable generateReportTable(final String covariateNames) {
|
||||
GATKReportTable argumentsTable;
|
||||
if(SORT_BY_ALL_COLUMNS) {
|
||||
argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN);
|
||||
} else {
|
||||
argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2);
|
||||
}
|
||||
argumentsTable.addColumn("Argument");
|
||||
argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
|
||||
argumentsTable.addRowID("covariate", true);
|
||||
argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames);
|
||||
argumentsTable.addRowID("no_standard_covs", true);
|
||||
argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES);
|
||||
argumentsTable.addRowID("run_without_dbsnp", true);
|
||||
argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP);
|
||||
argumentsTable.addRowID("solid_recal_mode", true);
|
||||
argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE);
|
||||
argumentsTable.addRowID("solid_nocall_strategy", true);
|
||||
argumentsTable.set("solid_nocall_strategy", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY);
|
||||
argumentsTable.addRowID("mismatches_context_size", true);
|
||||
argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE);
|
||||
argumentsTable.addRowID("indels_context_size", true);
|
||||
argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE);
|
||||
argumentsTable.addRowID("mismatches_default_quality", true);
|
||||
argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY);
|
||||
argumentsTable.addRowID("insertions_default_quality", true);
|
||||
argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY);
|
||||
argumentsTable.addRowID("low_quality_tail", true);
|
||||
argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL);
|
||||
argumentsTable.addRowID("default_platform", true);
|
||||
argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM);
|
||||
argumentsTable.addRowID("force_platform", true);
|
||||
argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM);
|
||||
argumentsTable.addRowID("quantizing_levels", true);
|
||||
argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS);
|
||||
argumentsTable.addRowID("recalibration_report", true);
|
||||
argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath());
|
||||
argumentsTable.addRowID("plot_pdf_file", true);
|
||||
argumentsTable.set("plot_pdf_file", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RECAL_PDF_FILE == null ? "null" : RECAL_PDF_FILE.getAbsolutePath());
|
||||
argumentsTable.addRowID("binary_tag_name", true);
|
||||
argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME);
|
||||
return argumentsTable;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,216 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
|
||||
import org.broadinstitute.sting.utils.recalibration.*;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
public class RecalibrationEngine {
    // Covariates used to key the recalibration tables (defensive clone made in the constructor); order matters.
    final protected Covariate[] covariates;
    // Number of read groups the recalibration tables must accommodate.
    final private int numReadGroups;
    // Optional stream for logging updates to the tables; passed through to RecalibrationTables (may be null).
    final private PrintStream maybeLogStream;
    // When true, all threads share a single recalibration table; when false, each thread gets its own.
    final private boolean lowMemoryMode;

    /**
     * Has finalizeData() been called?
     */
    private boolean finalized = false;

    /**
     * The final (merged, etc) recalibration tables, suitable for downstream analysis.
     */
    private RecalibrationTables finalRecalibrationTables = null;

    // Every table ever handed out via threadLocalTables, captured so they can be merged in finalizeData().
    private final List<RecalibrationTables> recalibrationTablesList = new LinkedList<RecalibrationTables>();

    private final ThreadLocal<RecalibrationTables> threadLocalTables = new ThreadLocal<RecalibrationTables>() {
        // synchronized: several threads may fault in their initial value concurrently, and
        // recalibrationTablesList is a plain LinkedList, not a thread-safe collection
        private synchronized RecalibrationTables makeAndCaptureTable() {
            final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream);
            recalibrationTablesList.add(newTable);
            return newTable;
        }

        @Override
        protected synchronized RecalibrationTables initialValue() {
            if ( lowMemoryMode ) {
                // low-memory mode: every thread shares the one table at index 0 (created on first use)
                return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0);
            } else {
                // normal mode: a fresh table per thread, merged later in finalizeData()
                return makeAndCaptureTable();
            }
        }
    };

    /**
     * Get a recalibration table suitable for updating the underlying RecalDatums
     *
     * May return a thread-local version, or a single version, depending on the initialization
     * arguments of this instance.
     *
     * @return updated tables
     */
    protected RecalibrationTables getUpdatableRecalibrationTables() {
        return threadLocalTables.get();
    }

    /**
     * Initialize the recalibration engine
     *
     * Called once before any calls to updateDataForRead are made. The engine should prepare itself
     * to handle any number of updateDataForRead calls containing ReadRecalibrationInfo containing
     * keys for each of the covariates provided.
     *
     * The engine should collect match and mismatch data into the recalibrationTables data.
     *
     * @param covariates an array of the covariates we'll be using in this engine, order matters
     * @param numReadGroups the number of read groups we should use for the recalibration tables
     * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables
     * @param enableLowMemoryMode if true, share one recalibration table across all threads rather than one per thread
     */
    public RecalibrationEngine(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream, final boolean enableLowMemoryMode) {
        if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null");
        if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups);

        // clone so later external mutation of the caller's array cannot affect this engine
        this.covariates = covariates.clone();
        this.numReadGroups = numReadGroups;
        this.maybeLogStream = maybeLogStream;
        this.lowMemoryMode = enableLowMemoryMode;
    }

    /**
     * Update the recalibration statistics using the information in recalInfo
     * @param recalInfo data structure holding information about the recalibration values for a single read
     */
    @Requires("recalInfo != null")
    public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) {
        final GATKSAMRecord read = recalInfo.getRead();
        final ReadCovariates readCovariates = recalInfo.getCovariatesValues();
        final RecalibrationTables tables = getUpdatableRecalibrationTables();
        final NestedIntegerArray<RecalDatum> qualityScoreTable = tables.getQualityScoreTable();

        for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
            // offsets flagged as known-variation sites are excluded from the statistics
            if( ! recalInfo.skip(offset) ) {

                for (final EventType eventType : EventType.values()) {
                    final int[] keys = readCovariates.getKeySet(offset, eventType);
                    final int eventIndex = eventType.ordinal();
                    final byte qual = recalInfo.getQual(eventType, offset);
                    final double isError = recalInfo.getErrorFraction(eventType, offset);

                    // keys[0] and keys[1] key the quality score table; presumably read group and
                    // reported quality keys -- TODO confirm against ReadCovariates.getKeySet
                    RecalUtils.incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex);

                    // remaining covariates (index >= 2) each have their own table; negative keys mean
                    // "no value for this covariate at this offset" and are skipped
                    for (int i = 2; i < covariates.length; i++) {
                        if (keys[i] < 0)
                            continue;

                        RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
                    }
                }
            }
        }
    }


    /**
     * Finalize, if appropriate, all derived data in recalibrationTables.
     *
     * Called once after all calls to updateDataForRead have been issued.
     *
     * Assumes that all of the principal tables (by quality score) have been completely updated,
     * and walks over this data to create summary data tables like by read group table.
     */
    public void finalizeData() {
        if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called");

        // merge all of the thread-local tables
        finalRecalibrationTables = mergeThreadLocalRecalibrationTables();

        final NestedIntegerArray<RecalDatum> byReadGroupTable = finalRecalibrationTables.getReadGroupTable();
        final NestedIntegerArray<RecalDatum> byQualTable = finalRecalibrationTables.getQualityScoreTable();

        // iterate over all values in the qual table, aggregating them per read group / event
        for ( NestedIntegerArray.Leaf<RecalDatum> leaf : byQualTable.getAllLeaves() ) {
            final int rgKey = leaf.keys[0];
            final int eventIndex = leaf.keys[2];
            final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex);
            final RecalDatum qualDatum = leaf.value;

            if ( rgDatum == null ) {
                // create a copy of qualDatum, and initialize byReadGroup table with it
                byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex);
            } else {
                // combine the qual datum with the existing datum in the byReadGroup table
                rgDatum.combine(qualDatum);
            }
        }

        finalized = true;
    }

    /**
     * Merge all of the thread local recalibration tables into a single one.
     *
     * Reuses one of the recalibration tables to hold the merged table, so this function can only be
     * called once in the engine.
     *
     * @return the merged recalibration table
     */
    @Requires("! finalized")
    private RecalibrationTables mergeThreadLocalRecalibrationTables() {
        if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty");

        RecalibrationTables merged = null;
        for ( final RecalibrationTables table : recalibrationTablesList ) {
            if ( merged == null )
                // fast path -- the first table becomes the merge target in place, so a
                // single-table list (the low-memory case) needs no combining at all
                merged = table;
            else {
                merged.combine(table);
            }
        }

        return merged;
    }

    /**
     * Get the final recalibration tables, after finalizeData() has been called
     *
     * This returns the finalized recalibration table collected by this engine.
     *
     * It is an error to call this function before finalizeData has been called
     *
     * @return the finalized recalibration table collected by this engine
     */
    public RecalibrationTables getFinalRecalibrationTables() {
        if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called");
        return finalRecalibrationTables;
    }
}
|
||||
|
|
@ -0,0 +1,110 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.*;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
*/
|
||||
|
||||
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
|
||||
@PartitionBy(PartitionType.READ)
|
||||
public class RecalibrationPerformance extends RodWalker<Integer, Integer> implements NanoSchedulable {
|
||||
|
||||
@Output(doc="Write output to this file", required = true)
|
||||
public PrintStream out;
|
||||
|
||||
@Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file")
|
||||
public File RECAL_FILE = null;
|
||||
|
||||
public void initialize() {
|
||||
out.println("Cycle\tQrep\tQemp\tIsJoint\tObservations\tErrors");
|
||||
|
||||
final GATKReport report = new GATKReport(RECAL_FILE);
|
||||
final GATKReportTable table = report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
|
||||
for ( int row = 0; row < table.getNumRows(); row++ ) {
|
||||
|
||||
final int nObservations = (int)asDouble(table.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
|
||||
final int nErrors = (int)Math.round(asDouble(table.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)));
|
||||
final double empiricalQuality = asDouble(table.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME));
|
||||
|
||||
final byte QReported = Byte.parseByte((String) table.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME));
|
||||
|
||||
final double jointEstimateQemp = RecalDatum.bayesianEstimateOfEmpiricalQuality(nObservations, nErrors, QReported);
|
||||
|
||||
//if ( Math.abs((int)(jointEstimateQemp - empiricalQuality)) > 1 )
|
||||
// System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f, point Qemp = %f, joint Qemp = %f", estimatedQReported, nObservations, nErrors, empiricalQuality, jointEstimateQemp));
|
||||
|
||||
if ( table.get(row, RecalUtils.COVARIATE_NAME_COLUMN_NAME).equals("Cycle") &&
|
||||
table.get(row, RecalUtils.EVENT_TYPE_COLUMN_NAME).equals("M") &&
|
||||
table.get(row, RecalUtils.READGROUP_COLUMN_NAME).equals("20FUKAAXX100202.6") &&
|
||||
(QReported == 6 || QReported == 10 || QReported == 20 || QReported == 30 || QReported == 45) ) {
|
||||
out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, Math.round(empiricalQuality), "False", (int)nObservations, (int)nErrors));
|
||||
out.println(String.format("%s\t%d\t%d\t%s\t%d\t%d", table.get(row, RecalUtils.COVARIATE_VALUE_COLUMN_NAME), QReported, (int)jointEstimateQemp, "True", (int)nObservations, (int)nErrors));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isDone() {
|
||||
return true;
|
||||
}
|
||||
|
||||
private double asDouble(final Object o) {
|
||||
if ( o instanceof Double )
|
||||
return (Double)o;
|
||||
else if ( o instanceof Integer )
|
||||
return (Integer)o;
|
||||
else if ( o instanceof Long )
|
||||
return (Long)o;
|
||||
else
|
||||
throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; }
|
||||
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
@Override
|
||||
public Integer reduce(Integer counter, Integer sum) { return 0; }
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer sum) {}
|
||||
}
|
||||
Loading…
Reference in New Issue