DelocalizedBaseRecalibrator becomes the BaseRecalibrator.
This commit is contained in:
parent
ac99437eec
commit
4e661847b2
|
|
@ -39,57 +39,12 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
|
|||
|
||||
// optimization: only allocate temp arrays once per thread
|
||||
private final ThreadLocal<byte[]> threadLocalTempQualArray = new ThreadLocalArray<byte[]>(EventType.values().length, byte.class);
|
||||
private final ThreadLocal<boolean[]> threadLocalTempErrorArray = new ThreadLocalArray<boolean[]>(EventType.values().length, boolean.class);
|
||||
private final ThreadLocal<double[]> threadLocalTempFractionalErrorArray = new ThreadLocalArray<double[]>(EventType.values().length, double.class);
|
||||
|
||||
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) {
|
||||
super.initialize(covariates, recalibrationTables);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches for all three
|
||||
* categories (mismatches, insertions and deletions).
|
||||
*
|
||||
* @param pileupElement The pileup element to update
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
@Override
|
||||
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) {
|
||||
final int offset = pileupElement.getOffset();
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead());
|
||||
|
||||
byte[] tempQualArray = threadLocalTempQualArray.get();
|
||||
boolean[] tempErrorArray = threadLocalTempErrorArray.get();
|
||||
|
||||
tempQualArray[EventType.BASE_SUBSTITUTION.index] = pileupElement.getQual();
|
||||
tempErrorArray[EventType.BASE_SUBSTITUTION.index] = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase);
|
||||
tempQualArray[EventType.BASE_INSERTION.index] = pileupElement.getBaseInsertionQual();
|
||||
tempErrorArray[EventType.BASE_INSERTION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterInsertion() : pileupElement.isBeforeInsertion();
|
||||
tempQualArray[EventType.BASE_DELETION.index] = pileupElement.getBaseDeletionQual();
|
||||
tempErrorArray[EventType.BASE_DELETION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterDeletedBase() : pileupElement.isBeforeDeletedBase();
|
||||
|
||||
for (final EventType eventType : EventType.values()) {
|
||||
final int[] keys = readCovariates.getKeySet(offset, eventType);
|
||||
final int eventIndex = eventType.index;
|
||||
final byte qual = tempQualArray[eventIndex];
|
||||
final boolean isError = tempErrorArray[eventIndex];
|
||||
|
||||
// TODO: should this really be combine rather than increment?
|
||||
combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex);
|
||||
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex);
|
||||
|
||||
for (int i = 2; i < covariates.length; i++) {
|
||||
if (keys[i] < 0)
|
||||
continue;
|
||||
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
|
||||
for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
|
||||
|
|
|
|||
|
|
@ -51,21 +51,21 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
|
||||
String HiSeqInterval = "chr1:10,000,000-10,100,000";
|
||||
return new Object[][]{
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "55a46d8f5d2f9acfa2d7659e18b6df43")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "8e930f56a8905a5999af7d6ba8a92f91")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "8e87bee4bd6531b405082c4da785f1f5")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "b309a5f57b861d7f31cb76cdac4ff8a7")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "4c75d47ed2cf93b499be8fbb29b24dfd")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "43b06e5568a89e4ce1dd9146ce580c89")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "25f4f48dba27475b0cd7c06ef0239aba")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "dfcba9acc32b4a1dfeceea135b48615a")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "e8077b721f2e6f51c1945b6f6236835c")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "fbdc8d0fd312e3a7f49063c580cf5d92")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "4f47415628201a4f3c33e48ec066677b")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "1e89d2b88f4218363b9322b38e9536f2")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "a7beb0b16756257a274eecf73474ed90")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "dfcba9acc32b4a1dfeceea135b48615a")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "2082c70e08f1c14290c3812021832f83")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "f70d8b5358bc2f76696f14b7a807ede0")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "5e657fd6a44dcdc7674b6e5a2de5dc83")},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,35 +25,41 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.recalibration.QuantizationInfo;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationReport;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
|
||||
import org.broadinstitute.sting.utils.recalibration.*;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
|
||||
|
|
@ -103,19 +109,20 @@ import java.util.ArrayList;
|
|||
* </pre>
|
||||
*/
|
||||
|
||||
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@DocumentedGATKFeature(groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class})
|
||||
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
|
||||
@By(DataSource.READS)
|
||||
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality
|
||||
@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta
|
||||
public class BaseRecalibrator extends LocusWalker<Long, Long> {
|
||||
|
||||
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
|
||||
@PartitionBy(PartitionType.READ)
|
||||
public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSchedulable {
|
||||
@ArgumentCollection
|
||||
private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
|
||||
public double BAQGOP = BAQ.DEFAULT_GOP;
|
||||
|
||||
private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization
|
||||
|
||||
|
||||
private RecalibrationTables recalibrationTables;
|
||||
|
||||
private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental)
|
||||
|
|
@ -124,12 +131,12 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
|
|||
|
||||
private int minimumQToUse;
|
||||
|
||||
protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
|
||||
protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
|
||||
protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
|
||||
|
||||
private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.";
|
||||
|
||||
private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
|
||||
private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
|
||||
|
||||
/**
|
||||
* Parse the -cov arguments and create a list of covariates to be used here
|
||||
|
|
@ -137,6 +144,8 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
|
|||
*/
|
||||
public void initialize() {
|
||||
|
||||
baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty
|
||||
|
||||
// check for unsupported access
|
||||
if (getToolkit().isGATKLite() && !getToolkit().getArguments().disableIndelQuals)
|
||||
throw new UserException.NotSupportedInGATKLite("base insertion/deletion recalibration is not supported, please use the --disable_indel_quals argument");
|
||||
|
|
@ -185,13 +194,21 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
|
|||
recalibrationEngine.initialize(requestedCovariates, recalibrationTables);
|
||||
|
||||
minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
|
||||
|
||||
try {
|
||||
// fasta reference reader for use with BAQ calculation
|
||||
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private RecalibrationEngine initializeRecalibrationEngine() {
|
||||
|
||||
final Class recalibrationEngineClass = GATKLiteUtils.getProtectedClassIfAvailable(RecalibrationEngine.class);
|
||||
try {
|
||||
Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null);
|
||||
final Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null);
|
||||
constructor.setAccessible(true);
|
||||
return (RecalibrationEngine)constructor.newInstance();
|
||||
}
|
||||
|
|
@ -200,60 +217,207 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
|
|||
}
|
||||
}
|
||||
|
||||
private boolean readHasBeenSkipped( final GATKSAMRecord read ) {
|
||||
return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE);
|
||||
}
|
||||
|
||||
private boolean isLowQualityBase( final PileupElement p ) {
|
||||
return p.getQual() < minimumQToUse;
|
||||
}
|
||||
|
||||
private boolean readNotSeen( final GATKSAMRecord read ) {
|
||||
return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE);
|
||||
private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) {
|
||||
return read.getBaseQualities()[offset] < minimumQToUse;
|
||||
}
|
||||
|
||||
/**
|
||||
* For each read at this locus get the various covariate values and increment that location in the map based on
|
||||
* whether or not the base matches the reference at this particular location
|
||||
*
|
||||
* @param tracker the reference metadata tracker
|
||||
* @param ref the reference context
|
||||
* @param context the alignment context
|
||||
* @return returns 1, but this value isn't used in the reduce step
|
||||
*/
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
long countedSites = 0L;
|
||||
// Only analyze sites not present in the provided known sites
|
||||
if (tracker.getValues(RAC.knownSites).size() == 0) {
|
||||
for (final PileupElement p : context.getBasePileup()) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
final int offset = p.getOffset();
|
||||
public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) {
|
||||
|
||||
// This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
|
||||
if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) )
|
||||
continue;
|
||||
final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead);
|
||||
if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it
|
||||
|
||||
if (readNotSeen(read)) {
|
||||
read.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
|
||||
RecalUtils.parsePlatformForRead(read, RAC);
|
||||
if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) {
|
||||
read.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true);
|
||||
continue;
|
||||
RecalUtils.parsePlatformForRead(read, RAC);
|
||||
if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls
|
||||
return 0L; // skip this read completely
|
||||
}
|
||||
read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates));
|
||||
|
||||
final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases
|
||||
final int[] isSNP = calculateIsSNP(read, ref, originalRead);
|
||||
final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
|
||||
final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
|
||||
final byte[] baqArray = calculateBAQArray(read);
|
||||
|
||||
if( baqArray != null ) { // some reads just can't be BAQ'ed
|
||||
final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray);
|
||||
final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray);
|
||||
final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray);
|
||||
recalibrationEngine.updateDataForRead(read, skip, snpErrors, insertionErrors, deletionErrors);
|
||||
return 1L;
|
||||
} else {
|
||||
return 0L;
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
|
||||
final byte[] bases = read.getReadBases();
|
||||
final boolean[] skip = new boolean[bases.length];
|
||||
final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites));
|
||||
for( int iii = 0; iii < bases.length; iii++ ) {
|
||||
skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii);
|
||||
}
|
||||
return skip;
|
||||
}
|
||||
|
||||
protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) {
|
||||
return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset);
|
||||
}
|
||||
|
||||
protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List<Feature> features ) {
|
||||
final int BUFFER_SIZE = 0;
|
||||
final int readLength = read.getReadBases().length;
|
||||
final boolean[] knownSites = new boolean[readLength];
|
||||
Arrays.fill(knownSites, false);
|
||||
for( final Feature f : features ) {
|
||||
int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
|
||||
if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; }
|
||||
int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true);
|
||||
if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; }
|
||||
Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true);
|
||||
}
|
||||
return knownSites;
|
||||
}
|
||||
|
||||
// BUGBUG: can be merged with calculateIsIndel
|
||||
protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) {
|
||||
final byte[] readBases = read.getReadBases();
|
||||
final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd());
|
||||
final int[] snp = new int[readBases.length];
|
||||
int readPos = 0;
|
||||
int refPos = 0;
|
||||
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
|
||||
final int elementLength = ce.getLength();
|
||||
switch (ce.getOperator()) {
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
for( int iii = 0; iii < elementLength; iii++ ) {
|
||||
snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 );
|
||||
readPos++;
|
||||
refPos++;
|
||||
}
|
||||
read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates));
|
||||
}
|
||||
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
if (!ReadUtils.isSOLiDRead(read) ||
|
||||
RAC.SOLID_RECAL_MODE == RecalUtils.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||
RecalUtils.isColorSpaceConsistent(read, offset))
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
recalibrationEngine.updateDataForPileupElement(p, ref.getBase());
|
||||
break;
|
||||
case D:
|
||||
case N:
|
||||
refPos += elementLength;
|
||||
break;
|
||||
case I:
|
||||
case S: // ReferenceContext doesn't have the soft clipped bases!
|
||||
readPos += elementLength;
|
||||
break;
|
||||
case H:
|
||||
case P:
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
|
||||
}
|
||||
countedSites++;
|
||||
}
|
||||
return snp;
|
||||
}
|
||||
|
||||
protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) {
|
||||
final byte[] readBases = read.getReadBases();
|
||||
final int[] indel = new int[readBases.length];
|
||||
Arrays.fill(indel, 0);
|
||||
int readPos = 0;
|
||||
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
|
||||
final int elementLength = ce.getLength();
|
||||
switch (ce.getOperator()) {
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
case S:
|
||||
{
|
||||
readPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case D:
|
||||
{
|
||||
final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) );
|
||||
indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 );
|
||||
break;
|
||||
}
|
||||
case I:
|
||||
{
|
||||
final boolean forwardStrandRead = !read.getReadNegativeStrandFlag();
|
||||
if( forwardStrandRead ) {
|
||||
indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
|
||||
}
|
||||
for (int iii = 0; iii < elementLength; iii++) {
|
||||
readPos++;
|
||||
}
|
||||
if( !forwardStrandRead ) {
|
||||
indel[(readPos < indel.length ? readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 );
|
||||
}
|
||||
break;
|
||||
}
|
||||
case N:
|
||||
case H:
|
||||
case P:
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator());
|
||||
}
|
||||
}
|
||||
return indel;
|
||||
}
|
||||
|
||||
protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) {
|
||||
if(errorArray.length != baqArray.length ) {
|
||||
throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
|
||||
}
|
||||
|
||||
return countedSites;
|
||||
final byte NO_BAQ_UNCERTAINTY = (byte)'@';
|
||||
final int BLOCK_START_UNSET = -1;
|
||||
|
||||
final double[] fractionalErrors = new double[baqArray.length];
|
||||
Arrays.fill(fractionalErrors, 0.0);
|
||||
boolean inBlock = false;
|
||||
int blockStartIndex = BLOCK_START_UNSET;
|
||||
int iii;
|
||||
for( iii = 0; iii < fractionalErrors.length; iii++ ) {
|
||||
if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) {
|
||||
if( !inBlock ) {
|
||||
fractionalErrors[iii] = (double) errorArray[iii];
|
||||
} else {
|
||||
calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors);
|
||||
inBlock = false; // reset state variables
|
||||
blockStartIndex = BLOCK_START_UNSET; // reset state variables
|
||||
}
|
||||
} else {
|
||||
inBlock = true;
|
||||
if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; }
|
||||
}
|
||||
}
|
||||
if( inBlock ) {
|
||||
calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors);
|
||||
}
|
||||
if( fractionalErrors.length != errorArray.length ) {
|
||||
throw new ReviewedStingException("Output array length mismatch detected. Malformed read?");
|
||||
}
|
||||
return fractionalErrors;
|
||||
}
|
||||
|
||||
private static void calculateAndStoreErrorsInBlock( final int iii,
|
||||
final int blockStartIndex,
|
||||
final int[] errorArray,
|
||||
final double[] fractionalErrors ) {
|
||||
int totalErrors = 0;
|
||||
for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) {
|
||||
totalErrors += errorArray[jjj];
|
||||
}
|
||||
for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) {
|
||||
fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1));
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] calculateBAQArray( final GATKSAMRecord read ) {
|
||||
baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG);
|
||||
return BAQ.getBAQTag(read);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -286,12 +450,12 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
|
|||
generateReport();
|
||||
logger.info("...done!");
|
||||
|
||||
if (RAC.RECAL_PDF_FILE != null) {
|
||||
if ( RAC.RECAL_PDF_FILE != null ) {
|
||||
logger.info("Generating recalibration plots...");
|
||||
generatePlots();
|
||||
}
|
||||
|
||||
logger.info("Processed: " + result + " sites");
|
||||
logger.info("Processed: " + result + " reads");
|
||||
}
|
||||
|
||||
private void generatePlots() {
|
||||
|
|
@ -304,7 +468,6 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
|
|||
RecalUtils.generateRecalibrationPlot(RAC, recalibrationTables, requestedCovariates);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* go through the quality score table and use the # observations and the empirical quality score
|
||||
* to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to
|
||||
|
|
@ -317,5 +480,4 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> {
|
|||
private void generateReport() {
|
||||
RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -33,7 +33,5 @@ public interface RecalibrationEngine {
|
|||
|
||||
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables);
|
||||
|
||||
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase);
|
||||
|
||||
public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,41 +46,30 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
|
|||
this.recalibrationTables = recalibrationTables;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches for mismatches only.
|
||||
*
|
||||
* @param pileupElement The pileup element to update
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
@Override
|
||||
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) {
|
||||
final int offset = pileupElement.getOffset();
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead());
|
||||
|
||||
final byte qual = pileupElement.getQual();
|
||||
final boolean isError = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase);
|
||||
|
||||
final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION);
|
||||
final int eventIndex = EventType.BASE_SUBSTITUTION.index;
|
||||
|
||||
// TODO: should this really be combine rather than increment?
|
||||
combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex);
|
||||
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex);
|
||||
|
||||
for (int i = 2; i < covariates.length; i++) {
|
||||
if (keys[i] < 0)
|
||||
continue;
|
||||
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
|
||||
throw new UnsupportedOperationException("Delocalized BQSR is not available in the GATK-lite version");
|
||||
for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
|
||||
if( !skip[offset] ) {
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(read);
|
||||
|
||||
final byte qual = read.getBaseQualities()[offset];
|
||||
final double isError = snpErrors[offset];
|
||||
|
||||
final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION);
|
||||
final int eventIndex = EventType.BASE_SUBSTITUTION.index;
|
||||
|
||||
combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex);
|
||||
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex);
|
||||
|
||||
for (int i = 2; i < covariates.length; i++) {
|
||||
if (keys[i] < 0)
|
||||
continue;
|
||||
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -90,10 +79,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
|
|||
* @param isError whether or not the observation is an error
|
||||
* @return a new RecalDatum object with the observation and the error
|
||||
*/
|
||||
protected RecalDatum createDatumObject(final byte reportedQual, final boolean isError) {
|
||||
return new RecalDatum(1, isError ? 1:0, reportedQual);
|
||||
}
|
||||
|
||||
protected RecalDatum createDatumObject(final byte reportedQual, final double isError) {
|
||||
return new RecalDatum(1, isError, reportedQual);
|
||||
}
|
||||
|
|
@ -108,46 +93,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
|
|||
return (ReadCovariates) read.getTemporaryAttribute(BaseRecalibrator.COVARS_ATTRIBUTE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Increments the RecalDatum at the specified position in the specified table, or put a new item there
|
||||
* if there isn't already one.
|
||||
*
|
||||
* Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put()
|
||||
* to return false if another thread inserts a new item at our position in the middle of our put operation.
|
||||
*
|
||||
* @param table the table that holds/will hold our item
|
||||
* @param qual qual for this event
|
||||
* @param isError error status for this event
|
||||
* @param keys location in table of our item
|
||||
*/
|
||||
protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray<RecalDatum> table, final byte qual, final boolean isError, final int... keys ) {
|
||||
final RecalDatum existingDatum = table.get(keys);
|
||||
|
||||
// There are three cases here:
|
||||
//
|
||||
// 1. The original get succeeded (existingDatum != null) because there already was an item in this position.
|
||||
// In this case we can just increment the existing item.
|
||||
//
|
||||
// 2. The original get failed (existingDatum == null), and we successfully put a new item in this position
|
||||
// and are done.
|
||||
//
|
||||
// 3. The original get failed (existingDatum == null), AND the put fails because another thread comes along
|
||||
// in the interim and puts an item in this position. In this case we need to do another get after the
|
||||
// failed put to get the item the other thread put here and increment it.
|
||||
if ( existingDatum == null ) {
|
||||
// No existing item, try to put a new one
|
||||
if ( ! table.put(createDatumObject(qual, isError), keys) ) {
|
||||
// Failed to put a new item because another thread came along and put an item here first.
|
||||
// Get the newly-put item and increment it (item is guaranteed to exist at this point)
|
||||
table.get(keys).increment(isError);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Easy case: already an item here, so increment it
|
||||
existingDatum.increment(isError);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Increments the RecalDatum at the specified position in the specified table, or put a new item there
|
||||
* if there isn't already one.
|
||||
|
|
@ -177,36 +122,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a
|
||||
* new item there if there isn't already one.
|
||||
*
|
||||
* Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put()
|
||||
* to return false if another thread inserts a new item at our position in the middle of our put operation.
|
||||
*
|
||||
* @param table the table that holds/will hold our item
|
||||
* @param qual qual for this event
|
||||
* @param isError error status for this event
|
||||
* @param keys location in table of our item
|
||||
*/
|
||||
protected void combineDatumOrPutIfNecessary( final NestedIntegerArray<RecalDatum> table, final byte qual, final boolean isError, final int... keys ) {
|
||||
final RecalDatum existingDatum = table.get(keys);
|
||||
final RecalDatum newDatum = createDatumObject(qual, isError);
|
||||
|
||||
if ( existingDatum == null ) {
|
||||
// No existing item, try to put a new one
|
||||
if ( ! table.put(newDatum, keys) ) {
|
||||
// Failed to put a new item because another thread came along and put an item here first.
|
||||
// Get the newly-put item and combine it with our item (item is guaranteed to exist at this point)
|
||||
table.get(keys).combine(newDatum);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Easy case: already an item here, so combine it with our item
|
||||
existingDatum.combine(newDatum);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a
|
||||
* new item there if there isn't already one.
|
||||
|
|
|
|||
Loading…
Reference in New Issue