diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index be8357679..d0bcd0eb3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -39,57 +39,12 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp // optimization: only allocate temp arrays once per thread private final ThreadLocal threadLocalTempQualArray = new ThreadLocalArray(EventType.values().length, byte.class); - private final ThreadLocal threadLocalTempErrorArray = new ThreadLocalArray(EventType.values().length, boolean.class); private final ThreadLocal threadLocalTempFractionalErrorArray = new ThreadLocalArray(EventType.values().length, double.class); public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { super.initialize(covariates, recalibrationTables); } - /** - * Loop through the list of requested covariates and pick out the value from the read, offset, and reference - * Using the list of covariate values as a key, pick out the RecalDatum and increment, - * adding one to the number of observations and potentially one to the number of mismatches for all three - * categories (mismatches, insertions and deletions). - * - * @param pileupElement The pileup element to update - * @param refBase The reference base at this locus - */ - @Override - public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { - final int offset = pileupElement.getOffset(); - final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); - - byte[] tempQualArray = threadLocalTempQualArray.get(); - boolean[] tempErrorArray = threadLocalTempErrorArray.get(); - - tempQualArray[EventType.BASE_SUBSTITUTION.index] = pileupElement.getQual(); - tempErrorArray[EventType.BASE_SUBSTITUTION.index] = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase); - tempQualArray[EventType.BASE_INSERTION.index] = pileupElement.getBaseInsertionQual(); - tempErrorArray[EventType.BASE_INSERTION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterInsertion() : pileupElement.isBeforeInsertion(); - tempQualArray[EventType.BASE_DELETION.index] = pileupElement.getBaseDeletionQual(); - tempErrorArray[EventType.BASE_DELETION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterDeletedBase() : pileupElement.isBeforeDeletedBase(); - - for (final EventType eventType : EventType.values()) { - final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; - final byte qual = tempQualArray[eventIndex]; - final boolean isError = tempErrorArray[eventIndex]; - - // TODO: should this really be combine rather than increment? - combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - - incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - - incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); - } - } - } - @Override public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { for( int offset = 0; offset < read.getReadBases().length; offset++ ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 780ef9e0b..b839382dc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -51,21 +51,21 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "55a46d8f5d2f9acfa2d7659e18b6df43")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "8e930f56a8905a5999af7d6ba8a92f91")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "8e87bee4bd6531b405082c4da785f1f5")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "b309a5f57b861d7f31cb76cdac4ff8a7")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "4c75d47ed2cf93b499be8fbb29b24dfd")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "43b06e5568a89e4ce1dd9146ce580c89")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "25f4f48dba27475b0cd7c06ef0239aba")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "dfcba9acc32b4a1dfeceea135b48615a")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "e8077b721f2e6f51c1945b6f6236835c")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "fbdc8d0fd312e3a7f49063c580cf5d92")}, - {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "4f47415628201a4f3c33e48ec066677b")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "1e89d2b88f4218363b9322b38e9536f2")}, - {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "a7beb0b16756257a274eecf73474ed90")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "dfcba9acc32b4a1dfeceea135b48615a")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "2082c70e08f1c14290c3812021832f83")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")}, + {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")}, + {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "f70d8b5358bc2f76696f14b7a807ede0")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "5e657fd6a44dcdc7674b6e5a2de5dc83")}, }; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 245c8e3e8..9506510a9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -25,35 +25,41 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.CigarElement; import net.sf.samtools.SAMFileHeader; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; +import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.recalibration.QuantizationInfo; -import org.broadinstitute.sting.utils.recalibration.RecalUtils; -import org.broadinstitute.sting.utils.recalibration.RecalibrationReport; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.recalibration.*; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; import java.lang.reflect.Constructor; import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; /** * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context). @@ -103,19 +109,20 @@ import java.util.ArrayList; * */ -@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) +@DocumentedGATKFeature(groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class}) @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) -@By(DataSource.READS) -@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file -@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality -@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta -public class BaseRecalibrator extends LocusWalker { - +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) +@PartitionBy(PartitionType.READ) +public class BaseRecalibrator extends ReadWalker implements NanoSchedulable { @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates + @Advanced + @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) + public double BAQGOP = BAQ.DEFAULT_GOP; + private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization - + private RecalibrationTables recalibrationTables; private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) @@ -124,12 +131,12 @@ public class BaseRecalibrator extends LocusWalker { private int minimumQToUse; - protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped. - protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed. protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."; + private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector + private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation /** * Parse the -cov arguments and create a list of covariates to be used here @@ -137,6 +144,8 @@ public class BaseRecalibrator extends LocusWalker { */ public void initialize() { + baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty + // check for unsupported access if (getToolkit().isGATKLite() && !getToolkit().getArguments().disableIndelQuals) throw new UserException.NotSupportedInGATKLite("base insertion/deletion recalibration is not supported, please use the --disable_indel_quals argument"); @@ -185,13 +194,21 @@ public class BaseRecalibrator extends LocusWalker { recalibrationEngine.initialize(requestedCovariates, recalibrationTables); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; + + try { + // fasta reference reader for use with BAQ calculation + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + } catch( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); + } + } private RecalibrationEngine initializeRecalibrationEngine() { final Class recalibrationEngineClass = GATKLiteUtils.getProtectedClassIfAvailable(RecalibrationEngine.class); try { - Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null); + final Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null); constructor.setAccessible(true); return (RecalibrationEngine)constructor.newInstance(); } @@ -200,60 +217,207 @@ public class BaseRecalibrator extends LocusWalker { } } - private boolean readHasBeenSkipped( final GATKSAMRecord read ) { - return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE); - } - - private boolean isLowQualityBase( final PileupElement p ) { - return p.getQual() < minimumQToUse; - } - - private boolean readNotSeen( final GATKSAMRecord read ) { - return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE); + private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) { + return read.getBaseQualities()[offset] < minimumQToUse; } /** * For each read at this locus get the various covariate values and increment that location in the map based on * whether or not the base matches the reference at this particular location - * - * @param tracker the reference metadata tracker - * @param ref the reference context - * @param context the alignment context - * @return returns 1, but this value isn't used in the reduce step */ - public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - long countedSites = 0L; - // Only analyze sites not present in the provided known sites - if (tracker.getValues(RAC.knownSites).size() == 0) { - for (final PileupElement p : context.getBasePileup()) { - final GATKSAMRecord read = p.getRead(); - final int offset = p.getOffset(); + public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) { - // This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases) - if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) ) - continue; + final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead); + if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it - if (readNotSeen(read)) { - read.setTemporaryAttribute(SEEN_ATTRIBUTE, true); - RecalUtils.parsePlatformForRead(read, RAC); - if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { - read.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true); - continue; + RecalUtils.parsePlatformForRead(read, RAC); + if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls + return 0L; // skip this read completely + } + read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates)); + + final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases + final int[] isSNP = calculateIsSNP(read, ref, originalRead); + final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); + final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); + final byte[] baqArray = calculateBAQArray(read); + + if( baqArray != null ) { // some reads just can't be BAQ'ed + final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray); + final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray); + final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray); + recalibrationEngine.updateDataForRead(read, skip, snpErrors, insertionErrors, deletionErrors); + return 1L; + } else { + return 0L; + } + } + + protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) { + final byte[] bases = read.getReadBases(); + final boolean[] skip = new boolean[bases.length]; + final boolean[] knownSites = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites)); + for( int iii = 0; iii < bases.length; iii++ ) { + skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii); + } + return skip; + } + + protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) { + return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset); + } + + protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) { + final int BUFFER_SIZE = 0; + final int readLength = read.getReadBases().length; + final boolean[] knownSites = new boolean[readLength]; + Arrays.fill(knownSites, false); + for( final Feature f : features ) { + int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? + if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; } + int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true); + if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; } + Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true); + } + return knownSites; + } + + // BUGBUG: can be merged with calculateIsIndel + protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) { + final byte[] readBases = read.getReadBases(); + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd()); + final int[] snp = new int[readBases.length]; + int readPos = 0; + int refPos = 0; + for ( final CigarElement ce : read.getCigar().getCigarElements() ) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + for( int iii = 0; iii < elementLength; iii++ ) { + snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 0 : 1 ); + readPos++; + refPos++; } - read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates)); - } - - // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - if (!ReadUtils.isSOLiDRead(read) || - RAC.SOLID_RECAL_MODE == RecalUtils.SOLID_RECAL_MODE.DO_NOTHING || - RecalUtils.isColorSpaceConsistent(read, offset)) - // This base finally passed all the checks for a good base, so add it to the big data hashmap - recalibrationEngine.updateDataForPileupElement(p, ref.getBase()); + break; + case D: + case N: + refPos += elementLength; + break; + case I: + case S: // ReferenceContext doesn't have the soft clipped bases! + readPos += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); } - countedSites++; + } + return snp; + } + + protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) { + final byte[] readBases = read.getReadBases(); + final int[] indel = new int[readBases.length]; + Arrays.fill(indel, 0); + int readPos = 0; + for ( final CigarElement ce : read.getCigar().getCigarElements() ) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + case S: + { + readPos += elementLength; + break; + } + case D: + { + final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) ); + indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 ); + break; + } + case I: + { + final boolean forwardStrandRead = !read.getReadNegativeStrandFlag(); + if( forwardStrandRead ) { + indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 ); + } + for (int iii = 0; iii < elementLength; iii++) { + readPos++; + } + if( !forwardStrandRead ) { + indel[(readPos < indel.length ? readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 ); + } + break; + } + case N: + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return indel; + } + + protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) { + if(errorArray.length != baqArray.length ) { + throw new ReviewedStingException("Array length mismatch detected. Malformed read?"); } - return countedSites; + final byte NO_BAQ_UNCERTAINTY = (byte)'@'; + final int BLOCK_START_UNSET = -1; + + final double[] fractionalErrors = new double[baqArray.length]; + Arrays.fill(fractionalErrors, 0.0); + boolean inBlock = false; + int blockStartIndex = BLOCK_START_UNSET; + int iii; + for( iii = 0; iii < fractionalErrors.length; iii++ ) { + if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) { + if( !inBlock ) { + fractionalErrors[iii] = (double) errorArray[iii]; + } else { + calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors); + inBlock = false; // reset state variables + blockStartIndex = BLOCK_START_UNSET; // reset state variables + } + } else { + inBlock = true; + if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; } + } + } + if( inBlock ) { + calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors); + } + if( fractionalErrors.length != errorArray.length ) { + throw new ReviewedStingException("Output array length mismatch detected. Malformed read?"); + } + return fractionalErrors; + } + + private static void calculateAndStoreErrorsInBlock( final int iii, + final int blockStartIndex, + final int[] errorArray, + final double[] fractionalErrors ) { + int totalErrors = 0; + for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) { + totalErrors += errorArray[jjj]; + } + for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) { + fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1)); + } + } + + private byte[] calculateBAQArray( final GATKSAMRecord read ) { + baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG); + return BAQ.getBAQTag(read); } /** @@ -286,12 +450,12 @@ public class BaseRecalibrator extends LocusWalker { generateReport(); logger.info("...done!"); - if (RAC.RECAL_PDF_FILE != null) { + if ( RAC.RECAL_PDF_FILE != null ) { logger.info("Generating recalibration plots..."); generatePlots(); } - logger.info("Processed: " + result + " sites"); + logger.info("Processed: " + result + " reads"); } private void generatePlots() { @@ -304,7 +468,6 @@ public class BaseRecalibrator extends LocusWalker { RecalUtils.generateRecalibrationPlot(RAC, recalibrationTables, requestedCovariates); } - /** * go through the quality score table and use the # observations and the empirical quality score * to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to @@ -317,5 +480,4 @@ public class BaseRecalibrator extends LocusWalker { private void generateReport() { RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates); } -} - +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index ce60f5a3a..962d62d5e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -33,7 +33,5 @@ public interface RecalibrationEngine { public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables); - public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase); - public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 1f7fbbf42..6031aa955 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -46,41 +46,30 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP this.recalibrationTables = recalibrationTables; } - /** - * Loop through the list of requested covariates and pick out the value from the read, offset, and reference - * Using the list of covariate values as a key, pick out the RecalDatum and increment, - * adding one to the number of observations and potentially one to the number of mismatches for mismatches only. - * - * @param pileupElement The pileup element to update - * @param refBase The reference base at this locus - */ - @Override - public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { - final int offset = pileupElement.getOffset(); - final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); - - final byte qual = pileupElement.getQual(); - final boolean isError = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase); - - final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION); - final int eventIndex = EventType.BASE_SUBSTITUTION.index; - - // TODO: should this really be combine rather than increment? - combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - - incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - - incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); - } - } - @Override public void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { - throw new UnsupportedOperationException("Delocalized BQSR is not available in the GATK-lite version"); + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { + if( !skip[offset] ) { + final ReadCovariates readCovariates = covariateKeySetFrom(read); + + final byte qual = read.getBaseQualities()[offset]; + final double isError = snpErrors[offset]; + + final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION); + final int eventIndex = EventType.BASE_SUBSTITUTION.index; + + combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); + + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); + + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + } + } + } } /** @@ -90,10 +79,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP * @param isError whether or not the observation is an error * @return a new RecalDatum object with the observation and the error */ - protected RecalDatum createDatumObject(final byte reportedQual, final boolean isError) { - return new RecalDatum(1, isError ? 1:0, reportedQual); - } - protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { return new RecalDatum(1, isError, reportedQual); } @@ -108,46 +93,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP return (ReadCovariates) read.getTemporaryAttribute(BaseRecalibrator.COVARS_ATTRIBUTE); } - /** - * Increments the RecalDatum at the specified position in the specified table, or put a new item there - * if there isn't already one. - * - * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() - * to return false if another thread inserts a new item at our position in the middle of our put operation. - * - * @param table the table that holds/will hold our item - * @param qual qual for this event - * @param isError error status for this event - * @param keys location in table of our item - */ - protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final boolean isError, final int... keys ) { - final RecalDatum existingDatum = table.get(keys); - - // There are three cases here: - // - // 1. The original get succeeded (existingDatum != null) because there already was an item in this position. - // In this case we can just increment the existing item. - // - // 2. The original get failed (existingDatum == null), and we successfully put a new item in this position - // and are done. - // - // 3. The original get failed (existingDatum == null), AND the put fails because another thread comes along - // in the interim and puts an item in this position. In this case we need to do another get after the - // failed put to get the item the other thread put here and increment it. - if ( existingDatum == null ) { - // No existing item, try to put a new one - if ( ! table.put(createDatumObject(qual, isError), keys) ) { - // Failed to put a new item because another thread came along and put an item here first. - // Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(isError); - } - } - else { - // Easy case: already an item here, so increment it - existingDatum.increment(isError); - } - } - /** * Increments the RecalDatum at the specified position in the specified table, or put a new item there * if there isn't already one. @@ -177,36 +122,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP } } - /** - * Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a - * new item there if there isn't already one. - * - * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() - * to return false if another thread inserts a new item at our position in the middle of our put operation. - * - * @param table the table that holds/will hold our item - * @param qual qual for this event - * @param isError error status for this event - * @param keys location in table of our item - */ - protected void combineDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final boolean isError, final int... keys ) { - final RecalDatum existingDatum = table.get(keys); - final RecalDatum newDatum = createDatumObject(qual, isError); - - if ( existingDatum == null ) { - // No existing item, try to put a new one - if ( ! table.put(newDatum, keys) ) { - // Failed to put a new item because another thread came along and put an item here first. - // Get the newly-put item and combine it with our item (item is guaranteed to exist at this point) - table.get(keys).combine(newDatum); - } - } - else { - // Easy case: already an item here, so combine it with our item - existingDatum.combine(newDatum); - } - } - /** * Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a * new item there if there isn't already one.