From b76dbc72f0d856866689e8b150b0acc39b4ece59 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 26 Sep 2011 08:13:44 -0400 Subject: [PATCH 1/8] Fixed interval navigation bug. If a read was hard clipped away from the current interval, all subsequent reads within that interval (not hardclipped) would be filtered out. Fixed. --- .../sting/utils/sam/ReadUtils.java | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index c328bbc5a..49d79a72c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -157,7 +157,7 @@ public class ReadUtils { * |----------------| (interval) * <--------> (read) */ - public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} + public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} /** * God, there's a huge information asymmetry in SAM format: @@ -640,27 +640,35 @@ public class ReadUtils { */ public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) { - int start = getRefCoordSoftUnclippedStart(read); - int stop = getRefCoordSoftUnclippedEnd(read); + int sStart = getRefCoordSoftUnclippedStart(read); + int sStop = getRefCoordSoftUnclippedEnd(read); + int uStart = read.getUnclippedStart(); + int uStop = read.getUnclippedEnd(); if ( !read.getReferenceName().equals(interval.getContig()) ) return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; - else if ( stop < interval.getStart() ) + else if ( uStop < interval.getStart() ) return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; - else if ( start > interval.getStop() ) + else if ( uStart > interval.getStop() ) return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; - else if ( (start >= interval.getStart()) && - (stop <= interval.getStop()) ) + else if ( sStop < interval.getStart() ) + return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; + + else if ( sStart > interval.getStop() ) + return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; + + else if ( (sStart >= interval.getStart()) && + (sStop <= interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_CONTAINED; - else if ( (start < interval.getStart()) && - (stop > interval.getStop()) ) + else if ( (sStart < interval.getStart()) && + (sStop > interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; - else if ( (start < interval.getStart()) ) + else if ( (sStart < interval.getStart()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT; else From 317b95fa5738ee0434aa8e122408abe88c0b9242 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 26 Sep 2011 11:46:45 -0500 Subject: [PATCH 3/8] Fixing some annotator docs --- .../gatk/walkers/annotator/DepthOfCoverage.java | 15 +++++++++++++-- .../walkers/annotator/DepthPerAlleleBySample.java | 8 ++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 8826de232..864be55b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -19,8 +19,19 @@ import java.util.Map; /** * Total (unfiltered) depth over all samples. * - * Affected by downsampling (-dcov) though, so the max value one can obtain for N samples with -dcov D - * is N * D + * This and AD are complementary fields that are two important ways of thinking about the depth of the data for this sample + * at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal + * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site. + * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the + * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the + * power I have to determine the genotype of the sample at this site, while the AD tells me how many times + * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering + * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like + * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would + * normally be excluded from the statistical calculations going into GQ and QUAL. + * + * Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with + * -dcov D is N * D */ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 1cd30c51d..5d706d9c5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -24,9 +24,9 @@ import java.util.Map; /** - * The depth of coverage of each VCF allele in this sample + * The depth of coverage of each VCF allele in this sample. * - * Complementary fields that two important ways of thinking about the depth of the data for this sample + * This and DP are complementary fields that are two important ways of thinking about the depth of the data for this sample * at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site. * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the @@ -38,8 +38,8 @@ import java.util.Map; * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that * the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that * are actually present and correctly left-aligned in the alignments themselves). Because of this fact and - * because the AD includes reads and bases that were filtered by the Unified Genotyper, one should not base - * assumptions about the underlying genotype based on it; instead, the genotype likelihoods (PLs) are what + * because the AD includes reads and bases that were filtered by the Unified Genotyper, one should not base + * assumptions about the underlying genotype based on it; instead, the genotype likelihoods (PLs) are what * determine the genotype calls (see below). */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { From 4f09453470d3f86cd985eb31215623ae8d62a1d8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 26 Sep 2011 12:58:31 -0400 Subject: [PATCH 4/8] Refactored reduced read utilities -- UnitTests for key functions on reduced reads -- PileupElement calls static functions in ReadUtils -- Simple routine that takes a reduced read and fills in its quals with its reduced qual --- .../sting/utils/pileup/PileupElement.java | 9 ++-- .../sting/utils/sam/ReadUtils.java | 37 ++++++++++++++- .../sting/utils/ReadUtilsUnitTest.java | 45 ++++++++++++++++++- 3 files changed, 82 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 12899e898..053864791 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -81,20 +81,17 @@ public class PileupElement { // // -------------------------------------------------------------------------- - private Integer getReducedReadQualityTagValue() { - return getRead().getIntegerAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG); - } - public boolean isReducedRead() { - return getReducedReadQualityTagValue() != null; + return ReadUtils.isReducedRead(getRead()); } public int getReducedCount() { + if ( ! isReducedRead() ) throw new IllegalArgumentException("Cannot get reduced count for non-reduced read " + getRead().getReadName()); return (int)getQual(); } public byte getReducedQual() { - return (byte)(int)getReducedReadQualityTagValue(); + return (byte)(int)ReadUtils.getReducedReadQualityTagValue(getRead()); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 49d79a72c..8beb1b21a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -43,10 +43,43 @@ import java.util.*; * @version 0.1 */ public class ReadUtils { - public static final String REDUCED_READ_QUALITY_TAG = "RQ"; - private ReadUtils() { } + // ---------------------------------------------------------------------------------------------------- + // + // Reduced read utilities + // + // ---------------------------------------------------------------------------------------------------- + + public static final String REDUCED_READ_QUALITY_TAG = "RQ"; + + public final static Integer getReducedReadQualityTagValue(final SAMRecord read) { + return read.getIntegerAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG); + } + + public final static boolean isReducedRead(final SAMRecord read) { + return getReducedReadQualityTagValue(read) != null; + } + + public final static SAMRecord reducedReadWithReducedQuals(final SAMRecord read) { + if ( ! isReducedRead(read) ) throw new IllegalArgumentException("read must be a reduced read"); + try { + SAMRecord newRead = (SAMRecord)read.clone(); + byte reducedQual = (byte)(int)getReducedReadQualityTagValue(read); + byte[] newQuals = new byte[read.getBaseQualities().length]; + Arrays.fill(newQuals, reducedQual); + newRead.setBaseQualities(newQuals); + return newRead; + } catch ( CloneNotSupportedException e ) { + throw new ReviewedStingException("SAMRecord no longer supports clone", e); + } + } + + // ---------------------------------------------------------------------------------------------------- + // + // General utilities + // + // ---------------------------------------------------------------------------------------------------- public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { SAMFileHeader copy = new SAMFileHeader(); diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java index 7cb7fec98..e1fdadadc 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; @@ -12,9 +13,10 @@ import org.testng.annotations.Test; public class ReadUtilsUnitTest extends BaseTest { - SAMRecord read; + SAMRecord read, reducedRead; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; + final private static int REDUCED_READ_QUAL = 20; @BeforeTest public void init() { @@ -23,6 +25,11 @@ public class ReadUtilsUnitTest extends BaseTest { read.setReadUnmappedFlag(true); read.setReadBases(new String(BASES).getBytes()); read.setBaseQualityString(new String(QUALS)); + + reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); + reducedRead.setReadBases(BASES.getBytes()); + reducedRead.setBaseQualityString(QUALS); + reducedRead.setAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG, REDUCED_READ_QUAL); } private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) { @@ -38,4 +45,40 @@ public class ReadUtilsUnitTest extends BaseTest { @Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); } @Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); } @Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); } + + @Test + public void testReducedReads() { + Assert.assertFalse(ReadUtils.isReducedRead(read), "isReducedRead is false for normal read"); + Assert.assertEquals(ReadUtils.getReducedReadQualityTagValue(read), null, "No reduced read tag in normal read"); + + Assert.assertTrue(ReadUtils.isReducedRead(reducedRead), "isReducedRead is true for reduced read"); + Assert.assertEquals((int) ReadUtils.getReducedReadQualityTagValue(reducedRead), REDUCED_READ_QUAL, "Reduced read tag is set to expected value"); + } + + @Test + public void testreducedReadWithReducedQualsWithReducedRead() { + SAMRecord replacedRead = ReadUtils.reducedReadWithReducedQuals(reducedRead); + Assert.assertEquals(replacedRead.getReadBases(), reducedRead.getReadBases()); + Assert.assertEquals(replacedRead.getBaseQualities().length, reducedRead.getBaseQualities().length); + for ( int i = 0; i < replacedRead.getBaseQualities().length; i++) + Assert.assertEquals(replacedRead.getBaseQualities()[i], REDUCED_READ_QUAL); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testreducedReadWithReducedQualsWithNormalRead() { + ReadUtils.reducedReadWithReducedQuals(read); + } + + @Test + public void testReducedReadPileupElement() { + PileupElement readp = new PileupElement(read,0); + PileupElement reducedreadp = new PileupElement(reducedRead,0); + + Assert.assertFalse(readp.isReducedRead()); + + Assert.assertTrue(reducedreadp.isReducedRead()); + Assert.assertEquals(reducedreadp.getReducedCount(), 0); + Assert.assertEquals(reducedreadp.getReducedQual(), REDUCED_READ_QUAL); + + } } From fa0efbc4ca9b304e6e67326912e65263a996c56a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 26 Sep 2011 13:28:56 -0400 Subject: [PATCH 5/8] Refactoring of PairHMM to support reduced reads --- .../indels/PairHMMIndelErrorModel.java | 295 +++++++++--------- 1 file changed, 151 insertions(+), 144 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 31e9819ab..6e4db9303 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -244,31 +245,31 @@ public class PairHMMIndelErrorModel { /** * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) */ - /* - private void addCSVData(final File file, final String line) { - final String[] vals = line.split(","); + /* + private void addCSVData(final File file, final String line) { + final String[] vals = line.split(","); - // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical - throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + - " --Perhaps the read group string contains a comma and isn't being parsed correctly."); + // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly + if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical + throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); + } + + final Object[] key = new Object[requestedCovariates.size()]; + Covariate cov; + int iii; + for( iii = 0; iii < requestedCovariates.size(); iii++ ) { + cov = requestedCovariates.get( iii ); + key[iii] = cov.getValue( vals[iii] ); + } + + // Create a new datum using the number of observations, number of mismatches, and reported quality score + final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); + // Add that datum to all the collapsed tables which will be used in the sequential calculation + dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN ); } - final Object[] key = new Object[requestedCovariates.size()]; - Covariate cov; - int iii; - for( iii = 0; iii < requestedCovariates.size(); iii++ ) { - cov = requestedCovariates.get( iii ); - key[iii] = cov.getValue( vals[iii] ); - } - - // Create a new datum using the number of observations, number of mismatches, and reported quality score - final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); - // Add that datum to all the collapsed tables which will be used in the sequential calculation - dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN ); - } - -*/ + */ public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit) { this(indelGOP, indelGCP, deb, doCDP); this.doViterbi = dovit; @@ -588,7 +589,7 @@ public class PairHMMIndelErrorModel { } else { c = currentGOP[jm1]; - d = currentGCP[jm1]; + d = currentGCP[jm1]; } if (indI == X_METRIC_LENGTH-1) c = d = END_GAP_COST; @@ -707,12 +708,12 @@ public class PairHMMIndelErrorModel { } } public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, - ReferenceContext ref, int eventLength, - HashMap> indelLikelihoodMap){ + ReferenceContext ref, int eventLength, + HashMap> indelLikelihoodMap){ int numHaplotypes = haplotypeMap.size(); - double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; - double readLikelihoods[][] = new double[pileup.getReads().size()][numHaplotypes]; + final double readLikelihoods[][] = new double[pileup.size()][numHaplotypes]; + final int readCounts[] = new int[pileup.size()]; int readIdx=0; LinkedHashMap gapOpenProbabilityMap = new LinkedHashMap(); @@ -751,6 +752,9 @@ public class PairHMMIndelErrorModel { } } for (PileupElement p: pileup) { + // > 1 when the read is a consensus read representing multiple independent observations + final boolean isReduced = ReadUtils.isReducedRead(p.getRead()); + readCounts[readIdx] = isReduced ? p.getReducedCount() : 1; // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) if (indelLikelihoodMap.containsKey(p)) { @@ -762,61 +766,65 @@ public class PairHMMIndelErrorModel { } else { //System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); - GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); + SAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); if (read == null) continue; + if ( isReduced ) { + read = ReadUtils.reducedReadWithReducedQuals(read); + } + if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) { continue; } double[] recalQuals = null; - /* - if (getGapPenaltiesFromFile) { - RecalDataManager.parseSAMRecord( read, RAC ); + /* + if (getGapPenaltiesFromFile) { + RecalDataManager.parseSAMRecord( read, RAC ); - recalQuals = new double[read.getReadLength()]; + recalQuals = new double[read.getReadLength()]; - //compute all covariate values for this read - final Comparable[][] covariateValues_offset_x_covar = - RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates); - // For each base in the read - for( int offset = 0; offset < read.getReadLength(); offset++ ) { + //compute all covariate values for this read + final Comparable[][] covariateValues_offset_x_covar = + RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates); + // For each base in the read + for( int offset = 0; offset < read.getReadLength(); offset++ ) { - final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; + final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; - Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); - if(qualityScore == null) - { - qualityScore = performSequentialQualityCalculation( fullCovariateKey ); - qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); - } + Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); + if(qualityScore == null) + { + qualityScore = performSequentialQualityCalculation( fullCovariateKey ); + qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); + } - recalQuals[offset] = -((double)qualityScore)/10.0; - } + recalQuals[offset] = -((double)qualityScore)/10.0; + } - // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) - // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent - if (DEBUG) { - System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(), - read.getAlignmentStart(), - read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), - read.getCigarString()); + // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) + // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent + if (DEBUG) { + System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(), + read.getAlignmentStart(), + read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), + read.getCigarString()); - byte[] bases = read.getReadBases(); - for (int k = 0; k < recalQuals.length; k++) { - System.out.format("%c",bases[k]); - } - System.out.println(); + byte[] bases = read.getReadBases(); + for (int k = 0; k < recalQuals.length; k++) { + System.out.format("%c",bases[k]); + } + System.out.println(); - for (int k = 0; k < recalQuals.length; k++) { - System.out.format("%.0f ",recalQuals[k]); - } - System.out.println(); - } - } */ + for (int k = 0; k < recalQuals.length; k++) { + System.out.format("%.0f ",recalQuals[k]); + } + System.out.println(); + } + } */ // get bases of candidate haplotypes that overlap with reads final int trailingBases = 3; @@ -971,7 +979,7 @@ public class PairHMMIndelErrorModel { System.out.println(new String(haplotypeBases)); } - Double readLikelihood = 0.0; + double readLikelihood = 0.0; if (useAffineGapModel) { double[] currentContextGOP = null; @@ -979,14 +987,14 @@ public class PairHMMIndelErrorModel { if (doContextDependentPenalties) { - if (getGapPenaltiesFromFile) { - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null); + if (getGapPenaltiesFromFile) { + readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null); - } else { - currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); - currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP); - } + } else { + currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); + currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); + readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP); + } } } @@ -1004,7 +1012,7 @@ public class PairHMMIndelErrorModel { if (DEBUG) { System.out.println("\nLikelihood summary"); - for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { + for (readIdx=0; readIdx < pileup.size(); readIdx++) { System.out.format("Read Index: %d ",readIdx); for (int i=0; i < readLikelihoods[readIdx].length; i++) System.out.format("L%d: %f ",i,readLikelihoods[readIdx][i]); @@ -1012,36 +1020,35 @@ public class PairHMMIndelErrorModel { } } + + return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); + } + + private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { + final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; + + // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix for (int i=0; i < numHaplotypes; i++) { for (int j=i; j < numHaplotypes; j++){ // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) - for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { - + for (int readIdx = 0; readIdx < readLikelihoods.length; readIdx++) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) // First term is approximated by Jacobian log with table lookup. if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j])) continue; - haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i], - readLikelihoods[readIdx][j]) + LOG_ONE_HALF); - + final double li = readLikelihoods[readIdx][i]; + final double lj = readLikelihoods[readIdx][j]; + final int readCount = readCounts[readIdx]; + haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF); } - - } } - return getHaplotypeLikelihoods(haplotypeLikehoodMatrix); - - } - - public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) { - int hSize = haplotypeLikehoodMatrix.length; - double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2]; - + final double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; int k=0; - for (int j=0; j < hSize; j++) { + for (int j=0; j < numHaplotypes; j++) { for (int i=0; i <= j; i++){ genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j]; } @@ -1066,63 +1073,63 @@ public class PairHMMIndelErrorModel { * @param key The list of Comparables that were calculated from the covariates * @return A recalibrated quality score as a byte */ - /* - private byte performSequentialQualityCalculation( final Object... key ) { + /* + private byte performSequentialQualityCalculation( final Object... key ) { - final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); - final Object[] readGroupCollapsedKey = new Object[1]; - final Object[] qualityScoreCollapsedKey = new Object[2]; - final Object[] covariateCollapsedKey = new Object[3]; + final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); + final Object[] readGroupCollapsedKey = new Object[1]; + final Object[] qualityScoreCollapsedKey = new Object[2]; + final Object[] covariateCollapsedKey = new Object[3]; - // The global quality shift (over the read group only) - readGroupCollapsedKey[0] = key[0]; - final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); - double globalDeltaQ = 0.0; - if( globalRecalDatum != null ) { - final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); - final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); - globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; - } - - // The shift in quality between reported and empirical - qualityScoreCollapsedKey[0] = key[0]; - qualityScoreCollapsedKey[1] = key[1]; - final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); - double deltaQReported = 0.0; - if( qReportedRecalDatum != null ) { - final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); - deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; - } - - // The shift in quality due to each covariate by itself in turn - double deltaQCovariates = 0.0; - double deltaQCovariateEmpirical; - covariateCollapsedKey[0] = key[0]; - covariateCollapsedKey[1] = key[1]; - for( int iii = 2; iii < key.length; iii++ ) { - covariateCollapsedKey[2] = key[iii]; // The given covariate - final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); - if( covariateRecalDatum != null ) { - deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); - deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + // The global quality shift (over the read group only) + readGroupCollapsedKey[0] = key[0]; + final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); + double globalDeltaQ = 0.0; + if( globalRecalDatum != null ) { + final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); + final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; } + + // The shift in quality between reported and empirical + qualityScoreCollapsedKey[0] = key[0]; + qualityScoreCollapsedKey[1] = key[1]; + final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); + double deltaQReported = 0.0; + if( qReportedRecalDatum != null ) { + final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; + } + + // The shift in quality due to each covariate by itself in turn + double deltaQCovariates = 0.0; + double deltaQCovariateEmpirical; + covariateCollapsedKey[0] = key[0]; + covariateCollapsedKey[1] = key[1]; + for( int iii = 2; iii < key.length; iii++ ) { + covariateCollapsedKey[2] = key[iii]; // The given covariate + final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); + if( covariateRecalDatum != null ) { + deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); + deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + } + } + + final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; + return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); + + // Verbose printouts used to validate with old recalibrator + //if(key.contains(null)) { + // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", + // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); + //} + //else { + // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", + // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); + //} + + //return newQualityByte; + } - - final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); - - // Verbose printouts used to validate with old recalibrator - //if(key.contains(null)) { - // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", - // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); - //} - //else { - // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", - // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); - //} - - //return newQualityByte; - - } -*/ + */ } From 059cdcb1bee9c8fa8aac4bff79706d514082097a Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Mon, 26 Sep 2011 14:58:19 -0400 Subject: [PATCH 6/8] Changing packaging system path for GATK-only Tribble codecs. --- public/packages/GATKEngine.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/packages/GATKEngine.xml b/public/packages/GATKEngine.xml index 4f635f7fb..4364988e7 100644 --- a/public/packages/GATKEngine.xml +++ b/public/packages/GATKEngine.xml @@ -29,7 +29,7 @@ - + From 648b959361dc15247a1cb9b487f9093f96d2c967 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Tue, 27 Sep 2011 00:50:19 -0400 Subject: [PATCH 7/8] Minor change to log an info message when a signal such as Ctrl-C is caught. --- .../broadinstitute/sting/queue/QCommandLine.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) mode change 100755 => 100644 public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala old mode 100755 new mode 100644 index 297da8cc9..a3e83871e --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -37,7 +37,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException /** * Entry point of Queue. Compiles and runs QScripts passed in to the command line. */ -object QCommandLine { +object QCommandLine extends Logging { /** * Main. * @param argv Arguments. @@ -45,22 +45,23 @@ object QCommandLine { def main(argv: Array[String]) { val qCommandLine = new QCommandLine - Runtime.getRuntime.addShutdownHook(new Thread { - /** Cleanup as the JVM shuts down. */ + val shutdownHook = new Thread { override def run() { + logger.info("Shutting down jobs. Please wait...") ProcessController.shutdown() qCommandLine.shutdown() } - }) + } + + Runtime.getRuntime.addShutdownHook(shutdownHook) try { CommandLineProgram.start(qCommandLine, argv); + Runtime.getRuntime.removeShutdownHook(shutdownHook) if (CommandLineProgram.result != 0) System.exit(CommandLineProgram.result); } catch { case e: Exception => CommandLineProgram.exitSystemWithError(e) - } finally { - } } } From e99ff3caae467d562d3b1e00be022d34c410a189 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 27 Sep 2011 10:08:40 -0400 Subject: [PATCH 8/8] Removed lots of old, and not to be used, HMM options -- resulted in massive code cleanup -- GdA will integrate his new banded algorithm here -- Removed: DO_CONTEXT_DEPENDENT_PENALTIES, GET_GAP_PENALTIES_FROM_DATA, INDEL_RECAL_FILE, dovit, GSA_PRODUCTION_ONLY --- ...elGenotypeLikelihoodsCalculationModel.java | 43 +- .../genotyper/UnifiedArgumentCollection.java | 31 +- .../indels/PairHMMIndelErrorModel.java | 702 ++---------------- 3 files changed, 64 insertions(+), 712 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 11cde5ffe..6d917325e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -71,9 +71,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // gdebug removeme // todo -cleanup - private HaplotypeIndelErrorModel model; - private boolean useOldWrongHorribleHackedUpLikelihoodModel = false; -// private GenomeLoc lastSiteVisited; private ArrayList alleleList; @@ -84,26 +81,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); - if (UAC.GSA_PRODUCTION_ONLY == false) { - pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE); - useOldWrongHorribleHackedUpLikelihoodModel = false; - } - else { - useOldWrongHorribleHackedUpLikelihoodModel = true; - double INSERTION_START_PROBABILITY = 1e-3; - - double INSERTION_END_PROBABILITY = 0.5; - - double ALPHA_DELETION_PROBABILITY = 1e-3; - - - model = new HaplotypeIndelErrorModel(3, INSERTION_START_PROBABILITY, - INSERTION_END_PROBABILITY,ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO); - } - - pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE); + pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,UAC.OUTPUT_DEBUG_INDEL_INFO); alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; @@ -383,14 +361,11 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } } - int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); - int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1; - int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1; - if (useOldWrongHorribleHackedUpLikelihoodModel) { - numPrefBases = 20; - hsize=80; - } + final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); + final int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1; + final int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1; + if (DEBUG) System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength, (int)ref.getWindow().size(), loc.getStart(), numPrefBases); @@ -413,13 +388,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood pileup = context.getBasePileup(); if (pileup != null ) { - double[] genotypeLikelihoods; - - if (useOldWrongHorribleHackedUpLikelihoodModel) - genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap); - else - genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); - + final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), alleleList, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 3c8fd4451..f5a107ee7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -143,31 +143,21 @@ public class UnifiedArgumentCollection { @Hidden @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) public int INDEL_HAPLOTYPE_SIZE = 80; - @Hidden - @Argument(fullName = "doContextDependentGapPenalties", shortName = "doCDP", doc = "Vary gap penalties by context", required = false) - public boolean DO_CONTEXT_DEPENDENT_PENALTIES = true; + //gdebug+ // experimental arguments, NOT TO BE USED BY ANYONE WHOSE INITIALS AREN'T GDA!!! - @Hidden - @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false) - public boolean GET_GAP_PENALTIES_FROM_DATA = false; - - @Hidden - @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE") - public File INDEL_RECAL_FILE = new File("indel.recal_data.csv"); +// @Hidden +// @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false) +// public boolean GET_GAP_PENALTIES_FROM_DATA = false; +// +// @Hidden +// @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE") +// public File INDEL_RECAL_FILE = new File("indel.recal_data.csv"); @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; - @Hidden - @Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false) - public boolean dovit = false; - - @Hidden - @Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false) - public boolean GSA_PRODUCTION_ONLY = false; - @Hidden @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) public boolean IGNORE_SNP_ALLELES = false; @@ -204,15 +194,10 @@ public class UnifiedArgumentCollection { uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; - uac.DO_CONTEXT_DEPENDENT_PENALTIES = DO_CONTEXT_DEPENDENT_PENALTIES; uac.alleles = alleles; - uac.GET_GAP_PENALTIES_FROM_DATA = GET_GAP_PENALTIES_FROM_DATA; - uac.INDEL_RECAL_FILE = INDEL_RECAL_FILE; // todo- arguments to remove uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT; - uac.dovit = dovit; - uac.GSA_PRODUCTION_ONLY = GSA_PRODUCTION_ONLY; uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; return uac; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 6e4db9303..68cbd4fb7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -51,36 +51,8 @@ import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Reca public class PairHMMIndelErrorModel { - - public static final int BASE_QUAL_THRESHOLD = 20; - - private static final int MATCH_OFFSET = 0; - private static final int X_OFFSET = 1; - private static final int Y_OFFSET = 2; - - private static final int DIAG = 0; - private static final int UP = 1; - private static final int LEFT = 2; - - private static final int DIAG_GOTO_M = 0; - private static final int DIAG_GOTO_X = 1; - private static final int DIAG_GOTO_Y = 2; - - private static final int UP_GOTO_M = 4; - private static final int UP_GOTO_X = 5; - private static final int UP_GOTO_Y = 6; - - private static final int LEFT_GOTO_M = 8; - private static final int LEFT_GOTO_X = 9; - private static final int LEFT_GOTO_Y = 10; - - private static final int[] ACTIONS_M = {DIAG_GOTO_M, DIAG_GOTO_X, DIAG_GOTO_Y}; - private static final int[] ACTIONS_X = {UP_GOTO_M, UP_GOTO_X, UP_GOTO_Y}; - private static final int[] ACTIONS_Y = {LEFT_GOTO_M, LEFT_GOTO_X, LEFT_GOTO_Y}; - - private final double logGapOpenProbability; private final double logGapContinuationProbability; @@ -101,36 +73,13 @@ public class PairHMMIndelErrorModel { private static final double MIN_GAP_CONT_PENALTY = 10.0; private static final double GAP_PENALTY_HRUN_STEP = 1.0; // each increase in hrun decreases gap penalty by this. - - private boolean doViterbi = false; - - private final boolean useAffineGapModel = true; - private boolean doContextDependentPenalties = false; - private final double[] GAP_OPEN_PROB_TABLE; private final double[] GAP_CONT_PROB_TABLE; - private boolean getGapPenaltiesFromFile = false; - - private int SMOOTHING = 1; - private int MAX_QUALITY_SCORE = 50; - private int PRESERVE_QSCORES_LESS_THAN = 5; - ///////////////////////////// // Private Member Variables ///////////////////////////// -//copy+ -/* private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps - private final ArrayList requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation - private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); - private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); - protected static final String EOF_MARKER = "EOF"; - private long numReadsWithMalformedColorSpace = 0; - private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. - */ -//copy- + static { LOG_ONE_HALF= -Math.log10(2.0); END_GAP_COST = LOG_ONE_HALF; @@ -146,141 +95,9 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit,boolean gpf, File RECAL_FILE) { - - this(indelGOP, indelGCP, deb, doCDP, dovit); - this.getGapPenaltiesFromFile = gpf; - - // read data from recal file - // gdebug - start copy from TableRecalibrationWalker -/* if (gpf) { - boolean sawEOF = false; - boolean REQUIRE_EOF = false; - - int lineNumber = 0; - boolean foundAllCovariates = false; - // Get a list of all available covariates - final List> classes = new PluginManager(Covariate.class).getPlugins(); - - try { - for ( String line : new XReadLines(RECAL_FILE) ) { - lineNumber++; - if ( EOF_MARKER.equals(line) ) { - sawEOF = true; - } else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) { - ; // Skip over the comment lines, (which start with '#') - } - // Read in the covariates that were used from the input file - else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data - if( foundAllCovariates ) { - throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); - } else { // Found the covariate list in input file, loop through all of them and instantiate them - String[] vals = line.split(","); - for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical - boolean foundClass = false; - for( Class covClass : classes ) { - if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { - foundClass = true; - try { - Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - - } - } - - if( !foundClass ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); - } - } - } - - } else { // Found a line of data - if( !foundAllCovariates ) { - foundAllCovariates = true; - - // At this point all the covariates should have been found and initialized - if( requestedCovariates.size() < 2 ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE ); - } - - final boolean createCollapsedTables = true; - - // Initialize any covariate member variables using the shared argument collection - for( Covariate cov : requestedCovariates ) { - cov.initialize( RAC ); - } - // Initialize the data hashMaps - dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); - - } - addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap - } - } - - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); - } catch ( NumberFormatException e ) { - throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); - } - - if ( !sawEOF ) { - final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; - if ( REQUIRE_EOF ) - throw new UserException.MalformedFile(RECAL_FILE, errorMessage); - } - - if( dataManager == null ) { - throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); - } - - // Create the tables of empirical quality scores that will be used in the sequential calculation - dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE ); - } - // debug end copy - */ - } - /** - * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) - */ - /* - private void addCSVData(final File file, final String line) { - final String[] vals = line.split(","); - - // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical - throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + - " --Perhaps the read group string contains a comma and isn't being parsed correctly."); - } - - final Object[] key = new Object[requestedCovariates.size()]; - Covariate cov; - int iii; - for( iii = 0; iii < requestedCovariates.size(); iii++ ) { - cov = requestedCovariates.get( iii ); - key[iii] = cov.getValue( vals[iii] ); - } - - // Create a new datum using the number of observations, number of mismatches, and reported quality score - final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); - // Add that datum to all the collapsed tables which will be used in the sequential calculation - dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN ); - } - - */ - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit) { - this(indelGOP, indelGCP, deb, doCDP); - this.doViterbi = dovit; - } - - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) { - - + public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb) { this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob this.logGapContinuationProbability = -indelGCP/10.0; // QUAL to log prob - this.doContextDependentPenalties = doCDP; this.DEBUG = deb; @@ -314,132 +131,6 @@ public class PairHMMIndelErrorModel { } - private double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) { - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = haplotypeBases.length+1; - - // initialize path metric and traceback memories for likelihood computation - double[][] pathMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - int[][] bestMetricArray = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - pathMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY; - - for (int i=1; i < X_METRIC_LENGTH; i++) { - pathMetricArray[i][0] = 0; - bestMetricArray[i][0] = UP; - } - - for (int j=1; j < Y_METRIC_LENGTH; j++) { - pathMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability; - bestMetricArray[0][j] = LEFT; - } - - for (int indI=1; indI < X_METRIC_LENGTH; indI++) { - for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) { - - byte x = readBases[indI-1]; - byte y = haplotypeBases[indJ-1]; - byte qual = readQuals[indI-1]; - - double bestMetric = 0.0; - int bestMetricIdx = 0; - - // compute metric for match/mismatch - // workaround for reads whose bases quality = 0, - if (qual < 1) - qual = 1; - - if (qual > MAX_CACHED_QUAL) - qual = MAX_CACHED_QUAL; - - double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; - double[] metrics = new double[3]; - - metrics[DIAG] = pathMetricArray[indI-1][indJ-1] + pBaseRead; - metrics[UP] = pathMetricArray[indI-1][indJ] + logGapOpenProbability;//(end?0.0:logGapOpenProbability); - metrics[LEFT] = pathMetricArray[indI][indJ-1] + logGapOpenProbability;//(end?0.0:logGapOpenProbability); - - if (doViterbi) { - bestMetricIdx = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestMetricIdx]; - } - else - bestMetric = MathUtils.softMax(metrics); - - pathMetricArray[indI][indJ] = bestMetric; - bestMetricArray[indI][indJ] = bestMetricIdx; - - } - } - - - double bestMetric=0.0; - int bestMetricIdx=0,bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1; - - for (int i=0; i < X_METRIC_LENGTH; i ++ ) { - int j= Y_METRIC_LENGTH-1; - - if (pathMetricArray[i][j] > bestMetric) { - bestMetric = pathMetricArray[i][j]; - bestI = i; - bestJ = j; - } - } - for (int j=0; j < Y_METRIC_LENGTH; j++ ) { - int i= X_METRIC_LENGTH-1; - if (pathMetricArray[i][j] >= bestMetric) { - bestMetric = pathMetricArray[i][j]; - bestI = i; - bestJ = j; - } - } - - if (DEBUG && doViterbi) { - - String haplotypeString = new String (haplotypeBases); - String readString = new String(readBases); - - - int i = bestI; - int j = bestJ; - - - System.out.println("Simple NW"); - - while (i >0 || j >0) { - bestMetricIdx = bestMetricArray[i][j]; - System.out.print(bestMetricIdx); - if (bestMetricIdx == UP) { - // insert gap in Y - haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j); - i--; - } else if (bestMetricIdx == LEFT) { - readString = readString.substring(0,i)+"-"+readString.substring(i); - j--; - } - else { - i--; j--; - } - } - - - - - System.out.println("\nAlignment: "); - System.out.println("R:"+readString); - System.out.println("H:"+haplotypeString); - System.out.println(); - - - } - if (DEBUG) - System.out.format("Likelihood: %5.4f\n", bestMetric); - - return bestMetric; - - - } - static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG @@ -480,14 +171,10 @@ public class PairHMMIndelErrorModel { final int Y_METRIC_LENGTH = haplotypeBases.length+1; // initialize path metric and traceback memories for likelihood computation - double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - int[][] bestActionArrayM = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - int[][] bestActionArrayX = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - int[][] bestActionArrayY = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - double c,d; matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY; for (int i=1; i < X_METRIC_LENGTH; i++) { @@ -495,8 +182,6 @@ public class PairHMMIndelErrorModel { matchMetricArray[i][0] = Double.NEGATIVE_INFINITY; YMetricArray[i][0] = Double.NEGATIVE_INFINITY; XMetricArray[i][0] = END_GAP_COST*(i);//logGapOpenProbability + (i-1)*logGapContinuationProbability; - - bestActionArrayX[i][0] = bestActionArrayY[i][0] = bestActionArrayM[i][0] = UP_GOTO_X; } for (int j=1; j < Y_METRIC_LENGTH; j++) { @@ -504,188 +189,46 @@ public class PairHMMIndelErrorModel { matchMetricArray[0][j] = Double.NEGATIVE_INFINITY; XMetricArray[0][j] = Double.NEGATIVE_INFINITY; YMetricArray[0][j] = END_GAP_COST*(j);//logGapOpenProbability + (j-1) * logGapContinuationProbability; - - bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y; } for (int indI=1; indI < X_METRIC_LENGTH; indI++) { - int im1 = indI-1; + final int im1 = indI-1; for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) { - int jm1 = indJ-1; - byte x = readBases[im1]; - byte y = haplotypeBases[jm1]; - byte qual = readQuals[im1]; - - double bestMetric = 0.0; - int bestMetricIdx = 0; - - // compute metric for match/mismatch - // workaround for reads whose bases quality = 0, - if (qual < 1) - qual = 1; - - if (qual > MAX_CACHED_QUAL) - qual = MAX_CACHED_QUAL; - - double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; - - - double[] metrics = new double[3]; - - - if (doViterbi) { - // update match array - metrics[MATCH_OFFSET] = matchMetricArray[im1][jm1] + pBaseRead; - metrics[X_OFFSET] = XMetricArray[im1][jm1] + pBaseRead; - metrics[Y_OFFSET] = YMetricArray[im1][jm1] + pBaseRead; - - bestMetricIdx = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestMetricIdx]; - } - else - bestMetric = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead, - YMetricArray[im1][jm1] + pBaseRead); + final int jm1 = indJ-1; + final byte x = readBases[im1]; + final byte y = haplotypeBases[jm1]; + final byte qual = readQuals[im1] < 1 ? 1 : (readQuals[im1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[im1]); + final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; + double bestMetric = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, + XMetricArray[im1][jm1] + pBaseRead, + YMetricArray[im1][jm1] + pBaseRead); matchMetricArray[indI][indJ] = bestMetric; - bestActionArrayM[indI][indJ] = ACTIONS_M[bestMetricIdx]; // update X array // State X(i,j): X(1:i) aligned to a gap in Y(1:j). // When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps - //c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]); - //d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]); - if (getGapPenaltiesFromFile) { - c = currentGOP[im1]; - d = logGapContinuationProbability; - - } else { - c = currentGOP[jm1]; - d = currentGCP[jm1]; - } - if (indJ == Y_METRIC_LENGTH-1) - c = d = END_GAP_COST; - - if (doViterbi) { - metrics[MATCH_OFFSET] = matchMetricArray[im1][indJ] + c; - metrics[X_OFFSET] = XMetricArray[im1][indJ] + d; - metrics[Y_OFFSET] = Double.NEGATIVE_INFINITY; //YMetricArray[indI-1][indJ] + logGapOpenProbability; - - bestMetricIdx = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestMetricIdx]; - } - else - bestMetric = MathUtils.softMax(matchMetricArray[im1][indJ] + c, XMetricArray[im1][indJ] + d); - + final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; + final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; + bestMetric = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); XMetricArray[indI][indJ] = bestMetric; - bestActionArrayX[indI][indJ] = ACTIONS_X[bestMetricIdx]; // update Y array //c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]); //d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]); - if (getGapPenaltiesFromFile) { - c = currentGOP[im1]; - d = logGapContinuationProbability; - } - else { - c = currentGOP[jm1]; - d = currentGCP[jm1]; - } - if (indI == X_METRIC_LENGTH-1) - c = d = END_GAP_COST; - - - - if (doViterbi) { - metrics[MATCH_OFFSET] = matchMetricArray[indI][jm1] + c; - metrics[X_OFFSET] = Double.NEGATIVE_INFINITY; //XMetricArray[indI][indJ-1] + logGapOpenProbability; - metrics[Y_OFFSET] = YMetricArray[indI][jm1] + d; - - bestMetricIdx = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestMetricIdx]; - } - else - bestMetric = MathUtils.softMax(matchMetricArray[indI][jm1] + c, YMetricArray[indI][jm1] + d); - + final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; + final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; + bestMetric = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); YMetricArray[indI][indJ] = bestMetric; - bestActionArrayY[indI][indJ] = ACTIONS_Y[bestMetricIdx]; - - - } } - double bestMetric; - double metrics[] = new double[3]; - int bestTable=0, bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1; - metrics[MATCH_OFFSET] = matchMetricArray[bestI][bestJ]; - metrics[X_OFFSET] = XMetricArray[bestI][bestJ]; - metrics[Y_OFFSET] = YMetricArray[bestI][bestJ]; - if (doViterbi) { - bestTable = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestTable]; - } - else - bestMetric = MathUtils.softMax(metrics); + final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1; + final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ], + XMetricArray[bestI][bestJ], + YMetricArray[bestI][bestJ]); - // Do traceback (needed only for debugging!) - if (DEBUG && doViterbi) { - - int bestAction; - int i = bestI; - int j = bestJ; - - - System.out.println("Affine gap NW"); - - - String haplotypeString = new String (haplotypeBases); - String readString = new String(readBases); - - - while (i >0 || j >0) { - if (bestTable == X_OFFSET) { - // insert gap in Y - haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j); - bestAction = bestActionArrayX[i][j]; - } - else if (bestTable == Y_OFFSET) { - readString = readString.substring(0,i)+"-"+readString.substring(i); - bestAction = bestActionArrayY[i][j]; - - } - else { - bestAction = bestActionArrayM[i][j]; - } - System.out.print(bestAction); - - - // bestAction contains action to take at next step - // encoding of bestAction: upper 2 bits = direction, lower 2 bits = next table - - // bestTable and nextDirection for next step - bestTable = bestAction & 0x3; - int nextDirection = bestAction >> 2; - if (nextDirection == UP) { - i--; - } else if (nextDirection == LEFT) { - j--; - } else { // if (nextDirection == DIAG) - i--; j--; - } - - } - - - - - System.out.println("\nAlignment: "); - System.out.println("R:"+readString); - System.out.println("H:"+haplotypeString); - System.out.println(); - - - } if (DEBUG) System.out.format("Likelihood: %5.4f\n", bestMetric); @@ -724,33 +267,31 @@ public class PairHMMIndelErrorModel { System.out.println(new String(ref.getBases())); } - if (doContextDependentPenalties && !getGapPenaltiesFromFile) { - // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes. - - - for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - byte[] haplotypeBases = haplotype.getBasesAsBytes(); - double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length]; - double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length]; - - // get homopolymer length profile for current haplotype - int[] hrunProfile = new int[haplotypeBases.length]; - getContextHomopolymerLength(haplotypeBases,hrunProfile); - if (DEBUG) { - System.out.println("Haplotype bases:"); - System.out.println(new String(haplotypeBases)); - for (int i=0; i < hrunProfile.length; i++) - System.out.format("%d",hrunProfile[i]); - System.out.println(); - } - fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - - gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities); - gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities); + // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes. + // todo -- refactor into separate function + for (Allele a: haplotypeMap.keySet()) { + Haplotype haplotype = haplotypeMap.get(a); + byte[] haplotypeBases = haplotype.getBasesAsBytes(); + double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length]; + double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length]; + // get homopolymer length profile for current haplotype + int[] hrunProfile = new int[haplotypeBases.length]; + getContextHomopolymerLength(haplotypeBases,hrunProfile); + if (DEBUG) { + System.out.println("Haplotype bases:"); + System.out.println(new String(haplotypeBases)); + for (int i=0; i < hrunProfile.length; i++) + System.out.format("%d",hrunProfile[i]); + System.out.println(); } + fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); + + gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities); + gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities); + } + for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations final boolean isReduced = ReadUtils.isReducedRead(p.getRead()); @@ -774,57 +315,12 @@ public class PairHMMIndelErrorModel { read = ReadUtils.reducedReadWithReducedQuals(read); } - if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) { + if(ReadUtils.is454Read(read)) { continue; } double[] recalQuals = null; - /* - if (getGapPenaltiesFromFile) { - RecalDataManager.parseSAMRecord( read, RAC ); - - - recalQuals = new double[read.getReadLength()]; - - //compute all covariate values for this read - final Comparable[][] covariateValues_offset_x_covar = - RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates); - // For each base in the read - for( int offset = 0; offset < read.getReadLength(); offset++ ) { - - final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; - - Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); - if(qualityScore == null) - { - qualityScore = performSequentialQualityCalculation( fullCovariateKey ); - qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); - } - - recalQuals[offset] = -((double)qualityScore)/10.0; - } - - // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) - // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent - if (DEBUG) { - System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(), - read.getAlignmentStart(), - read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), - read.getCigarString()); - - byte[] bases = read.getReadBases(); - for (int k = 0; k < recalQuals.length; k++) { - System.out.format("%c",bases[k]); - } - System.out.println(); - - for (int k = 0; k < recalQuals.length; k++) { - System.out.format("%.0f ",recalQuals[k]); - } - System.out.println(); - } - } */ // get bases of candidate haplotypes that overlap with reads final int trailingBases = 3; @@ -945,11 +441,6 @@ public class PairHMMIndelErrorModel { unclippedReadBases.length-numEndClippedBases); double[] recalCDP = null; - if (getGapPenaltiesFromFile) { - recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases, - unclippedReadBases.length-numEndClippedBases); - - } if (DEBUG) { System.out.println("Read bases:"); @@ -979,27 +470,9 @@ public class PairHMMIndelErrorModel { System.out.println(new String(haplotypeBases)); } - double readLikelihood = 0.0; - if (useAffineGapModel) { - - double[] currentContextGOP = null; - double[] currentContextGCP = null; - - if (doContextDependentPenalties) { - - if (getGapPenaltiesFromFile) { - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null); - - } else { - currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); - currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP); - } - } - - } - else - readLikelihood = computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals); + final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); + final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); + final double readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP); readEl.put(a,readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; @@ -1057,79 +530,4 @@ public class PairHMMIndelErrorModel { // renormalize so that max element is zero. return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); } - - /** - * Implements a serial recalibration of the reads using the combinational table. - * First, we perform a positional recalibration, and then a subsequent dinuc correction. - * - * Given the full recalibration table, we perform the following preprocessing steps: - * - * - calculate the global quality score shift across all data [DeltaQ] - * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift - * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual - * - The final shift equation is: - * - * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) - * @param key The list of Comparables that were calculated from the covariates - * @return A recalibrated quality score as a byte - */ - /* - private byte performSequentialQualityCalculation( final Object... key ) { - - final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); - final Object[] readGroupCollapsedKey = new Object[1]; - final Object[] qualityScoreCollapsedKey = new Object[2]; - final Object[] covariateCollapsedKey = new Object[3]; - - // The global quality shift (over the read group only) - readGroupCollapsedKey[0] = key[0]; - final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); - double globalDeltaQ = 0.0; - if( globalRecalDatum != null ) { - final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); - final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); - globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; - } - - // The shift in quality between reported and empirical - qualityScoreCollapsedKey[0] = key[0]; - qualityScoreCollapsedKey[1] = key[1]; - final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); - double deltaQReported = 0.0; - if( qReportedRecalDatum != null ) { - final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); - deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; - } - - // The shift in quality due to each covariate by itself in turn - double deltaQCovariates = 0.0; - double deltaQCovariateEmpirical; - covariateCollapsedKey[0] = key[0]; - covariateCollapsedKey[1] = key[1]; - for( int iii = 2; iii < key.length; iii++ ) { - covariateCollapsedKey[2] = key[iii]; // The given covariate - final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); - if( covariateRecalDatum != null ) { - deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); - deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); - } - } - - final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); - - // Verbose printouts used to validate with old recalibrator - //if(key.contains(null)) { - // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", - // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); - //} - //else { - // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", - // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); - //} - - //return newQualityByte; - - } - */ }