diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java index 58117a168..f2034944d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java @@ -64,13 +64,16 @@ import java.util.*; *

Odds Ratios in the 2x2 contingency table below are R = (X[0][0] * X[1][1]) / (X[0][1] * X[1][0]) and its inverse * + strand - strand * Ref X[0][0] X[0][1] - * Alt X[1][0] X[1][0] + * Alt X[1][0] X[1][1] * The sum R + 1/R is used to detect a difference in strand bias for ref and for alt (the sum makes it symmetric): * A high value is indicative of large difference where one entry is very small compared to the others. + * + * A scale factor of refRatio/altRatio where refRatio = (max(X[0][0], X[0][1])/min(X[0][0], X[0][1])) and + * altRatio = (max(X[1][0], X[1][1])/min(X[1][0], X[1][1])) ensures that the annotation value is large only *

*/ public class StrandOddsRatio extends StrandBiasTest implements ActiveRegionBasedAnnotation { - private final static double AUGMENTATION_CONSTANT = 0.1; + private final static double AUGMENTATION_CONSTANT = 1.0; private static final int MIN_COUNT = 0; private static final String SOR = "SOR"; @@ -88,20 +91,20 @@ public class StrandOddsRatio extends StrandBiasTest implements ActiveRegionBased if ( vc.hasGenotypes() ) { final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes(), MIN_COUNT ); if ( tableFromPerSampleAnnotations != null ) { - final double ratio = symmetricOddsRatio(tableFromPerSampleAnnotations); + final double ratio = calculateSOR(tableFromPerSampleAnnotations); return annotationForOneTable(ratio); } } if (vc.isSNP() && stratifiedContexts != null) { final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1, MIN_COUNT); - final double ratio = symmetricOddsRatio(tableNoFiltering); + final double ratio = calculateSOR(tableNoFiltering); return annotationForOneTable(ratio); } else if (stratifiedPerReadAlleleLikelihoodMap != null) { // either SNP with no alignment context, or indels: per-read likelihood map needed final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc, MIN_COUNT); - final double ratio = symmetricOddsRatio(table); + final double ratio = calculateSOR(table); return annotationForOneTable(ratio); } else @@ -111,13 +114,16 @@ public class StrandOddsRatio extends StrandBiasTest implements ActiveRegionBased } /** - * Computes the symmetric odds ratio of a table after augmentation. + * Computes the SOR value of a table after augmentation. Based on the symmetric odds ratio but modified to take on + * low values when the reference +/- read count ratio is skewed but the alt count ratio is not. Natural log is taken + * to keep values within roughly the same range as other annotations. + * * Augmentation avoids quotient by zero. * * @param originalTable The table before augmentation - * @return the symmetric odds ratio + * @return the SOR annotation value */ - final protected double symmetricOddsRatio(final int[][] originalTable) { + final protected double calculateSOR(final int[][] originalTable) { final double[][] augmentedTable = augmentContingencyTable(originalTable); double ratio = 0; @@ -125,7 +131,12 @@ public class StrandOddsRatio extends StrandBiasTest implements ActiveRegionBased ratio += (augmentedTable[0][0] / augmentedTable[0][1]) * (augmentedTable[1][1] / augmentedTable[1][0]); ratio += (augmentedTable[0][1] / augmentedTable[0][0]) * (augmentedTable[1][0] / augmentedTable[1][1]); - return ratio; + final double refRatio = (Math.min(augmentedTable[0][0], augmentedTable[0][1])/Math.max(augmentedTable[0][0], augmentedTable[0][1])); + final double altRatio = (Math.min(augmentedTable[1][0], augmentedTable[1][1])/Math.max(augmentedTable[1][0], augmentedTable[1][1])); + + ratio = ratio*refRatio/altRatio; + + return Math.log(ratio); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java index d0fb51318..f512277b0 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java @@ -340,12 +340,15 @@ public class VariantDataManager { private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter ) { double value; + final double LOG_OF_TWO = 0.6931472; + try { value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); if( Double.isInfinite(value) ) { value = Double.NaN; } if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln } catch( Exception e ) { value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index ae683993d..ebcb82444 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -134,7 +134,7 @@ public class GenotypeGVCFs extends RodWalker annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts", "GenotypeSummaries"})); + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts", "GenotypeSummaries", "StrandOddsRatio"})); /** * The rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. Note that dbSNP is not used in any way for the calculations themselves. diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatioUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatioUnitTest.java index aa52c1577..787ffda04 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatioUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatioUnitTest.java @@ -62,27 +62,28 @@ public class StrandOddsRatioUnitTest { @DataProvider(name = "UsingSOR") public Object[][] makeUsingSORData() { + final double LOG_OF_TWO = 0.6931472; List tests = new ArrayList<>(); - tests.add(new Object[]{0, 0, 0, 0, 2.0}); - tests.add(new Object[]{100000, 100000, 100000, 100000, 2.0} ); - tests.add(new Object[]{Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE, 2.0} ); + tests.add(new Object[]{0, 0, 0, 0, LOG_OF_TWO}); + tests.add(new Object[]{100000, 100000, 100000, 100000, LOG_OF_TWO} ); + tests.add(new Object[]{Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE, LOG_OF_TWO} ); - tests.add(new Object[]{0, 0, 100000, 100000, 2.0}); - tests.add(new Object[]{0, 0, Integer.MAX_VALUE, Integer.MAX_VALUE, 2.0}); + tests.add(new Object[]{0, 0, 100000, 100000, LOG_OF_TWO}); + tests.add(new Object[]{0, 0, Integer.MAX_VALUE, Integer.MAX_VALUE, LOG_OF_TWO}); - tests.add(new Object[]{100000,100000,100000,0, 1000001.000001}); - tests.add(new Object[]{100,100,100,0, 1001.000999}); - tests.add(new Object[]{Integer.MAX_VALUE,Integer.MAX_VALUE,Integer.MAX_VALUE,0, 21474836471.0}); + tests.add(new Object[]{100000,100000,100000,0, 23.02587}); + tests.add(new Object[]{100,100,100,0, 9.230339}); + tests.add(new Object[]{Integer.MAX_VALUE,Integer.MAX_VALUE,Integer.MAX_VALUE,0, 42.97513}); - tests.add(new Object[]{13736,9047,41,1433, 52.95947}); - tests.add(new Object[]{66, 14, 64, 4, 3.63482}); - tests.add(new Object[]{351169, 306836, 153739, 2379, 56.48043}); - tests.add(new Object[]{116449, 131216, 289, 16957, 52.07302}); - tests.add(new Object[]{137, 159, 9, 23, 2.64460}); - tests.add(new Object[]{129, 90, 21, 20, 2.09757}); - tests.add(new Object[]{14054, 9160, 16, 7827, 745.89657}); - tests.add(new Object[]{32803, 9184, 32117, 3283, 3.10399}); - tests.add(new Object[]{2068, 6796, 1133, 0, 37235.43791}); + tests.add(new Object[]{13736,9047,41,1433, 7.061479}); + tests.add(new Object[]{66, 14, 64, 4, 2.248203}); + tests.add(new Object[]{351169, 306836, 153739, 2379, 8.066731}); + tests.add(new Object[]{116449, 131216, 289, 16957, 7.898818}); + tests.add(new Object[]{137, 159, 9, 23, 1.664854}); + tests.add(new Object[]{129, 90, 21, 20, 0.4303384}); + tests.add(new Object[]{14054, 9160, 16, 7827, 12.2645}); + tests.add(new Object[]{32803, 9184, 32117, 3283, 2.139932}); + tests.add(new Object[]{2068, 6796, 1133, 0, 14.06701}); return tests.toArray(new Object[][]{}); } @@ -94,7 +95,7 @@ public class StrandOddsRatioUnitTest { contingencyTable[0][1] = refneg; contingencyTable[1][0] = altpos; contingencyTable[1][1] = altneg; - final double ratio = new StrandOddsRatio().symmetricOddsRatio(contingencyTable); + final double ratio = new StrandOddsRatio().calculateSOR(contingencyTable); Assert.assertEquals(ratio, expectedOddsRatio, DELTA_PRECISION, "Pass"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 4685f21e6..07392ffdd 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -69,7 +69,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("e61d4cbfd21f102c2e6c0bd56cb68312")); + Arrays.asList("5487ad609548c30e79a431115dc772ba")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -81,7 +81,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Arrays.asList("4200fce6f253ef51dc4cd903794270b9")); + Arrays.asList("f1dd2b9ab422c0b806392464e466ed91")); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); } @@ -94,7 +94,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("1c553decc3f9fe1d752137a7f34d7e4e")); + Arrays.asList("f7650a8a861dec3138848bb972929002")); executeTest("combineSingleSamplePipelineGVCFHierarchical", spec); } @@ -106,7 +106,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Arrays.asList("e6952f19e692091332809c59069a1eab")); + Arrays.asList("df5a6a574c48c243fad5b44f34343fe3")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } @@ -116,7 +116,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + " -V " + privateTestDir + "gvcfExample1.vcf", 1, - Arrays.asList("b18ddcf828227c07f1b38d99c91f6e09")); + Arrays.asList("b4bb1d21c7a3d793a98b0857c7c5d52b")); executeTest("testJustOneSample", spec); } @@ -127,7 +127,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V " + privateTestDir + "gvcfExample1.vcf" + " -V " + privateTestDir + "gvcfExample2.vcf", 1, - Arrays.asList("a453f2e15b20f8cf0edadb998cb883d5")); + Arrays.asList("81c4cc8a6b72c24598ee899df838f1e8")); executeTest("testSamplesWithDifferentLs", spec); } @@ -138,12 +138,12 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference + " --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf", 1, - Arrays.asList("1dda31e020086a4c5643571fbccd40ba")); + Arrays.asList("9626a7108d616d63a2a8069b306c1fe0")); WalkerTestSpec spec2 = new WalkerTestSpec( "-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference + " --variant " + privateTestDir + "combined_genotype_gvcf_exception.nocall.vcf", 1, - Arrays.asList("1dda31e020086a4c5643571fbccd40ba")); + Arrays.asList("9626a7108d616d63a2a8069b306c1fe0")); executeTest("testNoPLsException.1", spec1); executeTest("testNoPLsException.2", spec2); } @@ -153,7 +153,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-nda"), 1, - Arrays.asList("715ef2c2f954b91aad8606188ed1b6c5")); + Arrays.asList("27cddeb2287827a58d79fc7d3ddad080")); executeTest("testNDA", spec); } @@ -162,7 +162,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-maxAltAlleles 1"), 1, - Arrays.asList("8973ec01a29e53cd4181f9d8662d9284")); + Arrays.asList("8fa78191298b4d8c9b40fba2c705ad56")); executeTest("testMaxAltAlleles", spec); } @@ -171,7 +171,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"), 1, - Arrays.asList("502ed8ce40c461158520076dd02cc29c")); + Arrays.asList("c841b408e41596529009bf7f07de9b3f")); executeTest("testStandardConf", spec); } } \ No newline at end of file