From cf3e826a6978fcd50dfd3410a22ebf304e511e2f Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Fri, 29 Jul 2011 15:53:56 -0400 Subject: [PATCH 1/5] 1) RFCombine switched to the new ROD system 2) TreeReduce added to useful RODWalkers, but doesn't help very much due to scaling problems 3) RFA refactored, and a genotype-free calculation model added to calculate skew in a genotype-free way (still needs generalization to any ploidy) 4) Added walker to genotype intron loss events, calls into the UG engine to do so. This is very much a first-pass walker. 5) Documentation added for ValidationAmplicons --- .../filters/VariantFiltrationWalker.java | 9 +- .../genotyper/ExactAFCalculationModel.java | 69 ++++++++- .../validation/ValidationAmplicons.java | 72 ++++++++-- .../walkers/variantutils/CombineVariants.java | 6 +- .../broadinstitute/sting/utils/MathUtils.java | 135 +++++++++++++++++- 5 files changed, 279 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index c555e88cd..e34fa772b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -28,6 +28,9 @@ package org.broadinstitute.sting.gatk.walkers.filters; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -47,7 +50,7 @@ import java.util.*; * Filters variant calls using a number of user-selectable, parameterizable criteria. */ @Reference(window=@Window(start=-50,stop=50)) -public class VariantFiltrationWalker extends RodWalker { +public class VariantFiltrationWalker extends RodWalker implements TreeReducible { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -278,6 +281,10 @@ public class VariantFiltrationWalker extends RodWalker { return sum + value; } + public Integer treeReduce(Integer left, Integer right) { + return left + right; + } + /** * Tell the user the number of loci processed and close out the new variants file. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index cd006a3cf..fa4863330 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import sun.reflect.generics.reflectiveObjects.NotImplementedException; @@ -580,7 +581,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // TODO -- remove me for clarity in this code // // ------------------------------------------------------------------------------------- - public int gdaN2GoldStandard(Map GLs, + public static int gdaN2GoldStandard(Map GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { int numSamples = GLs.size(); @@ -658,4 +659,70 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } + // todo -- generalize and merge into gdaN2GoldStandard + public static int rfaN2GoldStandard(Map GLs, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { + int numSamples = GLs.size(); + int numChr = 2*numSamples; + + double[][] logYMatrix = new double[1+numSamples][1+numChr]; + + for (int i=0; i <=numSamples; i++) + for (int j=0; j <=numChr; j++) + logYMatrix[i][j] = Double.NEGATIVE_INFINITY; + + //YMatrix[0][0] = 1.0; + logYMatrix[0][0] = 0.0; + int j=0; + + for ( Map.Entry sample : GLs.entrySet() ) { + j++; + + //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); + double[] genotypeLikelihoods = sample.getValue().getAsVector(); + //double logDenominator = Math.log10(2.0*j*(2.0*j-1)); + double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + + // special treatment for k=0: iteration reduces to: + //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; + logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA]; + + for (int k=1; k <= 2*j; k++ ) { + + //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; + double logNumerator[]; + logNumerator = new double[3]; + if (k < 2*j-1) + logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] + + genotypeLikelihoods[idxAA]; + else + logNumerator[0] = Double.NEGATIVE_INFINITY; + + + if (k < 2*j) + logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + + genotypeLikelihoods[idxAB]; + else + logNumerator[1] = Double.NEGATIVE_INFINITY; + + if (k > 1) + logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] + + genotypeLikelihoods[idxBB]; + else + logNumerator[2] = Double.NEGATIVE_INFINITY; + + double logNum = MathUtils.softMax(logNumerator); + + //YMatrix[j][k] = num/den; + logYMatrix[j][k] = logNum - logDenominator; + } + + } + + for (int k=0; k <= numChr; k++) + log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; + + return numChr; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 7653f511f..82c5fc593 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -31,21 +31,77 @@ import java.util.LinkedList; import java.util.List; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 6/13/11 - * Time: 2:12 PM - * To change this template use File | Settings | File Templates. + * Creates FASTA sequences for use in Seqenom or PCR utilities for site amplification and subsequent validation + * + *

+ * ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe + * sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within + * the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies + * reasons why the site may fail validation (nearby variation, for example). + *

+ * + *

Input

+ *

+ * Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an + * interval list defining the size of the amplicons around the sites to be validated + *

+ * + *

Output

+ *

+ * Output is a FASTA-formatted file with some modifications at probe sites. For instance: + * + * >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414 + * CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC + * >20:792122 Valid 20_792122 + * TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT + * >20:994145 Valid 20_994145 + * TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC + * >20:1074230 SITE_IS_FILTERED=1, 20_1074230 + * ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC + * >20:1084330 DELETION=1, 20_1084330 + * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC + * + * are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be: + * + * Valid // amplicon is valid + * SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant") + * VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak + * MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon + * DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate + * DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak + * START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer + * END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer + * NO_VARIANTS_FOUND, // no variants found within the amplicon region + * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself) + *

+ * + *

Examples

+ * PRE-TAG + * java + * -jar GenomeAnalysisTK.jar + * -T ValidationAmplicons + * -R /humgen/1kg/reference/human_g1k_v37.fasta + * -BTI ProbeIntervals + * -ProbeIntervals:table interval_table.table + * -ValidateAlleles:vcf sites_to_validate.vcf + * -MaskAlleles:vcf mask_sites.vcf + * --virtualPrimerSize 30 + * -o probes.fasta + * PRE-TAG + * + * @author chartl + * @since July 2011 */ @Requires(value={DataSource.REFERENCE}) public class ValidationAmplicons extends RodWalker { - @Input(fullName = "ProbeIntervals", doc="Chris document me", required=true) + @Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+ + "intervals surrounding the probe sites amplicons should be designed for", required=true) RodBinding probeIntervals; - @Input(fullName = "ValidateAlleles", doc="Chris document me", required=true) + @Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true) RodBinding validateAlleles; - @Input(fullName = "MaskAlleles", doc="Chris document me", required=true) + @Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true) RodBinding maskAlleles; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index fb172e1b7..d46387084 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -31,6 +31,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.SampleUtils; @@ -49,7 +51,7 @@ import java.util.*; * priority list (if provided), emits a single record instance at every position represented in the rods. */ @Reference(window=@Window(start=-50,stop=50)) -public class CombineVariants extends RodWalker { +public class CombineVariants extends RodWalker implements TreeReducible{ /** * The VCF files to merge together * @@ -210,6 +212,8 @@ public class CombineVariants extends RodWalker { return 0; } + public Integer treeReduce(Integer left, Integer right) { return left + right; } + public Integer reduce(Integer counter, Integer sum) { return counter + sum; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index e197bb973..8c35119dd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -25,10 +25,13 @@ package org.broadinstitute.sting.utils; +import cern.jet.random.ChiSquare; +import cern.jet.math.Arithmetic; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.apache.lucene.messages.NLS; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.math.BigDecimal; @@ -893,6 +896,7 @@ public class MathUtils { return orderStatisticSearch((int) Math.ceil(list.size()/2), list); } + /* public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { // version of the order statistic calculator for SAMRecord/Integer lists, where the // list index maps to a q-score only through the offset index @@ -937,7 +941,7 @@ public class MathUtils { public static byte getQScoreMedian(List reads, List offsets) { return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); - } + }*/ /** A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that @@ -1355,4 +1359,133 @@ public class MathUtils { public static double log10Factorial (int x) { return log10Gamma(x+1); } + + /** + * Computes the p-value for the null hypothesis that the rows of the table are i.i.d. using a pearson chi-square test + * @param contingencyTable - a contingency table + * @return - the ensuing p-value (via chi-square) + */ + public static double contingencyChiSquare(short[][] contingencyTable ) { + int[] rowSum = new int[contingencyTable.length]; + int[] colSum = new int[contingencyTable[0].length]; + int total = 0; + for ( int i = 0; i < contingencyTable.length; i++ ) { + for ( int j = 0; j < contingencyTable[0].length; j++ ) { + rowSum[i] += contingencyTable[i][j]; + colSum[j] += contingencyTable[i][j]; + total += contingencyTable[i][j]; + } + } + + double chi = 0; + for ( int i = 0; i < contingencyTable.length; i ++ ) { + for ( int j = 0; j < contingencyTable[0].length; j++ ) { + double expected = (((double) colSum[j])/total)*rowSum[i]; + double resid = contingencyTable[i][j] - expected; + chi += resid*resid/expected; + } + } + + return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); + } + + /** + * Exactly the same as above, but using int arrays rather than short arrays on input + * @param contingencyTable + * @return + */ + public static double contingencyChiSquare(int[][] contingencyTable ) { + int[] rowSum = new int[contingencyTable.length]; + int[] colSum = new int[contingencyTable[0].length]; + int total = 0; + for ( int i = 0; i < contingencyTable.length; i++ ) { + for ( int j = 0; j < contingencyTable[0].length; j++ ) { + rowSum[i] += contingencyTable[i][j]; + colSum[j] += contingencyTable[i][j]; + total += contingencyTable[i][j]; + } + } + + double chi = 0; + for ( int i = 0; i < contingencyTable.length; i ++ ) { + for ( int j = 0; j < contingencyTable[0].length; j++ ) { + double expected = (((double) colSum[j])/total)*rowSum[i]; + double resid = contingencyTable[i][j] - expected; + chi += resid*resid/expected; + } + } + + return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); + } + +======= + public static double marginalizedFisherExact(double[] spectrum1, double[] spectrum2, int ns1, int ns2) { + int N = ns1 + ns2; + int[] rowSums = { ns1, ns2 }; + double logP = Double.NEGATIVE_INFINITY; + // todo -- sorting and strobing should chage this n^2 loop to a nlog(n) algorithm + for ( int ac1 = 0; ac1 < spectrum1.length; ac1++ ) { + for ( int ac2 = 0; ac2 < spectrum2.length; ac2++ ) { + double logPTable = spectrum1[ac1] + spectrum2[ac2]; + int[][] table = { + { ac1, ns1-ac1 }, + { ac2, ns2-ac2 } + }; + int[] colSums = { ac1 + ac2, N-ac1-ac2}; + double logPH0 = Math.log10(fisherExact(table,rowSums,colSums,N)); + logP = log10sumLog10(new double[]{logP,logPTable+logPH0}); + } + } + + return Math.pow(10,logP); + } + + /** + * Calculates the p-value for a fisher exact test for a 2x2 contingency table + */ + public static double fisherExact(int[][] table) { + if ( table.length > 2 || table[0].length > 2 ) { + throw new ReviewedStingException("Fisher exact is only implemented for 2x2 contingency tables"); + } + + int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; + int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; + int N = rowSums[0] + rowSums[1]; + + return fisherExact(table,rowSums,colSums,N); + + } + + public static double fisherExact(int[][] table, int[] rowSums, int[] colSums, int N ) { + + // calculate in log space so we don't die with high numbers + double pCutoff = Arithmetic.logFactorial(rowSums[0]) + + Arithmetic.logFactorial(rowSums[1]) + + Arithmetic.logFactorial(colSums[0]) + + Arithmetic.logFactorial(colSums[1]) + - Arithmetic.logFactorial(table[0][0]) + - Arithmetic.logFactorial(table[0][1]) + - Arithmetic.logFactorial(table[1][0]) + - Arithmetic.logFactorial(table[1][1]) + - Arithmetic.logFactorial(N); + return Math.exp(pCutoff); + } + + public static int sumRow(int[][] table, int column) { + int sum = 0; + for (int r = 0; r < table.length; r++) { + sum += table[r][column]; + } + + return sum; + } + + public static int sumColumn(int[][] table, int row) { + int sum = 0; + for (int c = 0; c < table[row].length; c++) { + sum += table[row][c]; + } + + return sum; + } } From 5aa61fefec8442de1826343154e4f83c89704ede Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 15 Aug 2011 16:53:05 -0400 Subject: [PATCH 2/5] Remove merge-added ======'s so this compiles --- public/java/src/org/broadinstitute/sting/utils/MathUtils.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 8c35119dd..75bb0151a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1418,7 +1418,6 @@ public class MathUtils { return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); } -======= public static double marginalizedFisherExact(double[] spectrum1, double[] spectrum2, int ns1, int ns2) { int N = ns1 + ns2; int[] rowSums = { ns1, ns2 }; From 1968b65ca86f6a5bf5142049366462aa86a0768c Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 15 Aug 2011 18:45:21 -0400 Subject: [PATCH 3/5] Revert "Remove merge-added ======'s so this compiles" This reverts commit be028b6513a129f81aa6f3593ea7d396c0e8fc25. --- public/java/src/org/broadinstitute/sting/utils/MathUtils.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 75bb0151a..8c35119dd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1418,6 +1418,7 @@ public class MathUtils { return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); } +======= public static double marginalizedFisherExact(double[] spectrum1, double[] spectrum2, int ns1, int ns2) { int N = ns1 + ns2; int[] rowSums = { ns1, ns2 }; From 3e9ef0622de51e5ffbfb88e0be2a6825f33d43f4 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 15 Aug 2011 18:45:38 -0400 Subject: [PATCH 4/5] Revert "1) RFCombine switched to the new ROD system" This reverts commit cf989bd3cfae119ba9011873c5f5d5b80e37f67b. --- .../filters/VariantFiltrationWalker.java | 9 +- .../genotyper/ExactAFCalculationModel.java | 69 +-------- .../validation/ValidationAmplicons.java | 72 ++-------- .../walkers/variantutils/CombineVariants.java | 6 +- .../broadinstitute/sting/utils/MathUtils.java | 135 +----------------- 5 files changed, 12 insertions(+), 279 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index e34fa772b..c555e88cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -28,9 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.filters; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -50,7 +47,7 @@ import java.util.*; * Filters variant calls using a number of user-selectable, parameterizable criteria. */ @Reference(window=@Window(start=-50,stop=50)) -public class VariantFiltrationWalker extends RodWalker implements TreeReducible { +public class VariantFiltrationWalker extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -281,10 +278,6 @@ public class VariantFiltrationWalker extends RodWalker impleme return sum + value; } - public Integer treeReduce(Integer left, Integer right) { - return left + right; - } - /** * Tell the user the number of loci processed and close out the new variants file. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index fa4863330..cd006a3cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -34,7 +34,6 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import sun.reflect.generics.reflectiveObjects.NotImplementedException; @@ -581,7 +580,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // TODO -- remove me for clarity in this code // // ------------------------------------------------------------------------------------- - public static int gdaN2GoldStandard(Map GLs, + public int gdaN2GoldStandard(Map GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { int numSamples = GLs.size(); @@ -659,70 +658,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - // todo -- generalize and merge into gdaN2GoldStandard - public static int rfaN2GoldStandard(Map GLs, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { - int numSamples = GLs.size(); - int numChr = 2*numSamples; - - double[][] logYMatrix = new double[1+numSamples][1+numChr]; - - for (int i=0; i <=numSamples; i++) - for (int j=0; j <=numChr; j++) - logYMatrix[i][j] = Double.NEGATIVE_INFINITY; - - //YMatrix[0][0] = 1.0; - logYMatrix[0][0] = 0.0; - int j=0; - - for ( Map.Entry sample : GLs.entrySet() ) { - j++; - - //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); - double[] genotypeLikelihoods = sample.getValue().getAsVector(); - //double logDenominator = Math.log10(2.0*j*(2.0*j-1)); - double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - - // special treatment for k=0: iteration reduces to: - //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; - logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA]; - - for (int k=1; k <= 2*j; k++ ) { - - //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; - double logNumerator[]; - logNumerator = new double[3]; - if (k < 2*j-1) - logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] + - genotypeLikelihoods[idxAA]; - else - logNumerator[0] = Double.NEGATIVE_INFINITY; - - - if (k < 2*j) - logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + - genotypeLikelihoods[idxAB]; - else - logNumerator[1] = Double.NEGATIVE_INFINITY; - - if (k > 1) - logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] + - genotypeLikelihoods[idxBB]; - else - logNumerator[2] = Double.NEGATIVE_INFINITY; - - double logNum = MathUtils.softMax(logNumerator); - - //YMatrix[j][k] = num/den; - logYMatrix[j][k] = logNum - logDenominator; - } - - } - - for (int k=0; k <= numChr; k++) - log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; - - return numChr; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 82c5fc593..7653f511f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -31,77 +31,21 @@ import java.util.LinkedList; import java.util.List; /** - * Creates FASTA sequences for use in Seqenom or PCR utilities for site amplification and subsequent validation - * - *

- * ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe - * sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within - * the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies - * reasons why the site may fail validation (nearby variation, for example). - *

- * - *

Input

- *

- * Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an - * interval list defining the size of the amplicons around the sites to be validated - *

- * - *

Output

- *

- * Output is a FASTA-formatted file with some modifications at probe sites. For instance: - * - * >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414 - * CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC - * >20:792122 Valid 20_792122 - * TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT - * >20:994145 Valid 20_994145 - * TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC - * >20:1074230 SITE_IS_FILTERED=1, 20_1074230 - * ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC - * >20:1084330 DELETION=1, 20_1084330 - * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC - * - * are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be: - * - * Valid // amplicon is valid - * SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant") - * VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak - * MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon - * DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate - * DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak - * START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer - * END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer - * NO_VARIANTS_FOUND, // no variants found within the amplicon region - * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself) - *

- * - *

Examples

- * PRE-TAG - * java - * -jar GenomeAnalysisTK.jar - * -T ValidationAmplicons - * -R /humgen/1kg/reference/human_g1k_v37.fasta - * -BTI ProbeIntervals - * -ProbeIntervals:table interval_table.table - * -ValidateAlleles:vcf sites_to_validate.vcf - * -MaskAlleles:vcf mask_sites.vcf - * --virtualPrimerSize 30 - * -o probes.fasta - * PRE-TAG - * - * @author chartl - * @since July 2011 + * Created by IntelliJ IDEA. + * User: chartl + * Date: 6/13/11 + * Time: 2:12 PM + * To change this template use File | Settings | File Templates. */ @Requires(value={DataSource.REFERENCE}) public class ValidationAmplicons extends RodWalker { - @Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+ - "intervals surrounding the probe sites amplicons should be designed for", required=true) + @Input(fullName = "ProbeIntervals", doc="Chris document me", required=true) RodBinding probeIntervals; - @Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true) + @Input(fullName = "ValidateAlleles", doc="Chris document me", required=true) RodBinding validateAlleles; - @Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true) + @Input(fullName = "MaskAlleles", doc="Chris document me", required=true) RodBinding maskAlleles; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index d46387084..fb172e1b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -31,8 +31,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.SampleUtils; @@ -51,7 +49,7 @@ import java.util.*; * priority list (if provided), emits a single record instance at every position represented in the rods. */ @Reference(window=@Window(start=-50,stop=50)) -public class CombineVariants extends RodWalker implements TreeReducible{ +public class CombineVariants extends RodWalker { /** * The VCF files to merge together * @@ -212,8 +210,6 @@ public class CombineVariants extends RodWalker implements Tree return 0; } - public Integer treeReduce(Integer left, Integer right) { return left + right; } - public Integer reduce(Integer counter, Integer sum) { return counter + sum; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 8c35119dd..e197bb973 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -25,13 +25,10 @@ package org.broadinstitute.sting.utils; -import cern.jet.random.ChiSquare; -import cern.jet.math.Arithmetic; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.apache.lucene.messages.NLS; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.math.BigDecimal; @@ -896,7 +893,6 @@ public class MathUtils { return orderStatisticSearch((int) Math.ceil(list.size()/2), list); } - /* public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { // version of the order statistic calculator for SAMRecord/Integer lists, where the // list index maps to a q-score only through the offset index @@ -941,7 +937,7 @@ public class MathUtils { public static byte getQScoreMedian(List reads, List offsets) { return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); - }*/ + } /** A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that @@ -1359,133 +1355,4 @@ public class MathUtils { public static double log10Factorial (int x) { return log10Gamma(x+1); } - - /** - * Computes the p-value for the null hypothesis that the rows of the table are i.i.d. using a pearson chi-square test - * @param contingencyTable - a contingency table - * @return - the ensuing p-value (via chi-square) - */ - public static double contingencyChiSquare(short[][] contingencyTable ) { - int[] rowSum = new int[contingencyTable.length]; - int[] colSum = new int[contingencyTable[0].length]; - int total = 0; - for ( int i = 0; i < contingencyTable.length; i++ ) { - for ( int j = 0; j < contingencyTable[0].length; j++ ) { - rowSum[i] += contingencyTable[i][j]; - colSum[j] += contingencyTable[i][j]; - total += contingencyTable[i][j]; - } - } - - double chi = 0; - for ( int i = 0; i < contingencyTable.length; i ++ ) { - for ( int j = 0; j < contingencyTable[0].length; j++ ) { - double expected = (((double) colSum[j])/total)*rowSum[i]; - double resid = contingencyTable[i][j] - expected; - chi += resid*resid/expected; - } - } - - return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); - } - - /** - * Exactly the same as above, but using int arrays rather than short arrays on input - * @param contingencyTable - * @return - */ - public static double contingencyChiSquare(int[][] contingencyTable ) { - int[] rowSum = new int[contingencyTable.length]; - int[] colSum = new int[contingencyTable[0].length]; - int total = 0; - for ( int i = 0; i < contingencyTable.length; i++ ) { - for ( int j = 0; j < contingencyTable[0].length; j++ ) { - rowSum[i] += contingencyTable[i][j]; - colSum[j] += contingencyTable[i][j]; - total += contingencyTable[i][j]; - } - } - - double chi = 0; - for ( int i = 0; i < contingencyTable.length; i ++ ) { - for ( int j = 0; j < contingencyTable[0].length; j++ ) { - double expected = (((double) colSum[j])/total)*rowSum[i]; - double resid = contingencyTable[i][j] - expected; - chi += resid*resid/expected; - } - } - - return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); - } - -======= - public static double marginalizedFisherExact(double[] spectrum1, double[] spectrum2, int ns1, int ns2) { - int N = ns1 + ns2; - int[] rowSums = { ns1, ns2 }; - double logP = Double.NEGATIVE_INFINITY; - // todo -- sorting and strobing should chage this n^2 loop to a nlog(n) algorithm - for ( int ac1 = 0; ac1 < spectrum1.length; ac1++ ) { - for ( int ac2 = 0; ac2 < spectrum2.length; ac2++ ) { - double logPTable = spectrum1[ac1] + spectrum2[ac2]; - int[][] table = { - { ac1, ns1-ac1 }, - { ac2, ns2-ac2 } - }; - int[] colSums = { ac1 + ac2, N-ac1-ac2}; - double logPH0 = Math.log10(fisherExact(table,rowSums,colSums,N)); - logP = log10sumLog10(new double[]{logP,logPTable+logPH0}); - } - } - - return Math.pow(10,logP); - } - - /** - * Calculates the p-value for a fisher exact test for a 2x2 contingency table - */ - public static double fisherExact(int[][] table) { - if ( table.length > 2 || table[0].length > 2 ) { - throw new ReviewedStingException("Fisher exact is only implemented for 2x2 contingency tables"); - } - - int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; - int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; - int N = rowSums[0] + rowSums[1]; - - return fisherExact(table,rowSums,colSums,N); - - } - - public static double fisherExact(int[][] table, int[] rowSums, int[] colSums, int N ) { - - // calculate in log space so we don't die with high numbers - double pCutoff = Arithmetic.logFactorial(rowSums[0]) - + Arithmetic.logFactorial(rowSums[1]) - + Arithmetic.logFactorial(colSums[0]) - + Arithmetic.logFactorial(colSums[1]) - - Arithmetic.logFactorial(table[0][0]) - - Arithmetic.logFactorial(table[0][1]) - - Arithmetic.logFactorial(table[1][0]) - - Arithmetic.logFactorial(table[1][1]) - - Arithmetic.logFactorial(N); - return Math.exp(pCutoff); - } - - public static int sumRow(int[][] table, int column) { - int sum = 0; - for (int r = 0; r < table.length; r++) { - sum += table[r][column]; - } - - return sum; - } - - public static int sumColumn(int[][] table, int row) { - int sum = 0; - for (int c = 0; c < table[row].length; c++) { - sum += table[row][c]; - } - - return sum; - } }