1) RFCombine switched to the new ROD system
2) TreeReduce added to useful RODWalkers, but doesn't help very much due to scaling problems 3) RFA refactored, and a genotype-free calculation model added to calculate skew in a genotype-free way (still needs generalization to any ploidy) 4) Added walker to genotype intron loss events, calls into the UG engine to do so. This is very much a first-pass walker. 5) Documentation added for ValidationAmplicons
This commit is contained in:
parent
1246b89049
commit
cf3e826a69
|
|
@ -28,6 +28,9 @@ package org.broadinstitute.sting.gatk.walkers.filters;
|
|||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -47,7 +50,7 @@ import java.util.*;
|
|||
* Filters variant calls using a number of user-selectable, parameterizable criteria.
|
||||
*/
|
||||
@Reference(window=@Window(start=-50,stop=50))
|
||||
public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
||||
public class VariantFiltrationWalker extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
|
||||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
|
@ -278,6 +281,10 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
return sum + value;
|
||||
}
|
||||
|
||||
public Integer treeReduce(Integer left, Integer right) {
|
||||
return left + right;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tell the user the number of loci processed and close out the new variants file.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import sun.reflect.generics.reflectiveObjects.NotImplementedException;
|
||||
|
||||
|
|
@ -580,7 +581,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// TODO -- remove me for clarity in this code
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
public int gdaN2GoldStandard(Map<String, Genotype> GLs,
|
||||
public static int gdaN2GoldStandard(Map<String, Genotype> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||
int numSamples = GLs.size();
|
||||
|
|
@ -658,4 +659,70 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
// todo -- generalize and merge into gdaN2GoldStandard
|
||||
public static int rfaN2GoldStandard(Map<String, GenotypeLikelihoods> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||
int numSamples = GLs.size();
|
||||
int numChr = 2*numSamples;
|
||||
|
||||
double[][] logYMatrix = new double[1+numSamples][1+numChr];
|
||||
|
||||
for (int i=0; i <=numSamples; i++)
|
||||
for (int j=0; j <=numChr; j++)
|
||||
logYMatrix[i][j] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
//YMatrix[0][0] = 1.0;
|
||||
logYMatrix[0][0] = 0.0;
|
||||
int j=0;
|
||||
|
||||
for ( Map.Entry<String, GenotypeLikelihoods> sample : GLs.entrySet() ) {
|
||||
j++;
|
||||
|
||||
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
|
||||
double[] genotypeLikelihoods = sample.getValue().getAsVector();
|
||||
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
|
||||
double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
// special treatment for k=0: iteration reduces to:
|
||||
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA];
|
||||
|
||||
for (int k=1; k <= 2*j; k++ ) {
|
||||
|
||||
//double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
double logNumerator[];
|
||||
logNumerator = new double[3];
|
||||
if (k < 2*j-1)
|
||||
logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
|
||||
genotypeLikelihoods[idxAA];
|
||||
else
|
||||
logNumerator[0] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
|
||||
if (k < 2*j)
|
||||
logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
|
||||
genotypeLikelihoods[idxAB];
|
||||
else
|
||||
logNumerator[1] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
if (k > 1)
|
||||
logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] +
|
||||
genotypeLikelihoods[idxBB];
|
||||
else
|
||||
logNumerator[2] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
double logNum = MathUtils.softMax(logNumerator);
|
||||
|
||||
//YMatrix[j][k] = num/den;
|
||||
logYMatrix[j][k] = logNum - logDenominator;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for (int k=0; k <= numChr; k++)
|
||||
log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
|
||||
|
||||
return numChr;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,21 +31,77 @@ import java.util.LinkedList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: 6/13/11
|
||||
* Time: 2:12 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* Creates FASTA sequences for use in Seqenom or PCR utilities for site amplification and subsequent validation
|
||||
*
|
||||
* <p>
|
||||
* ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe
|
||||
* sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within
|
||||
* the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies
|
||||
* reasons why the site may fail validation (nearby variation, for example).
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an
|
||||
* interval list defining the size of the amplicons around the sites to be validated
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Output is a FASTA-formatted file with some modifications at probe sites. For instance:
|
||||
*
|
||||
* >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414
|
||||
* CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC
|
||||
* >20:792122 Valid 20_792122
|
||||
* TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT
|
||||
* >20:994145 Valid 20_994145
|
||||
* TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC
|
||||
* >20:1074230 SITE_IS_FILTERED=1, 20_1074230
|
||||
* ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC
|
||||
* >20:1084330 DELETION=1, 20_1084330
|
||||
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
|
||||
*
|
||||
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
|
||||
*
|
||||
* Valid // amplicon is valid
|
||||
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
|
||||
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
|
||||
* MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon
|
||||
* DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate
|
||||
* DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak
|
||||
* START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer
|
||||
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
|
||||
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
|
||||
* INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* PRE-TAG
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T ValidationAmplicons
|
||||
* -R /humgen/1kg/reference/human_g1k_v37.fasta
|
||||
* -BTI ProbeIntervals
|
||||
* -ProbeIntervals:table interval_table.table
|
||||
* -ValidateAlleles:vcf sites_to_validate.vcf
|
||||
* -MaskAlleles:vcf mask_sites.vcf
|
||||
* --virtualPrimerSize 30
|
||||
* -o probes.fasta
|
||||
* PRE-TAG
|
||||
*
|
||||
* @author chartl
|
||||
* @since July 2011
|
||||
*/
|
||||
@Requires(value={DataSource.REFERENCE})
|
||||
public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
||||
@Input(fullName = "ProbeIntervals", doc="Chris document me", required=true)
|
||||
@Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+
|
||||
"intervals surrounding the probe sites amplicons should be designed for", required=true)
|
||||
RodBinding<TableFeature> probeIntervals;
|
||||
|
||||
@Input(fullName = "ValidateAlleles", doc="Chris document me", required=true)
|
||||
@Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true)
|
||||
RodBinding<VariantContext> validateAlleles;
|
||||
|
||||
@Input(fullName = "MaskAlleles", doc="Chris document me", required=true)
|
||||
@Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true)
|
||||
RodBinding<VariantContext> maskAlleles;
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
|
|
@ -49,7 +51,7 @@ import java.util.*;
|
|||
* priority list (if provided), emits a single record instance at every position represented in the rods.
|
||||
*/
|
||||
@Reference(window=@Window(start=-50,stop=50))
|
||||
public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||
public class CombineVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer>{
|
||||
/**
|
||||
* The VCF files to merge together
|
||||
*
|
||||
|
|
@ -210,6 +212,8 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
public Integer treeReduce(Integer left, Integer right) { return left + right; }
|
||||
|
||||
public Integer reduce(Integer counter, Integer sum) {
|
||||
return counter + sum;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,10 +25,13 @@
|
|||
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import cern.jet.random.ChiSquare;
|
||||
import cern.jet.math.Arithmetic;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.lucene.messages.NLS;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
|
|
@ -893,6 +896,7 @@ public class MathUtils {
|
|||
return orderStatisticSearch((int) Math.ceil(list.size()/2), list);
|
||||
}
|
||||
|
||||
/*
|
||||
public static byte getQScoreOrderStatistic(List<SAMRecord> reads, List<Integer> offsets, int k) {
|
||||
// version of the order statistic calculator for SAMRecord/Integer lists, where the
|
||||
// list index maps to a q-score only through the offset index
|
||||
|
|
@ -937,7 +941,7 @@ public class MathUtils {
|
|||
|
||||
public static byte getQScoreMedian(List<SAMRecord> reads, List<Integer> offsets) {
|
||||
return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.));
|
||||
}
|
||||
}*/
|
||||
|
||||
/** A utility class that computes on the fly average and standard deviation for a stream of numbers.
|
||||
* The number of observations does not have to be known in advance, and can be also very big (so that
|
||||
|
|
@ -1355,4 +1359,133 @@ public class MathUtils {
|
|||
public static double log10Factorial (int x) {
|
||||
return log10Gamma(x+1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the p-value for the null hypothesis that the rows of the table are i.i.d. using a pearson chi-square test
|
||||
* @param contingencyTable - a contingency table
|
||||
* @return - the ensuing p-value (via chi-square)
|
||||
*/
|
||||
public static double contingencyChiSquare(short[][] contingencyTable ) {
|
||||
int[] rowSum = new int[contingencyTable.length];
|
||||
int[] colSum = new int[contingencyTable[0].length];
|
||||
int total = 0;
|
||||
for ( int i = 0; i < contingencyTable.length; i++ ) {
|
||||
for ( int j = 0; j < contingencyTable[0].length; j++ ) {
|
||||
rowSum[i] += contingencyTable[i][j];
|
||||
colSum[j] += contingencyTable[i][j];
|
||||
total += contingencyTable[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
double chi = 0;
|
||||
for ( int i = 0; i < contingencyTable.length; i ++ ) {
|
||||
for ( int j = 0; j < contingencyTable[0].length; j++ ) {
|
||||
double expected = (((double) colSum[j])/total)*rowSum[i];
|
||||
double resid = contingencyTable[i][j] - expected;
|
||||
chi += resid*resid/expected;
|
||||
}
|
||||
}
|
||||
|
||||
return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi);
|
||||
}
|
||||
|
||||
/**
|
||||
* Exactly the same as above, but using int arrays rather than short arrays on input
|
||||
* @param contingencyTable
|
||||
* @return
|
||||
*/
|
||||
public static double contingencyChiSquare(int[][] contingencyTable ) {
|
||||
int[] rowSum = new int[contingencyTable.length];
|
||||
int[] colSum = new int[contingencyTable[0].length];
|
||||
int total = 0;
|
||||
for ( int i = 0; i < contingencyTable.length; i++ ) {
|
||||
for ( int j = 0; j < contingencyTable[0].length; j++ ) {
|
||||
rowSum[i] += contingencyTable[i][j];
|
||||
colSum[j] += contingencyTable[i][j];
|
||||
total += contingencyTable[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
double chi = 0;
|
||||
for ( int i = 0; i < contingencyTable.length; i ++ ) {
|
||||
for ( int j = 0; j < contingencyTable[0].length; j++ ) {
|
||||
double expected = (((double) colSum[j])/total)*rowSum[i];
|
||||
double resid = contingencyTable[i][j] - expected;
|
||||
chi += resid*resid/expected;
|
||||
}
|
||||
}
|
||||
|
||||
return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi);
|
||||
}
|
||||
|
||||
=======
|
||||
public static double marginalizedFisherExact(double[] spectrum1, double[] spectrum2, int ns1, int ns2) {
|
||||
int N = ns1 + ns2;
|
||||
int[] rowSums = { ns1, ns2 };
|
||||
double logP = Double.NEGATIVE_INFINITY;
|
||||
// todo -- sorting and strobing should chage this n^2 loop to a nlog(n) algorithm
|
||||
for ( int ac1 = 0; ac1 < spectrum1.length; ac1++ ) {
|
||||
for ( int ac2 = 0; ac2 < spectrum2.length; ac2++ ) {
|
||||
double logPTable = spectrum1[ac1] + spectrum2[ac2];
|
||||
int[][] table = {
|
||||
{ ac1, ns1-ac1 },
|
||||
{ ac2, ns2-ac2 }
|
||||
};
|
||||
int[] colSums = { ac1 + ac2, N-ac1-ac2};
|
||||
double logPH0 = Math.log10(fisherExact(table,rowSums,colSums,N));
|
||||
logP = log10sumLog10(new double[]{logP,logPTable+logPH0});
|
||||
}
|
||||
}
|
||||
|
||||
return Math.pow(10,logP);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the p-value for a fisher exact test for a 2x2 contingency table
|
||||
*/
|
||||
public static double fisherExact(int[][] table) {
|
||||
if ( table.length > 2 || table[0].length > 2 ) {
|
||||
throw new ReviewedStingException("Fisher exact is only implemented for 2x2 contingency tables");
|
||||
}
|
||||
|
||||
int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
|
||||
int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
|
||||
int N = rowSums[0] + rowSums[1];
|
||||
|
||||
return fisherExact(table,rowSums,colSums,N);
|
||||
|
||||
}
|
||||
|
||||
public static double fisherExact(int[][] table, int[] rowSums, int[] colSums, int N ) {
|
||||
|
||||
// calculate in log space so we don't die with high numbers
|
||||
double pCutoff = Arithmetic.logFactorial(rowSums[0])
|
||||
+ Arithmetic.logFactorial(rowSums[1])
|
||||
+ Arithmetic.logFactorial(colSums[0])
|
||||
+ Arithmetic.logFactorial(colSums[1])
|
||||
- Arithmetic.logFactorial(table[0][0])
|
||||
- Arithmetic.logFactorial(table[0][1])
|
||||
- Arithmetic.logFactorial(table[1][0])
|
||||
- Arithmetic.logFactorial(table[1][1])
|
||||
- Arithmetic.logFactorial(N);
|
||||
return Math.exp(pCutoff);
|
||||
}
|
||||
|
||||
public static int sumRow(int[][] table, int column) {
|
||||
int sum = 0;
|
||||
for (int r = 0; r < table.length; r++) {
|
||||
sum += table[r][column];
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static int sumColumn(int[][] table, int row) {
|
||||
int sum = 0;
|
||||
for (int c = 0; c < table[row].length; c++) {
|
||||
sum += table[row][c];
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue