Intermediate check-in for ReducedReads with HaplotypeCaller: change the minimum read count over a k-mer to the average count over the k-mer when assembling a reduced read (not optimal; currently trying max, and will then decide on the best approach); fix merge conflicts.

This commit is contained in:
Guillermo del Angel 2012-08-08 12:07:33 -04:00
commit 3e2752667c
12 changed files with 173 additions and 41 deletions

View File

@ -55,12 +55,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
}
@Test
public void testINDEL_maxAlleles2_ploidy3_Pools_noRef() {
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","96087fe9240e3656cc2a4e0ff0174d5b");
}
@Test
public void testINDEL_maxAlleles2_ploidy1_Pools_noRef() {
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","6fdae7093831ecfc82a06dd707d62fe9");
}

View File

@ -130,8 +130,8 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) {
if ( getArgumentCollection().gatkKeyFile == null ) {
throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " +
"Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " +
"for more information and instructions on how to obtain a key.");
"Please see " + GATKRunReport.PHONE_HOME_DOCS_URL +
" for more information and instructions on how to obtain a key.");
}
else {
PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey();

View File

@ -66,10 +66,10 @@ public class GATKArgumentCollection {
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null;
@Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
@Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
@Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
@Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
public File gatkKeyFile = null;
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)

View File

@ -88,6 +88,7 @@ public class GATKRunReport {
// number of milliseconds before the S3 put operation is timed-out:
private static final long S3PutTimeOut = 10 * 1000;
public static final String PHONE_HOME_DOCS_URL = "http://gatkforums.broadinstitute.org/discussion/1250/what-is-phone-home-and-how-does-it-affect-me#latest";
/**
* our log

View File

@ -351,7 +351,9 @@ public class PairHMMIndelErrorModel {
previousHaplotypeSeen = haplotypeBases.clone();
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals,
contextLogGapOpenProbabilities, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities,
(read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities),
(read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities),
contextLogGapContinuationProbabilities,
startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray);

View File

@ -235,7 +235,7 @@ public class VariantDataManager {
double value;
try {
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
value = vc.getAttributeAsDouble( annotationKey, Double.NaN );
if( Double.isInfinite(value) ) { value = Double.NaN; }
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();

View File

@ -167,7 +167,6 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
// create the config offsets
if ( ! header.getContigLines().isEmpty() ) {
logger.info("Found contig lines in BCF2 file, using those");
contigNames.clear();
for ( final VCFContigHeaderLine contig : header.getContigLines()) {
if ( contig.getID() == null || contig.getID().equals("") )

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.exceptions;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -351,8 +352,8 @@ public class UserException extends ReviewedStingException {
public static class UnreadableKeyException extends UserException {
public UnreadableKeyException ( File f, Exception e ) {
super(String.format("Key file %s cannot be read (possibly the key file is corrupt?). Error was: %s. " +
"Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for help.",
f.getAbsolutePath(), getMessage(e)));
"Please see %s for help.",
f.getAbsolutePath(), getMessage(e), GATKRunReport.PHONE_HOME_DOCS_URL));
}
public UnreadableKeyException ( String message, Exception e ) {
@ -361,8 +362,8 @@ public class UserException extends ReviewedStingException {
public UnreadableKeyException ( String message ) {
super(String.format("Key file cannot be read (possibly the key file is corrupt?): %s. " +
"Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for help.",
message));
"Please see %s for help.",
message, GATKRunReport.PHONE_HOME_DOCS_URL));
}
}
@ -370,9 +371,8 @@ public class UserException extends ReviewedStingException {
public KeySignatureVerificationException ( File f ) {
super(String.format("The signature in key file %s failed cryptographic verification. " +
"If this key was valid in the past, it's likely been revoked. " +
"Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " +
"for help.",
f.getAbsolutePath()));
"Please see %s for help.",
f.getAbsolutePath(), GATKRunReport.PHONE_HOME_DOCS_URL));
}
}
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils.recalibration;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.commons.math.MathException;
import org.apache.commons.math.stat.inference.ChiSquareTestImpl;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.collections.Pair;
@ -19,6 +20,7 @@ import java.util.Set;
* @since 07/27/12
*/
public class RecalDatumNode<T extends RecalDatum> {
private final static double SMALLEST_CHI2_PVALUE = 1e-300;
protected static Logger logger = Logger.getLogger(RecalDatumNode.class);
/**
@ -150,8 +152,6 @@ public class RecalDatumNode<T extends RecalDatum> {
* definition have 0 penalty unless they represent a pruned tree with underlying -- but now
* pruned -- subtrees
*
* TODO -- can we really just add together the chi2 values?
*
* @return
*/
public double totalPenalty() {
@ -169,10 +169,10 @@ public class RecalDatumNode<T extends RecalDatum> {
* The maximum penalty among all nodes
* @return
*/
public double maxPenalty() {
double max = getPenalty();
public double maxPenalty(final boolean leafOnly) {
double max = ! leafOnly || isLeaf() ? getPenalty() : Double.MIN_VALUE;
for ( final RecalDatumNode<T> sub : subnodes )
max = Math.max(max, sub.maxPenalty());
max = Math.max(max, sub.maxPenalty(leafOnly));
return max;
}
@ -180,10 +180,10 @@ public class RecalDatumNode<T extends RecalDatum> {
* The minimum penalty among all nodes
* @return
*/
public double minPenalty() {
double min = getPenalty();
public double minPenalty(final boolean leafOnly) {
double min = ! leafOnly || isLeaf() ? getPenalty() : Double.MAX_VALUE;
for ( final RecalDatumNode<T> sub : subnodes )
min = Math.min(min, sub.minPenalty());
min = Math.min(min, sub.minPenalty(leafOnly));
return min;
}
@ -244,14 +244,15 @@ public class RecalDatumNode<T extends RecalDatum> {
}
/**
* Calculate the chi^2 penalty among subnodes of this node. The chi^2 value
* indicates the degree of independence of the implied error rates among the
Calculate the phred-scaled p-value for a chi^2 test for independence among the subnodes of this node.
*
* The chi^2 value indicates the degree of independence of the implied error rates among the
* immediate subnodes
*
* @return the chi2 penalty, or 0.0 if it cannot be calculated
* @return the phred-scaled p-value for chi2 penalty, or 0.0 if it cannot be calculated
*/
private double calcPenalty() {
if ( isLeaf() )
if ( isLeaf() || freeToMerge() )
return 0.0;
else if ( subnodes.size() == 1 )
// only one value, so its free to merge away
@ -267,13 +268,34 @@ public class RecalDatumNode<T extends RecalDatum> {
i++;
}
final double chi2 = new ChiSquareTestImpl().chiSquare(counts);
try {
final double chi2PValue = new ChiSquareTestImpl().chiSquareTest(counts);
final double penalty = -10 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE));
// make sure things are reasonable and fail early if not
if (Double.isInfinite(chi2) || Double.isNaN(chi2))
throw new ReviewedStingException("chi2 value is " + chi2 + " at " + getRecalDatum());
// make sure things are reasonable and fail early if not
if (Double.isInfinite(penalty) || Double.isNaN(penalty))
throw new ReviewedStingException("chi2 value is " + chi2PValue + " at " + getRecalDatum());
return chi2;
return penalty;
} catch ( MathException e ) {
throw new ReviewedStingException("Failed in calculating chi2 value", e);
}
}
}
/**
 * Is this node free to merge because its rounded Q score is the same as that of all nodes below it?
 *
 * Leaves are trivially free to merge.  An internal node is free to merge only when every
 * immediate subnode reports the same empirical quality (compared as a rounded byte) as this
 * node itself, so collapsing the subtree cannot change any reported Q score.
 *
 * @return true if this node may be merged without changing any rounded empirical quality
 */
private boolean freeToMerge() {
if ( isLeaf() ) // leaves are free to merge
return true;
else {
// compare byte-rounded empirical qualities: subnodes must all agree with this node
final byte myQual = getRecalDatum().getEmpiricalQualityAsByte();
for ( final RecalDatumNode<T> sub : subnodes )
if ( sub.getRecalDatum().getEmpiricalQualityAsByte() != myQual )
return false;
return true;
}
}
@ -346,14 +368,56 @@ public class RecalDatumNode<T extends RecalDatum> {
while ( root.size() > maxElements ) {
// remove the lowest penalty element, and continue
root = root.removeLowestPenaltyNode();
if ( logger.isDebugEnabled() )
logger.debug("pruneByPenalty root size is now " + root.size() + " of max " + maxElements);
}
// our size is below the target, so we are good, return
return root;
}
/**
 * Return a freshly allocated tree where all mergeable nodes with penalty &lt; maxPenalty are merged.
 *
 * Note that nodes must have fixed penalties, or this algorithm will fail.
 *
 * @param maxPenaltyIn the maximum penalty we are allowed to incur for a merge
 * @param applyBonferroniCorrection if true, we will adjust the penalty by the phred-scaled
 *                                  Bonferroni correction for the size of the initial tree.
 *                                  That is, if there are 10 nodes in the tree and maxPenalty
 *                                  is 20 (p = 1e-2), we actually enforce p = 1e-2 / 10 = 1e-3,
 *                                  i.e. a phred-scaled penalty of 30, to correct for multiple testing
 * @return the root of the pruned tree
 */
public RecalDatumNode<T> pruneToNoMoreThanPenalty(final double maxPenaltyIn, final boolean applyBonferroniCorrection) {
    RecalDatumNode<T> root = this;

    // phred-scaled multiple-testing correction: -10*log10(1/size) = 10*log10(size)
    final double bonferroniCorrection = 10 * Math.log10(this.size());
    final double maxPenalty = applyBonferroniCorrection ? maxPenaltyIn + bonferroniCorrection : maxPenaltyIn;

    if ( applyBonferroniCorrection )
        logger.info(String.format("Applying Bonferroni correction for %d nodes = %.2f to initial penalty %.2f for total " +
                "corrected max penalty of %.2f", this.size(), bonferroniCorrection, maxPenaltyIn, maxPenalty));

    while ( true ) {
        final Pair<RecalDatumNode<T>, Double> minPenaltyNode = root.getMinPenaltyAboveLeafNode();

        if ( minPenaltyNode == null || minPenaltyNode.getSecond() > maxPenalty ) {
            // nothing to merge, or the best candidate is above our max allowed.
            // Braces are required here: without them the else binds to the inner
            // isDebugEnabled() check (dangling else), and minPenaltyNode.getFirst()
            // would NPE when minPenaltyNode is null and debug logging is disabled.
            if ( minPenaltyNode == null ) {
                if ( logger.isDebugEnabled() ) logger.debug("Stopping because no candidates could be found");
            } else {
                if ( logger.isDebugEnabled() ) logger.debug("Stopping because node " + minPenaltyNode.getFirst() + " has penalty " + minPenaltyNode.getSecond() + " > max " + maxPenalty);
            }
            break;
        } else {
            // remove the lowest penalty element, and continue
            if ( logger.isDebugEnabled() ) logger.debug("Removing node " + minPenaltyNode.getFirst() + " with penalty " + minPenaltyNode.getSecond());
            root = root.removeLowestPenaltyNode();
        }
    }

    // no more candidates exist with penalty < maxPenalty
    return root;
}
/**
* Find the lowest penalty above leaf node in the tree, and return a tree without it
*
@ -363,7 +427,8 @@ public class RecalDatumNode<T extends RecalDatum> {
*/
private RecalDatumNode<T> removeLowestPenaltyNode() {
final Pair<RecalDatumNode<T>, Double> nodeToRemove = getMinPenaltyAboveLeafNode();
//logger.info("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond());
if ( logger.isDebugEnabled() )
logger.debug("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond());
final Pair<RecalDatumNode<T>, Boolean> result = removeNode(nodeToRemove.getFirst());
@ -379,7 +444,7 @@ public class RecalDatumNode<T extends RecalDatum> {
/**
* Finds in the tree the node with the lowest penalty whose subnodes are all leaves
*
* @return
* @return the node and its penalty, or null if no such node exists
*/
private Pair<RecalDatumNode<T>, Double> getMinPenaltyAboveLeafNode() {
if ( isLeaf() )

View File

@ -216,6 +216,7 @@ final class CommonInfo {
Object x = getAttribute(key);
if ( x == null ) return defaultValue;
if ( x instanceof Double ) return (Double)x;
if ( x instanceof Integer ) return (Integer)x;
return Double.valueOf((String)x); // throws an exception if this isn't a string
}

View File

@ -355,6 +355,19 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec);
}
@Test
public void testBaseIndelQualityScores() {
    // UG over a BQSRv2 bam carrying base insertion/deletion qualities, pinned to a recorded MD5
    WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
            baseCommandIndelsb37 +
                    " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam" +
                    " -o %s" +
                    " -L 20:10,000,000-10,100,000",
            1,
            Arrays.asList("b3c923ed9efa04b85fc18a9b45c8d2a6"));
    // String.format() with no format arguments was a no-op wrapper; pass the literal directly
    executeTest("test UG with base indel quality scores", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing SnpEff

View File

@ -13,7 +13,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
String recalMD5;
String cutVCFMD5;
public VRTest(String inVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) {
this.inVCF = validationDataLocation + inVCF;
this.inVCF = inVCF;
this.tranchesMD5 = tranchesMD5;
this.recalMD5 = recalMD5;
this.cutVCFMD5 = cutVCFMD5;
@ -25,7 +25,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
}
}
VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
"f360ce3eb2b0b887301be917a9843e2b", // tranches
"287fea5ea066bf3fdd71f5ce9b58eab3", // recal file
"356b9570817b9389da71fbe991d8b2f5"); // cut VCF
@ -74,14 +74,65 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
executeTest("testApplyRecalibration-"+params.inVCF, spec);
}
VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
"a8ce3cd3dccafdf7d580bcce7d660a9a", // tranches
"1cdf8c9ee77d91d1ba7f002573108bad", // recal file
"62fda105e14b619a1c263855cf56af1d"); // cut VCF
// TestNG data provider for the BCF-based VQSR round-trip tests below
@DataProvider(name = "VRBCFTest")
public Object[][] createVRBCFTest() {
return new Object[][]{ {bcfTest} };
//return new Object[][]{ {yriTrio}, {lowPass} }; // Add hg19 chr20 trio calls here
}
// Runs VariantRecalibrator on a BCF input over 20:10,000,000-20,000,000 and checks the
// produced recal (bcf) and tranches (txt) files against the MD5s recorded in params.
@Test(dataProvider = "VRBCFTest")
public void testVariantRecalibratorWithBCF(VRTest params) {
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" +
" -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" +
" -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" +
" -T VariantRecalibrator" +
" -input " + params.inVCF +
" -L 20:10,000,000-20,000,000" +
" --no_cmdline_in_header" +
" -an AC " + // integer value
" -an QD -an ReadPosRankSum -an FS -an InbreedingCoeff " + // floats value
" -mG 2 "+
" -recalFile %s" +
" -tranchesFile %s",
2,
Arrays.asList("bcf", "txt"),
Arrays.asList(params.recalMD5, params.tranchesMD5));
executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst();
}
// Applies the recal/tranches files produced by testVariantRecalibratorWithBCF (looked up
// from the MD5 database by their recorded MD5s) and checks the cut VCF against params.cutVCFMD5.
// Depends on the recalibrator test above having run first.
@Test(dataProvider = "VRBCFTest", dependsOnMethods="testVariantRecalibratorWithBCF")
public void testApplyRecalibrationWithBCF(VRTest params) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b37KGReference +
" -T ApplyRecalibration" +
" -L 20:10,000,000-20,000,000" +
" --no_cmdline_in_header" +
" -input " + params.inVCF +
" -U LENIENT_VCF_PROCESSING -o %s" +
" -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) +
" -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null),
Arrays.asList(params.cutVCFMD5));
spec.disableShadowBCF();
executeTest("testApplyRecalibration-"+params.inVCF, spec);
}
VRTest indelUnfiltered = new VRTest(
"combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as .
validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as .
"b7589cd098dc153ec64c02dcff2838e4", // tranches
"a04a9001f62eff43d363f4d63769f3ee", // recal file
"64f576881e21323dd4078262604717a2"); // cut VCF
VRTest indelFiltered = new VRTest(
"combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS
validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS
"b7589cd098dc153ec64c02dcff2838e4", // tranches
"a04a9001f62eff43d363f4d63769f3ee", // recal file
"af22c55d91394c56a222fd40d6d54781"); // cut VCF