Improvements to indel analysis capabilities of VariantEval

-- Now calculates the number of Indels overlapping gold standard sites, as well as the percent of indels overlapping gold standard sites
-- Removed insertion : deletion ratio for 1 bp event, replaced it with 1 + 2 : 3 bp ratio for insertions and deletions separately.  This is based on an old email from Mark Daly:

    // - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a
    // downstream frameshift, if we make the simplifying assumptions that 3 bp ins
    // and 3bp del (adding/subtracting 1 AA in general) are roughly comparably
    // selected against, we should see a consistent 1+2 : 3 bp ratio for insertions
    // as for deletions, and certainly would expect consistency between in/dels that
    // multiple methods find and in/dels that are unique to one method  (since deletions
    // are more common and the artifacts differ, it is probably worth looking at the totals,
    // overlaps and ratios for insertions and deletions separately in the methods
    // comparison and in this case don't even need to make the simplifying in = del functional assumption

-- Added a new VEW argument to bind a gold standard track
-- Added two new stratifications: OneBPIndel and TandemRepeat which do exactly what you imagine they do
-- Deleted random unused functions in IndelUtils
This commit is contained in:
Mark DePristo 2012-04-06 16:02:09 -04:00
parent 52ef4a3e26
commit 45fc0ea98d
6 changed files with 194 additions and 64 deletions

View File

@ -116,6 +116,15 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
/**
* Some analyses want to count overlap not with dbSNP (which is in general very open) but
* actually want to itemize their overlap specifically with a set of gold standard sites
* such as HapMap, OMNI, or the gold standard indels. Theis argument provides a mechanism
* for communicating which file to use
*/
@Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false)
public RodBinding<VariantContext> goldStandard = null;
// Help arguments
@Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false)
protected Boolean LIST = false;

View File

@ -56,6 +56,12 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Number of singleton Indels", format = "%d")
public int n_singleton_indels = 0;
@DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d")
public int n_indels_matching_gold_standard = 0;
@DataPoint(description = "Percent of indels overlapping gold standard sites")
public String gold_standard_matching_rate;
// counts 1 for each site where the number of alleles > 2
public int nMultiIndelSites = 0;
@ -71,18 +77,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Indel novelty rate")
public String indel_novelty_rate;
@DataPoint(description = "1 to 2 bp indel ratio")
public String ratio_of_1_to_2_bp_indels;
@DataPoint(description = "1 to 3 bp indel ratio")
public String ratio_of_1_to_3_bp_indels;
@DataPoint(description = "2 to 3 bp indel ratio")
public String ratio_of_2_to_3_bp_indels;
@DataPoint(description = "1 and 2 to 3 bp indel ratio")
public String ratio_of_1_and_2_to_3_bp_indels;
@DataPoint(description = "Frameshift percent")
public String frameshift_rate_for_coding_indels;
@ -92,9 +86,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Insertion to deletion ratio")
public String insertion_to_deletion_ratio;
@DataPoint(description = "Insertion to deletion ratio for 1 bp events")
public String insertion_to_deletion_ratio_for_1bp_indels;
//
// Frameshifts
//
@ -116,9 +107,25 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0;
int nKnownIndels = 0, nInsertions = 0;
int n1bpInsertions = 0, n1bpDeletions = 0;
int[] countByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
// - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a
// downstream frameshift, if we make the simplifying assumptions that 3 bp ins
// and 3bp del (adding/subtracting 1 AA in general) are roughly comparably
// selected against, we should see a consistent 1+2 : 3 bp ratio for insertions
// as for deletions, and certainly would expect consistency between in/dels that
// multiple methods find and in/dels that are unique to one method (since deletions
// are more common and the artifacts differ, it is probably worth looking at the totals,
// overlaps and ratios for insertions and deletions separately in the methods
// comparison and in this case don't even need to make the simplifying in = del functional assumption
@DataPoint(description = "ratio of 1 and 2 bp insertions to 3 bp insertions")
public String ratio_of_1_and_2_to_3_bp_insertions;
@DataPoint(description = "ratio of 1 and 2 bp deletions to 3 bp deletions")
public String ratio_of_1_and_2_to_3_bp_deletions;
public final static int LARGE_INDEL_SIZE_THRESHOLD = 10;
@ -150,11 +157,11 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
}
break;
case INDEL:
final VariantContext gold = getWalker().goldStandard == null ? null : tracker.getFirstValue(getWalker().goldStandard);
if ( eval.isComplexIndel() ) break; // don't count complex substitutions
nIndelSites++;
if ( ! eval.isBiallelic() ) nMultiIndelSites++;
if ( variantWasSingleton(eval) ) n_singleton_indels++;
// collect information about het / hom ratio
for ( final Genotype g : eval.getGenotypes() ) {
@ -164,15 +171,14 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
for ( Allele alt : eval.getAlternateAlleles() ) {
n_indels++; // +1 for each alt allele
if ( variantWasSingleton(eval) ) n_singleton_indels++;
if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific?
if ( gold != null ) n_indels_matching_gold_standard++;
// ins : del ratios
final int alleleSize = alt.length() - eval.getReference().length();
if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference());
if ( alleleSize > 0 ) nInsertions++;
if ( alleleSize == 1 ) n1bpInsertions++;
if ( alleleSize == -1 ) n1bpDeletions++;
// requires snpEFF annotations
if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) {
@ -193,6 +199,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
n_large_deletions++;
// update the baby histogram
final int[] countByLength = alleleSize < 0 ? deletionCountByLength : insertionCountByLength;
final int absSize = Math.abs(alleleSize);
if ( absSize < countByLength.length ) countByLength[absSize]++;
@ -210,18 +217,18 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
percent_of_sites_with_more_than_2_alleles = Utils.formattedRatio(nMultiIndelSites, nIndelSites);
SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels);
SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels);
gold_standard_matching_rate = Utils.formattedNoveltyRate(n_indels_matching_gold_standard, n_indels);
indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels);
ratio_of_1_to_2_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[2]);
ratio_of_1_to_3_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[3]);
ratio_of_2_to_3_bp_indels = Utils.formattedRatio(countByLength[2], countByLength[3]);
ratio_of_1_and_2_to_3_bp_indels = Utils.formattedRatio(countByLength[1] + countByLength[2], countByLength[3]);
frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting);
ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]);
ratio_of_1_and_2_to_3_bp_insertions = Utils.formattedRatio(insertionCountByLength[1] + insertionCountByLength[2], insertionCountByLength[3]);
SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms);
indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms);
insertion_to_deletion_ratio = Utils.formattedRatio(nInsertions, n_indels - nInsertions);
insertion_to_deletion_ratio_for_1bp_indels = Utils.formattedRatio(n1bpInsertions, n1bpDeletions);
insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions);
}

View File

@ -0,0 +1,59 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* Stratifies the eval RODs into sites where the indel is 1 bp in length and those where the event is 2+.
* all non indel events go into all bins, so that SNP counts can be used as contrasts in eval modules.
*/
public class OneBPIndel extends VariantStratifier {
private final static List<Object> ALL = Arrays.asList((Object)"all", (Object)"one.bp", (Object)"two.plus.bp");
private final static List<Object> ONE_BP = Arrays.asList((Object)"all", (Object)"one.bp");
private final static List<Object> TWO_PLUS_BP = Arrays.asList((Object)"all", (Object)"two.plus.bp");
@Override
public void initialize() {
states.addAll(ALL);
}
@Override
public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
if (eval != null && eval.isIndel()) {
for ( int l : eval.getIndelLengths() )
if ( l > 1 )
return TWO_PLUS_BP; // someone is too long
return ONE_BP; // all lengths are one
} else
return ALL;
}
}

View File

@ -0,0 +1,67 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.Arrays;
import java.util.List;
/**
* Stratifies the eval RODs into sites that are tandem repeats
*/
public class TandemRepeat extends VariantStratifier {
private final static List<Object> JUST_ALL = Arrays.asList((Object)"all");
private final static List<Object> ALL = Arrays.asList((Object)"all", (Object)"is.repeat", (Object)"not.repeat");
private final static List<Object> REPEAT = Arrays.asList((Object)"all", (Object)"is.repeat");
private final static List<Object> NOT_REPEAT = Arrays.asList((Object)"all", (Object)"not.repeat");
@Override
public void initialize() {
states.addAll(ALL);
}
@Override
public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
if ( eval == null || ! eval.isIndel() )
return ALL;
else if ( VariantContextUtils.isTandemRepeat(eval, ref.getForwardBases()) ) {
print("REPEAT", eval, ref);
return REPEAT;
} else {
print("NOT A REPEAT", eval, ref);
return NOT_REPEAT;
}
}
private final void print(String prefix, VariantContext eval, ReferenceContext ref) {
// String alleles = ParsingUtils.sortList(eval.getAlleles()).toString();
// this.getVariantEvalWalker().getLogger().info(prefix + ": " + "pos=" + eval.getStart() + " alleles=" + alleles + " ref=" + new String(ref.getForwardBases()));
}
}

View File

@ -224,10 +224,6 @@ public class IndelUtils {
return inds;
}
public static String[] getIndelClassificationNames() {
return COLUMN_KEYS;
}
public static String getIndelClassificationName(int k) {
if (k >=0 && k < COLUMN_KEYS.length)
return COLUMN_KEYS[k];
@ -235,35 +231,6 @@ public class IndelUtils {
throw new ReviewedStingException("Invalid index when trying to get indel classification name");
}
public static boolean isATExpansion(VariantContext vc, ReferenceContext ref) {
ArrayList<Integer> inds = findEventClassificationIndex(vc, ref);
boolean isIt = false;
for (int k : inds) {
if (k == IND_FOR_REPEAT_EXPANSION_A || k == IND_FOR_REPEAT_EXPANSION_T) {
isIt = true;
break;
}
}
return isIt;
}
public static boolean isCGExpansion(VariantContext vc, ReferenceContext ref) {
ArrayList<Integer> inds = findEventClassificationIndex(vc, ref);
boolean isIt = false;
for (int k : inds) {
if (k == IND_FOR_REPEAT_EXPANSION_C || k == IND_FOR_REPEAT_EXPANSION_G) {
isIt = true;
break;
}
}
return isIt;
}
public static boolean isInsideExtendedIndel(VariantContext vc, ReferenceContext ref) {
return (vc.getStart() != ref.getLocus().getStart());
}

View File

@ -302,7 +302,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" +
" --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf";
WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s",
1, Arrays.asList("c8a782f51e094dc7be06dbfb795feab2"));
1, Arrays.asList("4c00cfa0fd343fef62d19af0edeb4f65"));
executeTestParallel("testSelect1", spec);
}
@ -330,7 +330,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
@Test
public void testCompVsEvalAC() {
String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("5c409a2ab4517f862c6678902c0fd7a1"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("4df6654860ad63b7e24e6bc5fbbbcb00"));
executeTestParallel("testCompVsEvalAC",spec);
}
@ -360,7 +360,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --dbsnp " + b37dbSNP132 +
" --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
" -noST -ST Novelty -o %s";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("a27c700eabe6b7b3877c8fd4eabb3975"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3b85cd0fa37539ff51d34e026f26fef2"));
executeTestParallel("testEvalTrackWithoutGenotypes",spec);
}
@ -372,7 +372,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
" --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" +
" -noST -ST Novelty -o %s";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3272a2db627d4f42bc512df49a8ea64b"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("bed8751c773b9568218f78c90f13348a"));
executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec);
}
@ -488,11 +488,32 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-o %s"
),
1,
Arrays.asList("41a37636868a838a632559949c5216cf")
Arrays.asList("9726c0c8f19d271cf680f5f16f0926b3")
);
executeTest("testModernVCFWithLargeIndels", spec);
}
@Test
public void testStandardIndelEval() {
WalkerTestSpec spec = new WalkerTestSpec(
buildCommandLine(
"-T VariantEval",
"-R " + b37KGReference,
"-eval " + validationDataLocation + "/NA12878.HiSeq.WGS.b37_decoy.indel.recalibrated.vcf",
"-L 20",
"-noST -ST Sample -ST OneBPIndel -ST TandemRepeat",
"-noEV -EV IndelSummary -EV IndelLengthHistogram",
"-gold " + validationDataLocation + "/Mills_and_1000G_gold_standard.indels.b37.sites.vcf",
"-D " + b37dbSNP132,
"-o %s"
),
1,
Arrays.asList("c89705147ef4233d5de3a539469bd1d1")
);
executeTest("testStandardIndelEval", spec);
}
@Test()
public void testIncompatibleEvalAndStrat() {
WalkerTestSpec spec = new WalkerTestSpec(