Support for validationRate calculation in variant eval 2; better error messages for failed genome loc parsing; tolerance to odd whitespace in plinkrod; and a fix for monomorphic sites in vcf2variantcontext.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2976 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-03-10 16:25:16 +00:00
parent c85ed1ce90
commit 486bef9318
6 changed files with 141 additions and 29 deletions

View File

@ -478,11 +478,11 @@ class PlinkVariantInfo implements Comparable {
if ( pieces.length < 2 )
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)");
String chrom = pieces[0];
String chrom = pieces[0].trim();
if ( pieces[1].charAt(0) != 'p' )
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)");
String pos = pieces[1].substring(1);
String pos = pieces[1].substring(1).trim();
loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) {

View File

@ -146,7 +146,7 @@ public class VariantContextAdaptors {
private static VariantContext vcfToVariantContext(String name, VCFRecord vcf, Allele refAllele) {
if ( vcf.isSNP() || vcf.isIndel() ) {
if ( vcf.isReference() || vcf.isSNP() || vcf.isIndel() ) {
// add the reference allele
if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) {
System.out.printf("Excluding vcf record %s%n", vcf);
@ -165,7 +165,10 @@ public class VariantContextAdaptors {
//System.out.printf("Excluding vcf record %s%n", vcf);
return null;
}
alleles.add(new Allele(alt, false));
Allele allele = new Allele(alt, false);
if ( ! allele.isNoCall() )
alleles.add(allele);
}
Map<String, Genotype> genotypes = new HashMap<String, Genotype>();

View File

@ -0,0 +1,102 @@
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval2;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import java.util.ArrayList;
import java.util.List;
import java.util.Arrays;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
public class ValidationRate extends VariantEvaluator {
    /**
     * Tallies of site classifications: polymorphic, monomorphic, and no-call.
     * Used both for the validation (comp) track itself and for the eval calls
     * overlapping each class of validation site.
     */
    class SiteStats {
        long nPoly = 0, nMono = 0, nNoCall = 0;

        // Percentage of counted sites that are polymorphic; rate() is inherited
        // from VariantEvaluator.
        double polyPercent() { return rate(nPoly, nPoly + nMono + nNoCall); }
    }

    private SiteStats validationStats = new SiteStats();   // counts over the validation track
    private SiteStats evalOverlapAtMono = new SiteStats(); // eval calls at monomorphic validation sites
    private SiteStats evalOverlapAtPoly = new SiteStats(); // eval calls at polymorphic validation sites

    public ValidationRate(VariantEval2Walker parent) {
        // don't do anything
    }

    public String getName() {
        return "validationRate";
    }

    public int getComparisonOrder() {
        return 2; // we need to see each eval track and each comp track
    }

    public String toString() {
        return String.format("%s: %s", getName(), summaryLine());
    }

    /** One space-separated line of all counts and percentages, in HEADER order. */
    private String summaryLine() {
        return String.format("%d %d %.2f %d %d %d %.2f %d %d %d %.2f",
                validationStats.nMono, validationStats.nPoly, validationStats.polyPercent(),
                evalOverlapAtMono.nMono, evalOverlapAtMono.nPoly, evalOverlapAtMono.nNoCall, evalOverlapAtMono.polyPercent(),
                evalOverlapAtPoly.nMono, evalOverlapAtPoly.nPoly, evalOverlapAtPoly.nNoCall, evalOverlapAtPoly.polyPercent());
    }

    private static List<String> HEADER =
            Arrays.asList("n_mono_in_comp", "n_poly_in_comp", "percent_poly_in_comp",
                    "n_mono_calls_at_mono_sites", "n_poly_calls_at_mono_sites", "n_nocalls_at_mono_sites", "percent_mono_sites_called_poly",
                    "n_mono_calls_at_poly_sites", "n_poly_calls_at_poly_sites", "n_nocalls_at_poly_sites", "percent_poly_sites_called_poly");

    // making it a table
    public List<String> getTableHeader() {
        return HEADER;
    }

    public boolean enabled() { return true; }

    public List<List<String>> getTableRows() {
        List<String> row = Arrays.asList(summaryLine().split(" "));
        return Arrays.asList(row);
    }

    /**
     * Classifies the validation site as poly/mono and tallies how the eval call
     * agrees with it. Returns a "ValidationStatus=FN"/"ValidationStatus=FP"
     * annotation for discordant sites, or null otherwise.
     */
    public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        // Only genotyped, unfiltered validation sites contribute.
        if ( validation == null || ! validation.hasGenotypes() || ! validation.isNotFiltered() )
            return null;

        String interesting = null;
        SiteStats overlap;

        if ( validation.isPolymorphic() ) {
            validationStats.nPoly++;
            overlap = evalOverlapAtPoly;
            // poly in validation but not called poly in eval => false negative
            if ( eval == null || eval.isMonomorphic() )
                interesting = "ValidationStatus=FN";
        } else {
            validationStats.nMono++;
            overlap = evalOverlapAtMono;
            // mono in validation but called poly in eval => false positive
            if ( eval != null && eval.isPolymorphic() )
                interesting = "ValidationStatus=FP";
        }

        if ( eval == null )
            overlap.nNoCall++;
        else if ( eval.isPolymorphic() )
            overlap.nPoly++;
        else
            overlap.nMono++;

        return interesting;
    }
}

View File

@ -123,7 +123,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
protected String outputVCF = null;
/** Right now we will only be looking at SNPS */
EnumSet<VariantContext.Type> ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP);
EnumSet<VariantContext.Type> ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION);
@Argument(shortName="rsID", fullName="rsID", doc="If provided, list of rsID and build number for capping known snps by their build date", required=false)
protected String rsIDFile = null;
@ -366,7 +366,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
for ( EvaluationContext group : contexts ) {
VariantContext vc = vcs.get(group.evalTrackName);
//logger.debug(String.format("Updating %s of %s with variant", group.name, vc));
//logger.debug(String.format("Updating %s with variant", vc));
Set<VariantEvaluator> evaluations = group.evaluations;
boolean evalWantsVC = applyVCtoEvaluation(vc, vcs, group);
List<String> interestingReasons = new ArrayList<String>();

View File

@ -163,27 +163,34 @@ public class GenomeLocParser {
Matcher match = mPattern.matcher(str);
try {
if (match.matches() && match.groupCount() == 4) {
if (match.group(1) != null) contig = match.group(1);
if (match.group(2) != null) start = parsePosition(match.group(2));
if ((match.group(3) != null && match.group(3).equals("+")) || // chr:1+
(match.group(3) == null && match.group(4) == null && match.group(2) == null)) // chr1
stop = Integer.MAX_VALUE;
else if (match.group(3) != null && match.group(3).equals("-")) // chr1:1-1
stop = parsePosition(match.group(4));
else if (match.group(3) == null && match.group(4) == null) // chr1:1
stop = start;
else {
bad = true;
}
}
} catch (Exception e) {
if (match.matches() && match.groupCount() == 4) {
if (match.group(1) != null) contig = match.group(1);
if (match.group(2) != null) start = parsePosition(match.group(2));
if ((match.group(3) != null && match.group(3).equals("+")) || // chr:1+
(match.group(3) == null && match.group(4) == null && match.group(2) == null)) // chr1
stop = Integer.MAX_VALUE;
else if (match.group(3) != null && match.group(3).equals("-")) // chr1:1-1
stop = parsePosition(match.group(4));
else if (match.group(3) == null && match.group(4) == null) // chr1:1
stop = start;
else {
bad = true;
}
}
} catch (Exception e) {
bad = true;
}
if (bad || start < 0 || stop < 0 || contig == null)
throw new StingException("Invalid Genome Location string: " + str);
if (bad)
throw new StingException("Failed to parse Genome Location string: " + str);
if (start < 0)
throw new StingException("Invalid Genome Location start < 0: " + str + ' ' + start);
if (stop < 0)
throw new StingException("Invalid Genome Location stop < 0: " + str + ' ' + stop);
if (contig == null)
throw new StingException("Invalid Genome Location contig == null : " + str);
if (!isContigValid(contig))
throw new StingException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference.");

View File

@ -20,8 +20,8 @@ public class VariantEval2IntegrationTest extends WalkerTest {
@Test
public void testVE2Simple() {
HashMap<String, String> expectations = new HashMap<String, String>();
expectations.put("-L 1:1-10,000,000", "d58a2a22e5fb3a3d8d90ba02de37f62b");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "03cddae2afbe0d1a8f5e3490aebc7c9c");
expectations.put("-L 1:1-10,000,000", "d83605861576db9bc0d50d5c11b67a90");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "be922212c7cb8f8070158dab86949c4b");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs = entry.getKey();
@ -41,10 +41,10 @@ public class VariantEval2IntegrationTest extends WalkerTest {
" -B dbsnp_130,dbSNP," + GATKDataLocation + "dbsnp_130_b36.rod" +
" -B comp_hapmap,VCF," + validationDataLocation + "CEU_hapmap_nogt_23.vcf";
String eqMD5s = "1606e285d9bd586dc6662b1ace0a3a0e"; // next two examples should be the same!
String eqMD5s = "f03913dacc5e9938a0aa00f1e5a031de"; // next two examples should be the same!
expectations.put("", eqMD5s);
expectations.put(" -known comp_hapmap -known dbsnp", eqMD5s);
expectations.put(" -known comp_hapmap", "44773a96d1c5904a57e0e983836768e4");
expectations.put(" -known comp_hapmap", "0f886e7042c3999c3d87f848e0b58eb8");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs2 = entry.getKey();
@ -62,7 +62,7 @@ public class VariantEval2IntegrationTest extends WalkerTest {
String extraArgs = "-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 30";
WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s -outputVCF %s",
2,
Arrays.asList("1f2e04f8af061b7190758679a7840f12", "a3ce1d70d8ae3874807e9d61994d42af"));
Arrays.asList("8162dcf2539e8241fb27cba2055631bf", "a3ce1d70d8ae3874807e9d61994d42af"));
executeTest("testVE2WriteVCF", spec);
}
}