Support for validationRate calculation in variant eval 2; better error messages for failed genome loc parsing; tolerance to odd whitespace in plinkrod, and fix for monomorphic sites in vcf2variantcontext.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2976 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c85ed1ce90
commit
486bef9318
|
|
@ -478,11 +478,11 @@ class PlinkVariantInfo implements Comparable {
|
|||
if ( pieces.length < 2 )
|
||||
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)");
|
||||
|
||||
String chrom = pieces[0];
|
||||
String chrom = pieces[0].trim();
|
||||
if ( pieces[1].charAt(0) != 'p' )
|
||||
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)");
|
||||
|
||||
String pos = pieces[1].substring(1);
|
||||
String pos = pieces[1].substring(1).trim();
|
||||
loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
|
||||
|
||||
if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) {
|
||||
|
|
|
|||
|
|
@ -146,7 +146,7 @@ public class VariantContextAdaptors {
|
|||
|
||||
|
||||
private static VariantContext vcfToVariantContext(String name, VCFRecord vcf, Allele refAllele) {
|
||||
if ( vcf.isSNP() || vcf.isIndel() ) {
|
||||
if ( vcf.isReference() || vcf.isSNP() || vcf.isIndel() ) {
|
||||
// add the reference allele
|
||||
if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) {
|
||||
System.out.printf("Excluding vcf record %s%n", vcf);
|
||||
|
|
@ -165,7 +165,10 @@ public class VariantContextAdaptors {
|
|||
//System.out.printf("Excluding vcf record %s%n", vcf);
|
||||
return null;
|
||||
}
|
||||
alleles.add(new Allele(alt, false));
|
||||
|
||||
Allele allele = new Allele(alt, false);
|
||||
if ( ! allele.isNoCall() )
|
||||
alleles.add(allele);
|
||||
}
|
||||
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,102 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.varianteval2;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class ValidationRate extends VariantEvaluator {
|
||||
class SiteStats {
|
||||
long nPoly = 0, nMono = 0, nNoCall = 0;
|
||||
|
||||
double polyPercent() { return rate(nPoly, nPoly + nMono + nNoCall); }
|
||||
}
|
||||
|
||||
private SiteStats validationStats = new SiteStats();
|
||||
private SiteStats evalOverlapAtMono = new SiteStats();
|
||||
private SiteStats evalOverlapAtPoly = new SiteStats();
|
||||
|
||||
public ValidationRate(VariantEval2Walker parent) {
|
||||
// don't do anything
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "validationRate";
|
||||
}
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 2; // we need to see each eval track and each comp track
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getName() + ": " + summaryLine();
|
||||
}
|
||||
|
||||
private String summaryLine() {
|
||||
return String.format("%d %d %.2f %d %d %d %.2f %d %d %d %.2f",
|
||||
validationStats.nMono, validationStats.nPoly, validationStats.polyPercent(),
|
||||
evalOverlapAtMono.nMono, evalOverlapAtMono.nPoly, evalOverlapAtMono.nNoCall, evalOverlapAtMono.polyPercent(),
|
||||
evalOverlapAtPoly.nMono, evalOverlapAtPoly.nPoly, evalOverlapAtPoly.nNoCall, evalOverlapAtPoly.polyPercent());
|
||||
}
|
||||
|
||||
private static List<String> HEADER =
|
||||
Arrays.asList("n_mono_in_comp", "n_poly_in_comp", "percent_poly_in_comp",
|
||||
"n_mono_calls_at_mono_sites", "n_poly_calls_at_mono_sites", "n_nocalls_at_mono_sites", "percent_mono_sites_called_poly",
|
||||
"n_mono_calls_at_poly_sites", "n_poly_calls_at_poly_sites", "n_nocalls_at_poly_sites", "percent_poly_sites_called_poly");
|
||||
|
||||
// making it a table
|
||||
public List<String> getTableHeader() {
|
||||
return HEADER;
|
||||
}
|
||||
|
||||
public boolean enabled() { return true; }
|
||||
|
||||
public List<List<String>> getTableRows() {
|
||||
return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
|
||||
}
|
||||
|
||||
public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
String interesting = null;
|
||||
|
||||
if ( validation != null && validation.hasGenotypes() && validation.isNotFiltered() ) {
|
||||
SiteStats overlap = null;
|
||||
|
||||
if ( validation.isPolymorphic() ) {
|
||||
validationStats.nPoly++;
|
||||
overlap = evalOverlapAtPoly;
|
||||
if ( eval == null || eval.isMonomorphic() )
|
||||
interesting = "ValidationStatus=FN";
|
||||
}
|
||||
else {
|
||||
validationStats.nMono++;
|
||||
overlap = evalOverlapAtMono;
|
||||
|
||||
if ( eval != null && eval.isPolymorphic() )
|
||||
interesting = "ValidationStatus=FP";
|
||||
}
|
||||
|
||||
if ( eval == null )
|
||||
overlap.nNoCall++;
|
||||
else if ( eval.isPolymorphic() )
|
||||
overlap.nPoly++;
|
||||
else
|
||||
overlap.nMono++;
|
||||
}
|
||||
|
||||
return interesting; // we don't capture any interesting sites
|
||||
}
|
||||
}
|
||||
|
|
@ -123,7 +123,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
|
|||
protected String outputVCF = null;
|
||||
|
||||
/** Right now we will only be looking at SNPS */
|
||||
EnumSet<VariantContext.Type> ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP);
|
||||
EnumSet<VariantContext.Type> ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION);
|
||||
|
||||
@Argument(shortName="rsID", fullName="rsID", doc="If provided, list of rsID and build number for capping known snps by their build date", required=false)
|
||||
protected String rsIDFile = null;
|
||||
|
|
@ -366,7 +366,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
|
|||
for ( EvaluationContext group : contexts ) {
|
||||
VariantContext vc = vcs.get(group.evalTrackName);
|
||||
|
||||
//logger.debug(String.format("Updating %s of %s with variant", group.name, vc));
|
||||
//logger.debug(String.format("Updating %s with variant", vc));
|
||||
Set<VariantEvaluator> evaluations = group.evaluations;
|
||||
boolean evalWantsVC = applyVCtoEvaluation(vc, vcs, group);
|
||||
List<String> interestingReasons = new ArrayList<String>();
|
||||
|
|
|
|||
|
|
@ -163,27 +163,34 @@ public class GenomeLocParser {
|
|||
|
||||
Matcher match = mPattern.matcher(str);
|
||||
try {
|
||||
if (match.matches() && match.groupCount() == 4) {
|
||||
if (match.group(1) != null) contig = match.group(1);
|
||||
if (match.group(2) != null) start = parsePosition(match.group(2));
|
||||
if ((match.group(3) != null && match.group(3).equals("+")) || // chr:1+
|
||||
(match.group(3) == null && match.group(4) == null && match.group(2) == null)) // chr1
|
||||
stop = Integer.MAX_VALUE;
|
||||
else if (match.group(3) != null && match.group(3).equals("-")) // chr1:1-1
|
||||
stop = parsePosition(match.group(4));
|
||||
else if (match.group(3) == null && match.group(4) == null) // chr1:1
|
||||
stop = start;
|
||||
else {
|
||||
bad = true;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (match.matches() && match.groupCount() == 4) {
|
||||
if (match.group(1) != null) contig = match.group(1);
|
||||
if (match.group(2) != null) start = parsePosition(match.group(2));
|
||||
if ((match.group(3) != null && match.group(3).equals("+")) || // chr:1+
|
||||
(match.group(3) == null && match.group(4) == null && match.group(2) == null)) // chr1
|
||||
stop = Integer.MAX_VALUE;
|
||||
else if (match.group(3) != null && match.group(3).equals("-")) // chr1:1-1
|
||||
stop = parsePosition(match.group(4));
|
||||
else if (match.group(3) == null && match.group(4) == null) // chr1:1
|
||||
stop = start;
|
||||
else {
|
||||
bad = true;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
bad = true;
|
||||
}
|
||||
|
||||
if (bad || start < 0 || stop < 0 || contig == null)
|
||||
throw new StingException("Invalid Genome Location string: " + str);
|
||||
|
||||
if (bad)
|
||||
throw new StingException("Failed to parse Genome Location string: " + str);
|
||||
if (start < 0)
|
||||
throw new StingException("Invalid Genome Location start < 0: " + str + ' ' + start);
|
||||
if (stop < 0)
|
||||
throw new StingException("Invalid Genome Location stop < 0: " + str + ' ' + stop);
|
||||
if (contig == null)
|
||||
throw new StingException("Invalid Genome Location contig == null : " + str);
|
||||
|
||||
|
||||
if (!isContigValid(contig))
|
||||
throw new StingException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference.");
|
||||
|
||||
|
|
|
|||
|
|
@ -20,8 +20,8 @@ public class VariantEval2IntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testVE2Simple() {
|
||||
HashMap<String, String> expectations = new HashMap<String, String>();
|
||||
expectations.put("-L 1:1-10,000,000", "d58a2a22e5fb3a3d8d90ba02de37f62b");
|
||||
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "03cddae2afbe0d1a8f5e3490aebc7c9c");
|
||||
expectations.put("-L 1:1-10,000,000", "d83605861576db9bc0d50d5c11b67a90");
|
||||
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "be922212c7cb8f8070158dab86949c4b");
|
||||
|
||||
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
|
||||
String extraArgs = entry.getKey();
|
||||
|
|
@ -41,10 +41,10 @@ public class VariantEval2IntegrationTest extends WalkerTest {
|
|||
" -B dbsnp_130,dbSNP," + GATKDataLocation + "dbsnp_130_b36.rod" +
|
||||
" -B comp_hapmap,VCF," + validationDataLocation + "CEU_hapmap_nogt_23.vcf";
|
||||
|
||||
String eqMD5s = "1606e285d9bd586dc6662b1ace0a3a0e"; // next two examples should be the same!
|
||||
String eqMD5s = "f03913dacc5e9938a0aa00f1e5a031de"; // next two examples should be the same!
|
||||
expectations.put("", eqMD5s);
|
||||
expectations.put(" -known comp_hapmap -known dbsnp", eqMD5s);
|
||||
expectations.put(" -known comp_hapmap", "44773a96d1c5904a57e0e983836768e4");
|
||||
expectations.put(" -known comp_hapmap", "0f886e7042c3999c3d87f848e0b58eb8");
|
||||
|
||||
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
|
||||
String extraArgs2 = entry.getKey();
|
||||
|
|
@ -62,7 +62,7 @@ public class VariantEval2IntegrationTest extends WalkerTest {
|
|||
String extraArgs = "-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 30";
|
||||
WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s -outputVCF %s",
|
||||
2,
|
||||
Arrays.asList("1f2e04f8af061b7190758679a7840f12", "a3ce1d70d8ae3874807e9d61994d42af"));
|
||||
Arrays.asList("8162dcf2539e8241fb27cba2055631bf", "a3ce1d70d8ae3874807e9d61994d42af"));
|
||||
executeTest("testVE2WriteVCF", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue