From 076d21d394ccbfbbad389118295df013591d7cd5 Mon Sep 17 00:00:00 2001 From: depristo Date: Mon, 22 Mar 2010 02:47:09 +0000 Subject: [PATCH] Minor bug workaround in GenotypeConcordance module (see todo). General platform read filter. You can say -rl Platform illumina to remove all SLX reads git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3054 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/filters/PlatformFilter.java | 49 +++++++++++++++++++ .../varianteval2/GenotypeConcordance.java | 17 ++++++- .../varianteval2/VariantEval2Walker.java | 9 ++++ .../org/broadinstitute/sting/utils/Utils.java | 17 ++++++- 4 files changed, 89 insertions(+), 3 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java diff --git a/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java new file mode 100755 index 000000000..d4dff6b68 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; + +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +/** + * Filter out PL matching reads. + * + * @author ebanks + * @version 0.1 + */ +public class PlatformFilter implements SamRecordFilter { + @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false) + protected String[] PLFilterNames; + + public boolean filterOut(SAMRecord rec) { + for ( String name : PLFilterNames ) + if ( Utils.isPlatformRead(rec, name.toUpperCase() )) + return true; + return false; + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java index 702a23452..a5717dd33 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.*; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.contexts.variantcontext.*; import org.broadinstitute.sting.utils.StingException; +import org.apache.log4j.Logger; import java.util.*; @@ -17,6 +18,9 @@ import java.util.*; * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. */ public class GenotypeConcordance extends VariantEvaluator { + protected static Logger logger = Logger.getLogger(GenotypeConcordance.class); + + private static final int MAX_MISSED_VALIDATION_DATA = 10000; private static final int nGenotypeTypes = Genotype.Type.values().length; @@ -118,6 +122,7 @@ public class GenotypeConcordance extends VariantEvaluator { return rows; } + private boolean warnedAboutValidationData = false; public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { String interesting = null; @@ -133,7 +138,17 @@ public class GenotypeConcordance extends VariantEvaluator { determineStats(null, vc); missedValidationData = null; } else { - missedValidationData.add(validation); + // todo -- Eric, this results in a memory problem when eval is WEx data but you are using CG calls genome-wide + // todo -- perhaps you need should extend the evaluators with an initialize + // todo -- method that gets the header (or samples) for the first eval sites? + if ( missedValidationData.size() > MAX_MISSED_VALIDATION_DATA) { + if ( ! warnedAboutValidationData ) { + logger.warn("Too many genotype sites missed before eval site appeared; ignoring"); + warnedAboutValidationData = true; + } + } else { + missedValidationData.add(validation); + } return interesting; } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java index 8c2d67092..395201fae 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java @@ -108,6 +108,10 @@ public class VariantEval2Walker extends RodWalker { @Argument(shortName="outputVCF", fullName="InterestingSitesVCF", doc="If provided, interesting sites emitted to this vcf and the INFO field annotated as to why they are interesting", required=false) protected String outputVCF = null; + private static double NO_MIN_QUAL_SCORE = -1.0; + @Argument(shortName = "Q", fullName="minPhredConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false) + public double minQualScore = NO_MIN_QUAL_SCORE; + /** Right now we will only be looking at SNPS */ EnumSet ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION); @@ -434,6 +438,11 @@ public class VariantEval2Walker extends RodWalker { if ( vc == null ) return true; + if ( minQualScore != NO_MIN_QUAL_SCORE && vc.getNegLog10PError() < (minQualScore / 10.0)) { + //System.out.printf("exclude %s%n", vc); + return false; + } + if ( group.requiresFiltered() && vc.isNotFiltered() ) return false; diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 527e68146..a949648c8 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -123,16 +123,29 @@ public class Utils { return new String(basesAsbytes); } - public static boolean is454Read(SAMRecord read) { + public static boolean isPlatformRead(SAMRecord read, String name) { SAMReadGroupRecord readGroup = read.getReadGroup(); if (readGroup != null) { Object readPlatformAttr = readGroup.getAttribute("PL"); if (readPlatformAttr != null) - return readPlatformAttr.toString().toUpperCase().contains("454"); + return readPlatformAttr.toString().toUpperCase().contains(name); } return false; } + + public static boolean is454Read(SAMRecord read) { + return isPlatformRead(read, "454"); + } + + public static boolean isSOLiDRead(SAMRecord read) { + return isPlatformRead(read, "SOLID"); + } + + public static boolean isSLXRead(SAMRecord read) { + return isPlatformRead(read, "ILLUMINA"); + } + private static final Map readFlagNames = new HashMap();