From 2f7892601c1c544796c518e382ffdb44c7184cb2 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Fri, 1 Oct 2010 21:08:55 +0000 Subject: [PATCH] Useful debugging argument added to VariantRecalibrator to only use sites whose qual field is above --qual git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4406 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantcontext/VariantContextUtils.java | 2 +- .../VariantRecalibrator.java | 125 +++++++++--------- 2 files changed, 66 insertions(+), 61 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index a9f716f37..c77e99bab 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -312,7 +312,7 @@ public class VariantContextUtils { String rsID = null; int depth = 0; - // counting the number of filtered and varint VCs + // counting the number of filtered and variant VCs int nFiltered = 0, nVariant = 0; Allele refAllele = determineReferenceAllele(VCs); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 1fc8d7bc5..09f1f85d7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -116,6 +116,9 @@ public class VariantRecalibrator extends RodWalker= QUAL_THRESHOLD ) { + final VariantDatum variantDatum = new VariantDatum(); + variantDatum.isTransition = VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0; - final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); - final VariantContext vcHapMap = tracker.getVariantContext(ref, "hapmap", null, context.getLocation(), false); - final VariantContext vc1KG = tracker.getVariantContext(ref, "1kg", null, context.getLocation(), false); + final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); + final VariantContext vcHapMap = tracker.getVariantContext(ref, "hapmap", null, context.getLocation(), false); + final VariantContext vc1KG = tracker.getVariantContext(ref, "1kg", null, context.getLocation(), false); - variantDatum.isKnown = ( dbsnp != null ); - double knownPrior_qScore = PRIOR_NOVELS; - if( vcHapMap != null || ( !NO_BY_HAPMAP_VALIDATION_STATUS && dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) { - knownPrior_qScore = PRIOR_HAPMAP; - } else if( vc1KG != null ) { - knownPrior_qScore = PRIOR_1KG; - } else if( dbsnp != null ) { - knownPrior_qScore = PRIOR_DBSNP; - } + variantDatum.isKnown = ( dbsnp != null ); + double knownPrior_qScore = PRIOR_NOVELS; + if( vcHapMap != null || ( !NO_BY_HAPMAP_VALIDATION_STATUS && dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) { + knownPrior_qScore = PRIOR_HAPMAP; + } else if( vc1KG != null ) { + knownPrior_qScore = PRIOR_1KG; + } else if( dbsnp != null ) { + knownPrior_qScore = PRIOR_DBSNP; + } - // If we can trust the AC field then use it instead of parsing all the genotypes. Results in a substantial speed up. - final int alleleCount = (trustACField ? Integer.parseInt(vc.getAttribute("AC",-1).toString()) : vc.getChromosomeCount(vc.getAlternateAllele(0))); - if( !trustACField && !NEVER_TRUST_AC_FIELD ) { - if( !vc.getAttributes().containsKey("AC") ) { - NEVER_TRUST_AC_FIELD = true; - } else { - if( alleleCount == Integer.parseInt( vc.getAttribute("AC").toString()) ) { - // The AC field is correct at this record so we trust it forever - trustACField = true; - } else { // We found a record in which the AC field wasn't correct but we are trying to trust it - throw new UserException.BadInput("AC info field doesn't match the variant chromosome count so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." + - "First observed at " + vc.getChr() + ":" + vc.getStart()); + // If we can trust the AC field then use it instead of parsing all the genotypes. Results in a substantial speed up. + final int alleleCount = (trustACField ? Integer.parseInt(vc.getAttribute("AC",-1).toString()) : vc.getChromosomeCount(vc.getAlternateAllele(0))); + if( !trustACField && !NEVER_TRUST_AC_FIELD ) { + if( !vc.getAttributes().containsKey("AC") ) { + NEVER_TRUST_AC_FIELD = true; + } else { + if( alleleCount == Integer.parseInt( vc.getAttribute("AC").toString()) ) { + // The AC field is correct at this record so we trust it forever + trustACField = true; + } else { // We found a record in which the AC field wasn't correct but we are trying to trust it + throw new UserException.BadInput("AC info field doesn't match the variant chromosome count so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." + + "First observed at " + vc.getChr() + ":" + vc.getStart()); + } } } - } - if( trustACField && alleleCount == -1 ) { - throw new UserException.BadInput("AC info field doesn't exist for all records (although it does for some) so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." + - "First observed at " + vc.getChr() + ":" + vc.getStart()); - } - - final Object[] priorKey = new Object[2]; - priorKey[0] = alleleCount; - priorKey[1] = knownPrior_qScore; - - Double priorLodFactor = (Double)priorCache.get( priorKey ); - - // If this prior factor hasn't been calculated before, do so now - if(priorLodFactor == null) { - final double knownPrior = QualityUtils.qualToProb(knownPrior_qScore); - final double acPrior = theModel.getAlleleCountPrior( alleleCount ); - final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior)); - - if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) { - throw new UserException.CommandLineException("Something is wrong with the prior that was entered by the user: Prior = " + totalPrior); // TODO - fix this up later + if( trustACField && alleleCount == -1 ) { + throw new UserException.BadInput("AC info field doesn't exist for all records (although it does for some) so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." + + "First observed at " + vc.getChr() + ":" + vc.getStart()); } - priorLodFactor = Math.log10(totalPrior) - Math.log10(1.0 - totalPrior) - Math.log10(1.0); + final Object[] priorKey = new Object[2]; + priorKey[0] = alleleCount; + priorKey[1] = knownPrior_qScore; - priorCache.put( priorLodFactor, false, priorKey ); + Double priorLodFactor = (Double)priorCache.get( priorKey ); + + // If this prior factor hasn't been calculated before, do so now + if(priorLodFactor == null) { + final double knownPrior = QualityUtils.qualToProb(knownPrior_qScore); + final double acPrior = theModel.getAlleleCountPrior( alleleCount ); + final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior)); + + if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) { + throw new UserException.CommandLineException("Something is wrong with the prior that was entered by the user: Prior = " + totalPrior); // TODO - fix this up later + } + + priorLodFactor = Math.log10(totalPrior) - Math.log10(1.0 - totalPrior) - Math.log10(1.0); + + priorCache.put( priorLodFactor, false, priorKey ); + } + + final double pVar = theModel.evaluateVariant( vc ); + final double lod = priorLodFactor + Math.log10(pVar); + variantDatum.qual = Math.abs( QUALITY_SCALE_FACTOR * QualityUtils.lodToPhredScaleErrorRate(lod) ); + + mapList.add( variantDatum ); + final Map attrs = new HashMap(vc.getAttributes()); + attrs.put("OQ", String.format("%.2f", vc.getPhredScaledQual())); + attrs.put("LOD", String.format("%.4f", lod)); + VariantContext newVC = VariantContext.modifyPErrorFiltersAndAttributes(vc, variantDatum.qual / 10.0, new HashSet(), attrs); + + vcfWriter.add( newVC, ref.getBase() ); } - final double pVar = theModel.evaluateVariant( vc ); - final double lod = priorLodFactor + Math.log10(pVar); - variantDatum.qual = Math.abs( QUALITY_SCALE_FACTOR * QualityUtils.lodToPhredScaleErrorRate(lod) ); - - mapList.add( variantDatum ); - final Map attrs = new HashMap(vc.getAttributes()); - attrs.put("OQ", String.format("%.2f", vc.getPhredScaledQual())); - attrs.put("LOD", String.format("%.4f", lod)); - VariantContext newVC = VariantContext.modifyPErrorFiltersAndAttributes(vc, variantDatum.qual / 10.0, new HashSet(), attrs); - - vcfWriter.add( newVC, ref.getBase() ); - } else { // not a SNP or is filtered so just dump it out to the VCF file vcfWriter.add( vc, ref.getBase() ); }