Useful debugging argument added to VariantRecalibrator to only use sites whose qual field is above --qual

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4406 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-10-01 21:08:55 +00:00
parent 575c38fc04
commit 2f7892601c
2 changed files with 66 additions and 61 deletions

View File

@ -312,7 +312,7 @@ public class VariantContextUtils {
String rsID = null;
int depth = 0;
// counting the number of filtered and varint VCs
// counting the number of filtered and variant VCs
int nFiltered = 0, nVariant = 0;
Allele refAllele = determineReferenceAllele(VCs);

View File

@ -116,6 +116,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Hidden
@Argument(fullName = "NoByHapMapValidationStatus", shortName = "NoByHapMapValidationStatus", doc = "Don't consider sites in dbsnp rod tagged as by-hapmap validation status as real HapMap sites. FOR DEBUGGING PURPOSES ONLY.", required=false)
private Boolean NO_BY_HAPMAP_VALIDATION_STATUS = false;
@Hidden
@Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
private double QUAL_THRESHOLD = 0.0;
/////////////////////////////
// Private Member Variables
@ -217,77 +220,79 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
if( vc != null && vc.isSNP() ) {
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
final VariantDatum variantDatum = new VariantDatum();
variantDatum.isTransition = VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
if( vc.getPhredScaledQual() >= QUAL_THRESHOLD ) {
final VariantDatum variantDatum = new VariantDatum();
variantDatum.isTransition = VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
final VariantContext vcHapMap = tracker.getVariantContext(ref, "hapmap", null, context.getLocation(), false);
final VariantContext vc1KG = tracker.getVariantContext(ref, "1kg", null, context.getLocation(), false);
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
final VariantContext vcHapMap = tracker.getVariantContext(ref, "hapmap", null, context.getLocation(), false);
final VariantContext vc1KG = tracker.getVariantContext(ref, "1kg", null, context.getLocation(), false);
variantDatum.isKnown = ( dbsnp != null );
double knownPrior_qScore = PRIOR_NOVELS;
if( vcHapMap != null || ( !NO_BY_HAPMAP_VALIDATION_STATUS && dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) {
knownPrior_qScore = PRIOR_HAPMAP;
} else if( vc1KG != null ) {
knownPrior_qScore = PRIOR_1KG;
} else if( dbsnp != null ) {
knownPrior_qScore = PRIOR_DBSNP;
}
variantDatum.isKnown = ( dbsnp != null );
double knownPrior_qScore = PRIOR_NOVELS;
if( vcHapMap != null || ( !NO_BY_HAPMAP_VALIDATION_STATUS && dbsnp != null && DbSNPHelper.isHapmap(dbsnp) ) ) {
knownPrior_qScore = PRIOR_HAPMAP;
} else if( vc1KG != null ) {
knownPrior_qScore = PRIOR_1KG;
} else if( dbsnp != null ) {
knownPrior_qScore = PRIOR_DBSNP;
}
// If we can trust the AC field then use it instead of parsing all the genotypes. Results in a substantial speed up.
final int alleleCount = (trustACField ? Integer.parseInt(vc.getAttribute("AC",-1).toString()) : vc.getChromosomeCount(vc.getAlternateAllele(0)));
if( !trustACField && !NEVER_TRUST_AC_FIELD ) {
if( !vc.getAttributes().containsKey("AC") ) {
NEVER_TRUST_AC_FIELD = true;
} else {
if( alleleCount == Integer.parseInt( vc.getAttribute("AC").toString()) ) {
// The AC field is correct at this record so we trust it forever
trustACField = true;
} else { // We found a record in which the AC field wasn't correct but we are trying to trust it
throw new UserException.BadInput("AC info field doesn't match the variant chromosome count so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
"First observed at " + vc.getChr() + ":" + vc.getStart());
// If we can trust the AC field then use it instead of parsing all the genotypes. Results in a substantial speed up.
final int alleleCount = (trustACField ? Integer.parseInt(vc.getAttribute("AC",-1).toString()) : vc.getChromosomeCount(vc.getAlternateAllele(0)));
if( !trustACField && !NEVER_TRUST_AC_FIELD ) {
if( !vc.getAttributes().containsKey("AC") ) {
NEVER_TRUST_AC_FIELD = true;
} else {
if( alleleCount == Integer.parseInt( vc.getAttribute("AC").toString()) ) {
// The AC field is correct at this record so we trust it forever
trustACField = true;
} else { // We found a record in which the AC field wasn't correct but we are trying to trust it
throw new UserException.BadInput("AC info field doesn't match the variant chromosome count so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
"First observed at " + vc.getChr() + ":" + vc.getStart());
}
}
}
}
if( trustACField && alleleCount == -1 ) {
throw new UserException.BadInput("AC info field doesn't exist for all records (although it does for some) so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
"First observed at " + vc.getChr() + ":" + vc.getStart());
}
final Object[] priorKey = new Object[2];
priorKey[0] = alleleCount;
priorKey[1] = knownPrior_qScore;
Double priorLodFactor = (Double)priorCache.get( priorKey );
// If this prior factor hasn't been calculated before, do so now
if(priorLodFactor == null) {
final double knownPrior = QualityUtils.qualToProb(knownPrior_qScore);
final double acPrior = theModel.getAlleleCountPrior( alleleCount );
final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));
if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {
throw new UserException.CommandLineException("Something is wrong with the prior that was entered by the user: Prior = " + totalPrior); // TODO - fix this up later
if( trustACField && alleleCount == -1 ) {
throw new UserException.BadInput("AC info field doesn't exist for all records (although it does for some) so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
"First observed at " + vc.getChr() + ":" + vc.getStart());
}
priorLodFactor = Math.log10(totalPrior) - Math.log10(1.0 - totalPrior) - Math.log10(1.0);
final Object[] priorKey = new Object[2];
priorKey[0] = alleleCount;
priorKey[1] = knownPrior_qScore;
priorCache.put( priorLodFactor, false, priorKey );
Double priorLodFactor = (Double)priorCache.get( priorKey );
// If this prior factor hasn't been calculated before, do so now
if(priorLodFactor == null) {
final double knownPrior = QualityUtils.qualToProb(knownPrior_qScore);
final double acPrior = theModel.getAlleleCountPrior( alleleCount );
final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));
if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {
throw new UserException.CommandLineException("Something is wrong with the prior that was entered by the user: Prior = " + totalPrior); // TODO - fix this up later
}
priorLodFactor = Math.log10(totalPrior) - Math.log10(1.0 - totalPrior) - Math.log10(1.0);
priorCache.put( priorLodFactor, false, priorKey );
}
final double pVar = theModel.evaluateVariant( vc );
final double lod = priorLodFactor + Math.log10(pVar);
variantDatum.qual = Math.abs( QUALITY_SCALE_FACTOR * QualityUtils.lodToPhredScaleErrorRate(lod) );
mapList.add( variantDatum );
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
attrs.put("OQ", String.format("%.2f", vc.getPhredScaledQual()));
attrs.put("LOD", String.format("%.4f", lod));
VariantContext newVC = VariantContext.modifyPErrorFiltersAndAttributes(vc, variantDatum.qual / 10.0, new HashSet<String>(), attrs);
vcfWriter.add( newVC, ref.getBase() );
}
final double pVar = theModel.evaluateVariant( vc );
final double lod = priorLodFactor + Math.log10(pVar);
variantDatum.qual = Math.abs( QUALITY_SCALE_FACTOR * QualityUtils.lodToPhredScaleErrorRate(lod) );
mapList.add( variantDatum );
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
attrs.put("OQ", String.format("%.2f", vc.getPhredScaledQual()));
attrs.put("LOD", String.format("%.4f", lod));
VariantContext newVC = VariantContext.modifyPErrorFiltersAndAttributes(vc, variantDatum.qual / 10.0, new HashSet<String>(), attrs);
vcfWriter.add( newVC, ref.getBase() );
} else { // not a SNP or is filtered so just dump it out to the VCF file
vcfWriter.add( vc, ref.getBase() );
}