From c20d3e567e9ddd75f53a334fc0827ee5654a0588 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 4 Mar 2010 21:28:17 +0000 Subject: [PATCH] Now outputs fully spec-compliant VCF with proper annotations. Emits statistics as to number of good/bad records. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2931 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantcontext/VariantContextUtils.java | 33 ++-- .../walkers/variantstovcf/PlinkToVCF.java | 150 +++++++++--------- 2 files changed, 97 insertions(+), 86 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index 9e8520a27..e3abd7547 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -4,6 +4,7 @@ import java.util.*; import org.apache.commons.jexl.*; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotype.HardyWeinbergCalculation; public class VariantContextUtils { /** @@ -15,8 +16,8 @@ public class VariantContextUtils { /** * Create a new matcher expression with name and JEXL expression exp - * @param name - * @param exp + * @param name name + * @param exp expression */ public JexlVCMatchExp(String name, Expression exp) { this.name = name; @@ -30,9 +31,9 @@ public class VariantContextUtils { * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to * match() below. * - * @param names - * @param exps - * @return + * @param names names + * @param exps expressions + * @return list of matches */ public static List initializeMatchExps(String[] names, String[] exps) { if ( names == null || exps == null ) @@ -53,8 +54,8 @@ public class VariantContextUtils { * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to * match() below. * - * @param names_and_exps - * @return + * @param names_and_exps mapping of names to expressions + * @return list of matches */ public static List initializeMatchExps(Map names_and_exps) { List exps = new ArrayList(); @@ -77,9 +78,9 @@ public class VariantContextUtils { /** * Returns true if exp match VC. See collection<> version for full docs. - * @param vc - * @param exp - * @return + * @param vc variant context + * @param exp expression + * @return true if there is a match */ public static boolean match(VariantContext vc, JexlVCMatchExp exp) { return match(vc,Arrays.asList(exp)).get(exp); @@ -91,9 +92,9 @@ public class VariantContextUtils { * expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp * expressions. * - * @param vc - * @param exps - * @return + * @param vc variant context + * @param exps expressions + * @return true if there is a match */ public static Map match(VariantContext vc, Collection exps) { // todo -- actually, we should implement a JEXL context interface to VariantContext, @@ -245,4 +246,10 @@ public class VariantContextUtils { // } // return contexts; // } + + + public static double computeHardyWeinbergPvalue(VariantContext vc) { + return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java index 69ad0cdff..420c6068b 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/PlinkToVCF.java @@ -3,8 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.*; -import org.broadinstitute.sting.gatk.walkers.annotator.HardyWeinberg; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.PlinkRod; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; @@ -26,33 +26,35 @@ import java.util.*; */ @Reference(window=@Window(start=0,stop=40)) public class PlinkToVCF extends RodWalker { - @Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write to", required=true) - public File vcfFile = null; - @Argument(fullName="numberOfSamples", shortName="ns", doc="The number of samples sequenced", required=false) - public int nSamples = 192; // The number of samples in the ped file I wrote this tool for + @Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write results", required=true) + protected File vcfFile = null; + @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false) + protected double maxHardy = 20.0; + @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid [default:0.05]", required=false) + protected double maxNoCall = 0.05; + @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false) + protected double maxHomNonref = 1.1; + @Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+ "used for smart Hardy-Weinberg annotation",required = false) public File popFile = null; - @Argument(fullName="useb36ContigNames",shortName="b36contig",doc="Uses b36 contig names (1:1,000,000) rather than hg18 (chr1:1,000,000) for comparison with ref", required=false) - public boolean useb36contigs=false; - @Argument(fullName="maxHardy", doc="Maximum Hardy-Weinberg score to consider an assay valid", required=false) - public double maxHardy = 10; - @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a proportion) to consider an assay valid", required=false) - public double maxNoCall = 0.05; - @Argument(fullName="maxHomNonref", doc="Maximum homozygous-nonreference rate (as a proportion) to consider an assay valid", required = false) - public double maxHomNonref = 1.1; + // max allowable indel size (based on ref window) private static final int MAX_INDEL_SIZE = 40; - - private final int INIT_NUMBER_OF_POPULATIONS = 10; - private ArrayList sampleNames = new ArrayList(nSamples); + + // the VCF writer private VCFWriter vcfWriter = null; - private final HardyWeinberg HWCalc = new HardyWeinberg(); - private final boolean useSmartHardy = popFile != null; + + // statistics + private int numRecords = 0; + private int numHWViolations = 0; + private int numNoCallViolations = 0; + private int numHomVarViolations = 0; + private HashMap samplesToPopulation; public void initialize() { - if ( useSmartHardy ) { + if ( popFile != null ) { samplesToPopulation = parsePopulationFile(popFile); } } @@ -89,15 +91,23 @@ public class PlinkToVCF extends RodWalker { private void initializeWriter(PlinkRod plinkRod) { vcfWriter = new VCFWriter(vcfFile); + // set up the info and filter headers Set hInfo = new HashSet(); hInfo.add(new VCFHeaderLine("source", "PlinkToVCF")); hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + hInfo.add(new VCFInfoHeaderLine("NoCallPct", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Percent of no-calls")); + hInfo.add(new VCFInfoHeaderLine("HomRefPct", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Percent of homozygous reference genotypes")); + hInfo.add(new VCFInfoHeaderLine("HetPct", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Percent of heterozygous genotypes")); + hInfo.add(new VCFInfoHeaderLine("HomVarPct", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Percent homozygous variant genotypes")); + hInfo.add(new VCFInfoHeaderLine("HW", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Phred-scaled Hardy-Weinberg violation p-value")); + hInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_COUNT_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); + hInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_NUMBER_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Total number of alleles in called genotypes")); + hInfo.add(new VCFFilterHeaderLine("HardyWeinbergViolation", "The validation is in Hardy-Weinberg violation")); + hInfo.add(new VCFFilterHeaderLine("HighNoCallRate", "The validation no-call rate is too high")); + hInfo.add(new VCFFilterHeaderLine("TooManyHomVars", "The validation homozygous variant rate is too high")); VCFHeader header = new VCFHeader(hInfo, new TreeSet(plinkRod.getSampleNames())); vcfWriter.writeHeader(header); - - nSamples = sampleNames.size(); - } public Integer reduce(VCFRecord call, Integer numVariants) { @@ -109,9 +119,14 @@ public class PlinkToVCF extends RodWalker { } public void onTraversalDone(Integer finalReduce) { - logger.info("Variants processed="+finalReduce.toString()); if ( vcfWriter != null ) vcfWriter.close(); + System.out.println(String.format("Total number of records processed:\t\t%d", numRecords)); + System.out.println(String.format("Number of Hardy-Weinberg violations:\t\t%d (%d%%)", numHWViolations, 100*numHWViolations/numRecords)); + System.out.println(String.format("Number of no-call violations:\t\t\t%d (%d%%)", numNoCallViolations, 100*numNoCallViolations/numRecords)); + System.out.println(String.format("Number of homozygous variant violations:\t%d (%d%%)", numHomVarViolations, 100*numHomVarViolations/numRecords)); + int goodRecords = numRecords - numHWViolations - numNoCallViolations - numHomVarViolations; + System.out.println(String.format("Number of records passing all filters:\t\t%d (%d%%)", goodRecords, 100*goodRecords/numRecords)); } @@ -135,80 +150,69 @@ public class PlinkToVCF extends RodWalker { VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase()); record.setGenotypeFormatString("GT"); - if ( true ) - return record; - - int numHetCalls = vContext.getHetCount(); - + // check possible filters + double hwPvalue = hardyWeinbergCalculation(vContext); + double hwScore = Math.abs(QualityUtils.phredScaleErrorRate(hwPvalue)); double noCallProp = (double)vContext.getNoCallCount() / (double)vContext.getNSamples(); + double homRefProp = (double)vContext.getHomRefCount() / (double)vContext.getNSamples(); + double hetProp = (double)vContext.getHetCount() / (double)vContext.getNSamples(); double homVarProp = (double)vContext.getHomVarCount() / (double)vContext.getNSamples(); - String hw = hardyWeinbergCalculation(ref,record); - double hwScore = hw != null ? Double.valueOf(hw) : -0.0; - // TODO -- record.addInfoFields(generateInfoField(record, numNoCalls,numHomVarCalls,numNonrefAlleles,ref, plinkRod, hwScore)); if ( hwScore > maxHardy ) { - record.setFilterString("Hardy-Weinberg"); + record.setFilterString("HardyWeinbergViolation"); + numHWViolations++; } else if ( noCallProp > maxNoCall ) { - record.setFilterString("No-calls"); + record.setFilterString("HighNoCallRate"); + numNoCallViolations++; } else if ( homVarProp > maxHomNonref) { - record.setFilterString("HomNonref-calls"); + record.setFilterString("TooManyHomVars"); + numHomVarViolations++; } - + numRecords++; + + // add the info fields + HashMap infoMap = new HashMap(5); + infoMap.put("NoCallPct", String.format("%.1f", 100.0*noCallProp)); + infoMap.put("HomRefPct", String.format("%.1f", 100.0*homRefProp)); + infoMap.put("HomVarPct", String.format("%.1f", 100.0*homVarProp)); + infoMap.put("HetPct", String.format("%.1f", 100.0*hetProp)); + infoMap.put("HW", String.format("%.2f", hwScore)); + Set altAlleles = vContext.getAlternateAlleles(); + int altAlleleCount = altAlleles.size() == 0 ? 0 : vContext.getChromosomeCount(altAlleles.iterator().next()); + infoMap.put(VCFRecord.ALLELE_COUNT_KEY, String.format("%d", altAlleleCount)); + infoMap.put(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount())); + record.addInfoFields(infoMap); + return record; - } - private String hardyWeinbergCalculation(ReferenceContext ref, VCFRecord rec) { - if ( useSmartHardy ) { - return smartHardy(ref, rec); + private double hardyWeinbergCalculation(VariantContext vc) { + if ( popFile != null ) { + throw new StingException("We still need to implement this!"); } else { - VCFVariationCall variant = new VCFVariationCall(ref.getBase(),ref.getLocus(),VCFVariationCall.VARIANT_TYPE.SNP); - variant.setGenotypeCalls(rec.getGenotypes()); - return HWCalc.annotate(null, ref, null, variant); + return VariantContextUtils.computeHardyWeinbergPvalue(vc); } } - private Map generateInfoField(VCFRecord rec, int nocall, int homnonref, int allnonref, - ReferenceContext ref, PlinkRod info, double hwScore) { - double propNoCall = ( ( double ) nocall / (double) nSamples ); - double propHomNR = ( (double) homnonref / (double) nSamples ); - HashMap infoMap = new HashMap(1); - putInfoStrings(infoMap,propNoCall,propHomNR,allnonref,hwScore,info.getName()); - - return infoMap; - } - - private void putInfoStrings(HashMap infoMap, double pnc, double phnr, int nra, double hw, String nm) { - - infoMap.put("snpID",nm); - infoMap.put("noCallPct",String.format("%.2f",100.0*pnc)); - infoMap.put("homNonrefPct",String.format("%.2f",100.0*phnr)); - infoMap.put("nonrefAlleles",String.format("%d",nra)); - infoMap.put("HW",String.format("%.2f",hw)); - - //return String.format("snpID=%s;nocall=%f;homNonref=%4f;numNonrefAlleles=%d;HW=%s",nm,pnc,phnr,nra,hw); - - } - private String smartHardy(ReferenceContext ref, VCFRecord rec) { - HashMap> genotypesByPopulation = new HashMap>(INIT_NUMBER_OF_POPULATIONS); - HashMap hardyWeinbergByPopulation = new HashMap(INIT_NUMBER_OF_POPULATIONS); + HashMap> genotypesByPopulation = new HashMap>(10); + HashMap hardyWeinbergByPopulation = new HashMap(10); for ( String population : samplesToPopulation.values() ) { genotypesByPopulation.put(population,new ArrayList()); } - for ( String name : sampleNames ) { - String pop = samplesToPopulation.get(name); - if ( rec.getGenotype(name) != null ) { - // TODO -- genotypesByPopulation.get(pop).add(rec.getGenotype(name)); - } - } + //for ( String name : sampleNames ) { + // String pop = samplesToPopulation.get(name); + // if ( rec.getGenotype(name) != null ) { + // genotypesByPopulation.get(pop).add(rec.getGenotype(name)); + // } + //} for ( String population : samplesToPopulation.values() ) { VCFVariationCall v = new VCFVariationCall(ref.getBase(),ref.getLocus(),VCFVariationCall.VARIANT_TYPE.SNP); // TODO -- v.setGenotypeCalls(genotypesByPopulation.get(population)); - hardyWeinbergByPopulation.put(population,HWCalc.annotate(null,ref,null,v)); + // TODO -- hardyWeinbergByPopulation.put(population,HWCalc.annotate(null,ref,null,v)); } return smartHardyString(hardyWeinbergByPopulation);