Now outputs fully spec-compliant VCF with proper annotations. Emits statistics as to number of good/bad records.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2931 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-03-04 21:28:17 +00:00
parent 54f04dc541
commit c20d3e567e
2 changed files with 97 additions and 86 deletions

View File

@ -4,6 +4,7 @@ import java.util.*;
import org.apache.commons.jexl.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.HardyWeinbergCalculation;
public class VariantContextUtils {
/**
@ -15,8 +16,8 @@ public class VariantContextUtils {
/**
* Create a new matcher expression with name and JEXL expression exp
* @param name
* @param exp
* @param name name
* @param exp expression
*/
public JexlVCMatchExp(String name, Expression exp) {
this.name = name;
@ -30,9 +31,9 @@ public class VariantContextUtils {
* a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to
* match() below.
*
* @param names
* @param exps
* @return
* @param names names
* @param exps expressions
* @return list of matches
*/
public static List<JexlVCMatchExp> initializeMatchExps(String[] names, String[] exps) {
if ( names == null || exps == null )
@ -53,8 +54,8 @@ public class VariantContextUtils {
* a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to
* match() below.
*
* @param names_and_exps
* @return
* @param names_and_exps mapping of names to expressions
* @return list of matches
*/
public static List<JexlVCMatchExp> initializeMatchExps(Map<String, String> names_and_exps) {
List<JexlVCMatchExp> exps = new ArrayList<JexlVCMatchExp>();
@ -77,9 +78,9 @@ public class VariantContextUtils {
/**
* Returns true if exp match VC. See collection<> version for full docs.
* @param vc
* @param exp
* @return
* @param vc variant context
* @param exp expression
* @return true if there is a match
*/
public static boolean match(VariantContext vc, JexlVCMatchExp exp) {
return match(vc,Arrays.asList(exp)).get(exp);
@ -91,9 +92,9 @@ public class VariantContextUtils {
* expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp
* expressions.
*
* @param vc
* @param exps
* @return
* @param vc variant context
* @param exps expressions
* @return true if there is a match
*/
public static Map<JexlVCMatchExp, Boolean> match(VariantContext vc, Collection<JexlVCMatchExp> exps) {
// todo -- actually, we should implement a JEXL context interface to VariantContext,
@ -245,4 +246,10 @@ public class VariantContextUtils {
// }
// return contexts;
// }
public static double computeHardyWeinbergPvalue(VariantContext vc) {
return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
}
}

View File

@ -3,8 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
import org.broadinstitute.sting.gatk.walkers.annotator.HardyWeinberg;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.PlinkRod;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
@ -26,33 +26,35 @@ import java.util.*;
*/
@Reference(window=@Window(start=0,stop=40))
public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
@Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write to", required=true)
public File vcfFile = null;
@Argument(fullName="numberOfSamples", shortName="ns", doc="The number of samples sequenced", required=false)
public int nSamples = 192; // The number of samples in the ped file I wrote this tool for
@Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write results", required=true)
protected File vcfFile = null;
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false)
protected double maxHardy = 20.0;
@Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid [default:0.05]", required=false)
protected double maxNoCall = 0.05;
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false)
protected double maxHomNonref = 1.1;
@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+
"used for smart Hardy-Weinberg annotation",required = false)
public File popFile = null;
@Argument(fullName="useb36ContigNames",shortName="b36contig",doc="Uses b36 contig names (1:1,000,000) rather than hg18 (chr1:1,000,000) for comparison with ref", required=false)
public boolean useb36contigs=false;
@Argument(fullName="maxHardy", doc="Maximum Hardy-Weinberg score to consider an assay valid", required=false)
public double maxHardy = 10;
@Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a proportion) to consider an assay valid", required=false)
public double maxNoCall = 0.05;
@Argument(fullName="maxHomNonref", doc="Maximum homozygous-nonreference rate (as a proportion) to consider an assay valid", required = false)
public double maxHomNonref = 1.1;
// max allowable indel size (based on ref window)
private static final int MAX_INDEL_SIZE = 40;
private final int INIT_NUMBER_OF_POPULATIONS = 10;
private ArrayList<String> sampleNames = new ArrayList<String>(nSamples);
// the VCF writer
private VCFWriter vcfWriter = null;
private final HardyWeinberg HWCalc = new HardyWeinberg();
private final boolean useSmartHardy = popFile != null;
// statistics
private int numRecords = 0;
private int numHWViolations = 0;
private int numNoCallViolations = 0;
private int numHomVarViolations = 0;
private HashMap<String,String> samplesToPopulation;
public void initialize() {
if ( useSmartHardy ) {
if ( popFile != null ) {
samplesToPopulation = parsePopulationFile(popFile);
}
}
@ -89,15 +91,23 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
private void initializeWriter(PlinkRod plinkRod) {
vcfWriter = new VCFWriter(vcfFile);
// set up the info and filter headers
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.add(new VCFHeaderLine("source", "PlinkToVCF"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
hInfo.add(new VCFInfoHeaderLine("NoCallPct", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Percent of no-calls"));
hInfo.add(new VCFInfoHeaderLine("HomRefPct", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Percent of homozygous reference genotypes"));
hInfo.add(new VCFInfoHeaderLine("HetPct", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Percent of heterozygous genotypes"));
hInfo.add(new VCFInfoHeaderLine("HomVarPct", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Percent homozygous variant genotypes"));
hInfo.add(new VCFInfoHeaderLine("HW", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Phred-scaled Hardy-Weinberg violation p-value"));
hInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_COUNT_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"));
hInfo.add(new VCFInfoHeaderLine(VCFRecord.ALLELE_NUMBER_KEY, 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Total number of alleles in called genotypes"));
hInfo.add(new VCFFilterHeaderLine("HardyWeinbergViolation", "The validation is in Hardy-Weinberg violation"));
hInfo.add(new VCFFilterHeaderLine("HighNoCallRate", "The validation no-call rate is too high"));
hInfo.add(new VCFFilterHeaderLine("TooManyHomVars", "The validation homozygous variant rate is too high"));
VCFHeader header = new VCFHeader(hInfo, new TreeSet<String>(plinkRod.getSampleNames()));
vcfWriter.writeHeader(header);
nSamples = sampleNames.size();
}
public Integer reduce(VCFRecord call, Integer numVariants) {
@ -109,9 +119,14 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
}
public void onTraversalDone(Integer finalReduce) {
logger.info("Variants processed="+finalReduce.toString());
if ( vcfWriter != null )
vcfWriter.close();
System.out.println(String.format("Total number of records processed:\t\t%d", numRecords));
System.out.println(String.format("Number of Hardy-Weinberg violations:\t\t%d (%d%%)", numHWViolations, 100*numHWViolations/numRecords));
System.out.println(String.format("Number of no-call violations:\t\t\t%d (%d%%)", numNoCallViolations, 100*numNoCallViolations/numRecords));
System.out.println(String.format("Number of homozygous variant violations:\t%d (%d%%)", numHomVarViolations, 100*numHomVarViolations/numRecords));
int goodRecords = numRecords - numHWViolations - numNoCallViolations - numHomVarViolations;
System.out.println(String.format("Number of records passing all filters:\t\t%d (%d%%)", goodRecords, 100*goodRecords/numRecords));
}
@ -135,80 +150,69 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase());
record.setGenotypeFormatString("GT");
if ( true )
return record;
int numHetCalls = vContext.getHetCount();
// check possible filters
double hwPvalue = hardyWeinbergCalculation(vContext);
double hwScore = Math.abs(QualityUtils.phredScaleErrorRate(hwPvalue));
double noCallProp = (double)vContext.getNoCallCount() / (double)vContext.getNSamples();
double homRefProp = (double)vContext.getHomRefCount() / (double)vContext.getNSamples();
double hetProp = (double)vContext.getHetCount() / (double)vContext.getNSamples();
double homVarProp = (double)vContext.getHomVarCount() / (double)vContext.getNSamples();
String hw = hardyWeinbergCalculation(ref,record);
double hwScore = hw != null ? Double.valueOf(hw) : -0.0;
// TODO -- record.addInfoFields(generateInfoField(record, numNoCalls,numHomVarCalls,numNonrefAlleles,ref, plinkRod, hwScore));
if ( hwScore > maxHardy ) {
record.setFilterString("Hardy-Weinberg");
record.setFilterString("HardyWeinbergViolation");
numHWViolations++;
} else if ( noCallProp > maxNoCall ) {
record.setFilterString("No-calls");
record.setFilterString("HighNoCallRate");
numNoCallViolations++;
} else if ( homVarProp > maxHomNonref) {
record.setFilterString("HomNonref-calls");
record.setFilterString("TooManyHomVars");
numHomVarViolations++;
}
numRecords++;
// add the info fields
HashMap<String, String> infoMap = new HashMap<String,String>(5);
infoMap.put("NoCallPct", String.format("%.1f", 100.0*noCallProp));
infoMap.put("HomRefPct", String.format("%.1f", 100.0*homRefProp));
infoMap.put("HomVarPct", String.format("%.1f", 100.0*homVarProp));
infoMap.put("HetPct", String.format("%.1f", 100.0*hetProp));
infoMap.put("HW", String.format("%.2f", hwScore));
Set<Allele> altAlleles = vContext.getAlternateAlleles();
int altAlleleCount = altAlleles.size() == 0 ? 0 : vContext.getChromosomeCount(altAlleles.iterator().next());
infoMap.put(VCFRecord.ALLELE_COUNT_KEY, String.format("%d", altAlleleCount));
infoMap.put(VCFRecord.ALLELE_NUMBER_KEY, String.format("%d", vContext.getChromosomeCount()));
record.addInfoFields(infoMap);
return record;
}
private String hardyWeinbergCalculation(ReferenceContext ref, VCFRecord rec) {
if ( useSmartHardy ) {
return smartHardy(ref, rec);
private double hardyWeinbergCalculation(VariantContext vc) {
if ( popFile != null ) {
throw new StingException("We still need to implement this!");
} else {
VCFVariationCall variant = new VCFVariationCall(ref.getBase(),ref.getLocus(),VCFVariationCall.VARIANT_TYPE.SNP);
variant.setGenotypeCalls(rec.getGenotypes());
return HWCalc.annotate(null, ref, null, variant);
return VariantContextUtils.computeHardyWeinbergPvalue(vc);
}
}
private Map<String,String> generateInfoField(VCFRecord rec, int nocall, int homnonref, int allnonref,
ReferenceContext ref, PlinkRod info, double hwScore) {
double propNoCall = ( ( double ) nocall / (double) nSamples );
double propHomNR = ( (double) homnonref / (double) nSamples );
HashMap<String,String> infoMap = new HashMap<String,String>(1);
putInfoStrings(infoMap,propNoCall,propHomNR,allnonref,hwScore,info.getName());
return infoMap;
}
private void putInfoStrings(HashMap<String,String> infoMap, double pnc, double phnr, int nra, double hw, String nm) {
infoMap.put("snpID",nm);
infoMap.put("noCallPct",String.format("%.2f",100.0*pnc));
infoMap.put("homNonrefPct",String.format("%.2f",100.0*phnr));
infoMap.put("nonrefAlleles",String.format("%d",nra));
infoMap.put("HW",String.format("%.2f",hw));
//return String.format("snpID=%s;nocall=%f;homNonref=%4f;numNonrefAlleles=%d;HW=%s",nm,pnc,phnr,nra,hw);
}
private String smartHardy(ReferenceContext ref, VCFRecord rec) {
HashMap<String,ArrayList<Genotype>> genotypesByPopulation = new HashMap<String,ArrayList<Genotype>>(INIT_NUMBER_OF_POPULATIONS);
HashMap<String,String> hardyWeinbergByPopulation = new HashMap<String,String>(INIT_NUMBER_OF_POPULATIONS);
HashMap<String,ArrayList<Genotype>> genotypesByPopulation = new HashMap<String,ArrayList<Genotype>>(10);
HashMap<String,String> hardyWeinbergByPopulation = new HashMap<String,String>(10);
for ( String population : samplesToPopulation.values() ) {
genotypesByPopulation.put(population,new ArrayList<Genotype>());
}
for ( String name : sampleNames ) {
String pop = samplesToPopulation.get(name);
if ( rec.getGenotype(name) != null ) {
// TODO -- genotypesByPopulation.get(pop).add(rec.getGenotype(name));
}
}
//for ( String name : sampleNames ) {
// String pop = samplesToPopulation.get(name);
// if ( rec.getGenotype(name) != null ) {
// genotypesByPopulation.get(pop).add(rec.getGenotype(name));
// }
//}
for ( String population : samplesToPopulation.values() ) {
VCFVariationCall v = new VCFVariationCall(ref.getBase(),ref.getLocus(),VCFVariationCall.VARIANT_TYPE.SNP);
// TODO -- v.setGenotypeCalls(genotypesByPopulation.get(population));
hardyWeinbergByPopulation.put(population,HWCalc.annotate(null,ref,null,v));
// TODO -- hardyWeinbergByPopulation.put(population,HWCalc.annotate(null,ref,null,v));
}
return smartHardyString(hardyWeinbergByPopulation);