From 61b5fb82ce093634da5892fd514067eb1924e6ec Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 12 Nov 2009 22:51:49 +0000 Subject: [PATCH] 2 major changes: 1. Add dbsnp RS ID to VCF output from genotyper; to do this I needed to fix the dbsnp rod which did not correctly return this value. 2. Remove AlleleBalanceBacked and instead generalize the arbitrary info fields backing VCFs (and potentially others) in preparation for refactoring VariantFiltration next week. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2028 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/rodDbSNP.java | 15 +++--- .../genotyper/EMGenotypeCalculationModel.java | 6 +++ .../genotyper/GenotypeCalculationModel.java | 20 ++++++++ ...JointEstimateGenotypeCalculationModel.java | 18 +++++-- ...PointEstimateGenotypeCalculationModel.java | 14 +++-- .../walkers/genotyper/UnifiedGenotyper.java | 23 --------- .../walkers/variantstovcf/VariantsToVCF.java | 2 +- .../utils/genotype/AlleleBalanceBacked.java | 35 ------------- .../utils/genotype/ArbitraryFieldsBacked.java | 25 +++++++++ .../sting/utils/genotype/IDBacked.java | 24 +++++++++ .../genotype/vcf/VCFGenotypeLocusData.java | 51 ++++++++----------- .../vcf/VCFGenotypeWriterAdapter.java | 17 +++++-- 12 files changed, 142 insertions(+), 108 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/utils/genotype/AlleleBalanceBacked.java create mode 100755 java/src/org/broadinstitute/sting/utils/genotype/ArbitraryFieldsBacked.java create mode 100755 java/src/org/broadinstitute/sting/utils/genotype/IDBacked.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java index 34a04a02d..10893ce57 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java @@ -27,7 +27,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod // Reference sequence chromosome or scaffold // Start and stop positions in chrom - public String name; // Reference SNP identifier or Affy SNP name + public String RS_ID; // Reference SNP identifier or Affy SNP name public String strand; // Which DNA strand contains the observed alleles public String refBases; // the reference base according to NCBI, in the dbSNP file @@ -206,19 +206,22 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod // formatting // // ---------------------------------------------------------------------- + + public String getRS_ID() { return RS_ID; } + public String toString() { return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", getLocation().getContig(), getLocation().getStart(), getLocation().getStop() + 1, - name, strand, refBases, observed, molType, + RS_ID, strand, refBases, observed, molType, varType, validationStatus, avHet, avHetSE, func, locType, weight); } public String toSimpleString() { - return String.format("%s:%s:%s", name, observed, strand); + return String.format("%s:%s:%s", RS_ID, observed, strand); } public String toMediumString() { - String s = String.format("%s:%s:%s", getLocation().toString(), name, Utils.join("",this.getAlleleList())); + String s = String.format("%s:%s:%s", getLocation().toString(), RS_ID, Utils.join("",this.getAlleleList())); if (isSNP()) s += ":SNP"; if (isIndel()) s += ":Indel"; if (isHapmap()) s += ":Hapmap"; @@ -229,7 +232,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod public String repl() { return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", 585, getLocation().getContig(), getLocation().getStart() - 1, getLocation().getStop(), - name, strand, refBases, refBases, observed, molType, + RS_ID, strand, refBases, refBases, observed, molType, varType, validationStatus, avHet, avHetSE, func, locType, weight); } @@ -240,7 +243,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop - 1)); - name = parts[4]; + RS_ID = parts[4]; strand = parts[6]; refBases = parts[7]; if (strand == "-") diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index c0045d83f..9443e476e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; @@ -52,6 +53,11 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode if ( locusdata instanceof ConfidenceBacked ) { ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence); } + if ( locusdata instanceof IDBacked ) { + rodDbSNP dbsnp = getDbSNP(tracker); + if ( dbsnp != null ) + ((IDBacked)locusdata).setID(dbsnp.getRS_ID()); + } if ( locusdata instanceof SLODBacked ) { // calculate strand score diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index 13d7f3312..6b2be5560 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Pair; @@ -99,6 +100,25 @@ public abstract class GenotypeCalculationModel implements Cloneable { AlignmentContext context, DiploidGenotypePriors priors); + /** + * @param tracker rod data + * + * @return the dbsnp rod if there is one at this position + */ + public static rodDbSNP getDbSNP(RefMetaDataTracker tracker) { + return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); + } + + /** + * Determine whether we're at a Hapmap site + * + * @param tracker the meta data tracker + * + * @return true if we're at a Hapmap site, false if otherwise + */ + private static boolean isHapmapSite(RefMetaDataTracker tracker) { + return tracker.getTrackData("hapmap", null) != null; + } /** * Create the mapping from sample to alignment contexts. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java index a5dedad3c..f9a3f1e0c 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import java.util.*; @@ -49,7 +50,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo // run joint estimation for the full GL contexts initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL); calculateAlleleFrequencyPosteriors(ref, context.getLocation()); - return createCalls(ref, contexts, context.getLocation()); + return createCalls(tracker, ref, contexts, context.getLocation()); } private void initializeAlleleFrequencies(int numSamples) { @@ -275,7 +276,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo verboseWriter.println(); } - private Pair, GenotypeLocusData> createCalls(char ref, HashMap contexts, GenomeLoc loc) { + private Pair, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap contexts, GenomeLoc loc) { // first, find the alt allele with maximum confidence int indexOfMax = 0; char baseOfMax = ref; @@ -333,7 +334,12 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo if ( locusdata instanceof AlleleFrequencyBacked ) { ((AlleleFrequencyBacked)locusdata).setAlleleFrequency(bestAFguess); } - if ( locusdata instanceof AlleleBalanceBacked ) { + if ( locusdata instanceof IDBacked ) { + rodDbSNP dbsnp = getDbSNP(tracker); + if ( dbsnp != null ) + ((IDBacked)locusdata).setID(dbsnp.getRS_ID()); + } + if ( locusdata instanceof ArbitraryFieldsBacked) { ArrayList refBalances = new ArrayList(); ArrayList onOffBalances = new ArrayList(); ArrayList weights = new ArrayList(); @@ -384,8 +390,10 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo normalizedOnOffRatio += weights.get(i) * onOffBalances.get(i); } - ((AlleleBalanceBacked)locusdata).setRefRatio(normalizedRefRatio); - ((AlleleBalanceBacked)locusdata).setOnOffRatio(normalizedOnOffRatio); + HashMap fields = new HashMap(); + fields.put("AB", String.format("%.2f", normalizedRefRatio)); + fields.put("OO", String.format("%.2f", normalizedOnOffRatio)); + ((ArbitraryFieldsBacked)locusdata).setFields(fields); } } if ( locusdata instanceof SLODBacked ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index b6f757964..de125120a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; @@ -90,7 +91,12 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation if ( locusdata instanceof ConfidenceBacked ) { ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence); } - if ( locusdata instanceof AlleleBalanceBacked ) { + if ( locusdata instanceof IDBacked ) { + rodDbSNP dbsnp = getDbSNP(tracker); + if ( dbsnp != null ) + ((IDBacked)locusdata).setID(dbsnp.getRS_ID()); + } + if ( locusdata instanceof ArbitraryFieldsBacked) { DiploidGenotype bestGenotype = DiploidGenotype.values()[bestIndex]; // we care only about het calls @@ -107,8 +113,10 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation totalCount += counts[i]; // set the ratios - ((AlleleBalanceBacked)locusdata).setRefRatio((double)refCount / (double)(refCount + altCount)); - ((AlleleBalanceBacked)locusdata).setOnOffRatio((double)(refCount + altCount) / (double)totalCount); + HashMap fields = new HashMap(); + fields.put("AB", String.format("%.2f", (double)refCount / (double)(refCount + altCount))); + fields.put("OO", String.format("%.2f",(double)(refCount + altCount) / (double)totalCount)); + ((ArbitraryFieldsBacked)locusdata).setFields(fields); } } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 8b7d7ef06..73182923d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.ReadFilters; import org.broadinstitute.sting.utils.BaseUtils; @@ -195,28 +194,6 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL return new AlignmentContext(context.getLocation(), newReads, newOffsets); } - /** - * Determine whether we're at a Hapmap site - * - * @param tracker the meta data tracker - * - * @return true if we're at a Hapmap site, false if otherwise - */ - private static boolean isHapmapSite(RefMetaDataTracker tracker) { - return tracker.getTrackData("hapmap", null) != null; - } - - /** - * Determine whether we're at a dbSNP site - * - * @param tracker the meta data tracker - * - * @return true if we're at a dbSNP site, false if otherwise - */ - private static boolean isDbSNPSite(RefMetaDataTracker tracker) { - return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)) != null; - } - public Integer reduceInit() { return 0; } public Integer reduce(Pair, GenotypeLocusData> value, Integer sum) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java index da976cee6..d471017ab 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java @@ -182,7 +182,7 @@ public class VariantsToVCF extends RefWalker { return new VCFRecord(ref.getBase(), context.getContig(), (int) context.getPosition(), - (dbsnp == null) ? "." : dbsnp.name, + (dbsnp == null) ? "." : dbsnp.getRS_ID(), alts, snpQual > 99 ? 99 : (int) snpQual, "0", diff --git a/java/src/org/broadinstitute/sting/utils/genotype/AlleleBalanceBacked.java b/java/src/org/broadinstitute/sting/utils/genotype/AlleleBalanceBacked.java deleted file mode 100755 index 404c01857..000000000 --- a/java/src/org/broadinstitute/sting/utils/genotype/AlleleBalanceBacked.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.broadinstitute.sting.utils.genotype; - -/** - * @author ebanks - * Interface AlleleBalanceBacked - * - * this interface indicates that the genotype is - * backed up by allele balance information. - */ -public interface AlleleBalanceBacked { - - /** - * - * @return returns the ratio of ref/(ref+alt) bases for hets - */ - public double getRefRatio(); - - /** - * - * @param ratio the ref/(ref+alt) ratio - */ - public void setRefRatio(double ratio); - - /** - * - * @return returns the ratio of (ref+alt)/total bases for hets - */ - public double getOnOffRatio(); - - /** - * - * @param ratio the (ref+alt)/total ratio - */ - public void setOnOffRatio(double ratio); -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/genotype/ArbitraryFieldsBacked.java b/java/src/org/broadinstitute/sting/utils/genotype/ArbitraryFieldsBacked.java new file mode 100755 index 000000000..df57a35b2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/ArbitraryFieldsBacked.java @@ -0,0 +1,25 @@ +package org.broadinstitute.sting.utils.genotype; + +import java.util.Map; + +/** + * @author ebanks + * Interface AlleleBalanceBacked + * + * this interface indicates that the genotype is + * backed up by allele balance information. + */ +public interface ArbitraryFieldsBacked { + + /** + * + * @return returns the list of arbitrary fields to emit + */ + public Map getFields(); + + /** + * + * @param fields a list of arbitrary fields to emit + */ + public void setFields(Map fields); +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/genotype/IDBacked.java b/java/src/org/broadinstitute/sting/utils/genotype/IDBacked.java new file mode 100755 index 000000000..084925fe2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/IDBacked.java @@ -0,0 +1,24 @@ +package org.broadinstitute.sting.utils.genotype; + +/** + * @author ebanks + * Interface IDBacked + * + * this interface indicates that the genotype is + * backed up by sample information. + */ +public interface IDBacked { + + /** + * + * @return returns the ID for this genotype + */ + public String getID(); + + /** + * + * @param id the ID for this genotype + */ + public void setID(String id); + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeLocusData.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeLocusData.java index 4fb9e02f1..dcf869936 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeLocusData.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeLocusData.java @@ -3,6 +3,9 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotype.*; +import java.util.HashMap; +import java.util.Map; + /** * @author ebanks *

@@ -10,7 +13,7 @@ import org.broadinstitute.sting.utils.genotype.*; *

* represents the meta data for a genotype object. */ -public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, AlleleFrequencyBacked, AlleleBalanceBacked { +public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, IDBacked, AlleleFrequencyBacked, ArbitraryFieldsBacked { // the discovery lod score private double mConfidence = 0.0; @@ -27,9 +30,11 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked // the ref base private char mRefBase; - // the various allele ratios - private double mRefRatio = 0.0; - private double mOnOffRatio = 0.0; + // the id + private String mID; + + // the various info field values + private Map mInfoFields; /** * create a basic genotype meta data pbject, given the following fields @@ -111,38 +116,24 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked } /** - * @return returns true if the ref/(ref+alt) ratio for this genotype has been set + * @return returns the dbsnp id for this genotype */ - public boolean hasRefRatio() { - return mRefRatio > 0.0; + public String getID() { + return mID; + } + + public void setID(String id) { + mID = id; } /** - * @return returns the ref/(ref+alt) ratio for this genotype + * @return returns te arbitrary info fields */ - public double getRefRatio() { - return mRefRatio; + public Map getFields() { + return mInfoFields; } - public void setRefRatio(double ratio) { - mRefRatio = ratio; - } - - /** - * @return returns true if the (ref+alt)/total ratio for this genotype has been set - */ - public boolean hasOnOffRatio() { - return mOnOffRatio > 0.0; - } - - /** - * @return returns the (ref+alt)/total ratio for this genotype - */ - public double getOnOffRatio() { - return mOnOffRatio; - } - - public void setOnOffRatio(double ratio) { - mOnOffRatio = ratio; + public void setFields(Map fields) { + mInfoFields = new HashMap(fields); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java index baeb2b89f..4794fb5f7 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -148,17 +148,24 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header"); } + // info fields Map infoFields = getInfoFields((VCFGenotypeLocusData)locusdata, params); infoFields.put("DP", String.format("%d", totalReadDepth)); + // q-score double qual = (locusdata == null) ? 0 : ((VCFGenotypeLocusData)locusdata).getConfidence(); // min Q-score is zero qual = Math.max(qual, 0); + // dbsnp id + String dbSnpID = null; + if ( locusdata != null ) + dbSnpID = ((VCFGenotypeLocusData)locusdata).getID(); + VCFRecord vcfRecord = new VCFRecord(params.getReferenceBase(), params.getContig(), params.getPosition(), - ".", + (dbSnpID == null ? "." : dbSnpID), params.getAlternateBases(), qual, "0", @@ -182,10 +189,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter { if ( locusdata != null ) { infoFields.put("SB", String.format("%.2f", locusdata.getSLOD())); infoFields.put("AF", String.format("%.2f", locusdata.getAlleleFrequency())); - if ( locusdata.hasRefRatio() ) - infoFields.put("AB", String.format("%.2f", locusdata.getRefRatio())); - if ( locusdata.hasOnOffRatio() ) - infoFields.put("OO", String.format("%.2f", locusdata.getOnOffRatio())); + Map otherFields = locusdata.getFields(); + if ( otherFields != null ) { + infoFields.putAll(otherFields); + } } infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size())); return infoFields;