2 major changes:

1. Add dbsnp RS ID to VCF output from genotyper; to do this I needed to fix the dbsnp rod which did not correctly return this value.

2. Remove AlleleBalanceBacked and instead generalize the arbitrary info fields backing VCFs (and potentially others) in preparation for refactoring VariantFiltration next week.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2028 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-11-12 22:51:49 +00:00
parent 3742a05760
commit 61b5fb82ce
12 changed files with 142 additions and 108 deletions

View File

@ -27,7 +27,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
// Reference sequence chromosome or scaffold // Reference sequence chromosome or scaffold
// Start and stop positions in chrom // Start and stop positions in chrom
public String name; // Reference SNP identifier or Affy SNP name public String RS_ID; // Reference SNP identifier or Affy SNP name
public String strand; // Which DNA strand contains the observed alleles public String strand; // Which DNA strand contains the observed alleles
public String refBases; // the reference base according to NCBI, in the dbSNP file public String refBases; // the reference base according to NCBI, in the dbSNP file
@ -206,19 +206,22 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
// formatting // formatting
// //
// ---------------------------------------------------------------------- // ----------------------------------------------------------------------
public String getRS_ID() { return RS_ID; }
public String toString() { public String toString() {
return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
getLocation().getContig(), getLocation().getStart(), getLocation().getStop() + 1, getLocation().getContig(), getLocation().getStart(), getLocation().getStop() + 1,
name, strand, refBases, observed, molType, RS_ID, strand, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight); varType, validationStatus, avHet, avHetSE, func, locType, weight);
} }
public String toSimpleString() { public String toSimpleString() {
return String.format("%s:%s:%s", name, observed, strand); return String.format("%s:%s:%s", RS_ID, observed, strand);
} }
public String toMediumString() { public String toMediumString() {
String s = String.format("%s:%s:%s", getLocation().toString(), name, Utils.join("",this.getAlleleList())); String s = String.format("%s:%s:%s", getLocation().toString(), RS_ID, Utils.join("",this.getAlleleList()));
if (isSNP()) s += ":SNP"; if (isSNP()) s += ":SNP";
if (isIndel()) s += ":Indel"; if (isIndel()) s += ":Indel";
if (isHapmap()) s += ":Hapmap"; if (isHapmap()) s += ":Hapmap";
@ -229,7 +232,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
public String repl() { public String repl() {
return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
585, getLocation().getContig(), getLocation().getStart() - 1, getLocation().getStop(), 585, getLocation().getContig(), getLocation().getStart() - 1, getLocation().getStop(),
name, strand, refBases, refBases, observed, molType, RS_ID, strand, refBases, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight); varType, validationStatus, avHet, avHetSE, func, locType, weight);
} }
@ -240,7 +243,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop - 1)); loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop - 1));
name = parts[4]; RS_ID = parts[4];
strand = parts[6]; strand = parts[6];
refBases = parts[7]; refBases = parts[7];
if (strand == "-") if (strand == "-")

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.utils.genotype.*;
@ -52,6 +53,11 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
if ( locusdata instanceof ConfidenceBacked ) { if ( locusdata instanceof ConfidenceBacked ) {
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence); ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
} }
if ( locusdata instanceof IDBacked ) {
rodDbSNP dbsnp = getDbSNP(tracker);
if ( dbsnp != null )
((IDBacked)locusdata).setID(dbsnp.getRS_ID());
}
if ( locusdata instanceof SLODBacked ) { if ( locusdata instanceof SLODBacked ) {
// calculate strand score // calculate strand score

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.Pair;
@ -99,6 +100,25 @@ public abstract class GenotypeCalculationModel implements Cloneable {
AlignmentContext context, AlignmentContext context,
DiploidGenotypePriors priors); DiploidGenotypePriors priors);
/**
* @param tracker rod data
*
* @return the dbsnp rod if there is one at this position
*/
public static rodDbSNP getDbSNP(RefMetaDataTracker tracker) {
return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
}
/**
* Determine whether we're at a Hapmap site
*
* @param tracker the meta data tracker
*
* @return true if we're at a Hapmap site, false if otherwise
*/
private static boolean isHapmapSite(RefMetaDataTracker tracker) {
return tracker.getTrackData("hapmap", null) != null;
}
/** /**
* Create the mapping from sample to alignment contexts. * Create the mapping from sample to alignment contexts.

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import java.util.*; import java.util.*;
@ -49,7 +50,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
// run joint estimation for the full GL contexts // run joint estimation for the full GL contexts
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL); initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
calculateAlleleFrequencyPosteriors(ref, context.getLocation()); calculateAlleleFrequencyPosteriors(ref, context.getLocation());
return createCalls(ref, contexts, context.getLocation()); return createCalls(tracker, ref, contexts, context.getLocation());
} }
private void initializeAlleleFrequencies(int numSamples) { private void initializeAlleleFrequencies(int numSamples) {
@ -275,7 +276,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
verboseWriter.println(); verboseWriter.println();
} }
private Pair<List<Genotype>, GenotypeLocusData> createCalls(char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) { private Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
// first, find the alt allele with maximum confidence // first, find the alt allele with maximum confidence
int indexOfMax = 0; int indexOfMax = 0;
char baseOfMax = ref; char baseOfMax = ref;
@ -333,7 +334,12 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
if ( locusdata instanceof AlleleFrequencyBacked ) { if ( locusdata instanceof AlleleFrequencyBacked ) {
((AlleleFrequencyBacked)locusdata).setAlleleFrequency(bestAFguess); ((AlleleFrequencyBacked)locusdata).setAlleleFrequency(bestAFguess);
} }
if ( locusdata instanceof AlleleBalanceBacked ) { if ( locusdata instanceof IDBacked ) {
rodDbSNP dbsnp = getDbSNP(tracker);
if ( dbsnp != null )
((IDBacked)locusdata).setID(dbsnp.getRS_ID());
}
if ( locusdata instanceof ArbitraryFieldsBacked) {
ArrayList<Double> refBalances = new ArrayList<Double>(); ArrayList<Double> refBalances = new ArrayList<Double>();
ArrayList<Double> onOffBalances = new ArrayList<Double>(); ArrayList<Double> onOffBalances = new ArrayList<Double>();
ArrayList<Double> weights = new ArrayList<Double>(); ArrayList<Double> weights = new ArrayList<Double>();
@ -384,8 +390,10 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
normalizedOnOffRatio += weights.get(i) * onOffBalances.get(i); normalizedOnOffRatio += weights.get(i) * onOffBalances.get(i);
} }
((AlleleBalanceBacked)locusdata).setRefRatio(normalizedRefRatio); HashMap<String, String> fields = new HashMap<String, String>();
((AlleleBalanceBacked)locusdata).setOnOffRatio(normalizedOnOffRatio); fields.put("AB", String.format("%.2f", normalizedRefRatio));
fields.put("OO", String.format("%.2f", normalizedOnOffRatio));
((ArbitraryFieldsBacked)locusdata).setFields(fields);
} }
} }
if ( locusdata instanceof SLODBacked ) { if ( locusdata instanceof SLODBacked ) {

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.utils.genotype.*;
@ -90,7 +91,12 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
if ( locusdata instanceof ConfidenceBacked ) { if ( locusdata instanceof ConfidenceBacked ) {
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence); ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
} }
if ( locusdata instanceof AlleleBalanceBacked ) { if ( locusdata instanceof IDBacked ) {
rodDbSNP dbsnp = getDbSNP(tracker);
if ( dbsnp != null )
((IDBacked)locusdata).setID(dbsnp.getRS_ID());
}
if ( locusdata instanceof ArbitraryFieldsBacked) {
DiploidGenotype bestGenotype = DiploidGenotype.values()[bestIndex]; DiploidGenotype bestGenotype = DiploidGenotype.values()[bestIndex];
// we care only about het calls // we care only about het calls
@ -107,8 +113,10 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
totalCount += counts[i]; totalCount += counts[i];
// set the ratios // set the ratios
((AlleleBalanceBacked)locusdata).setRefRatio((double)refCount / (double)(refCount + altCount)); HashMap<String, String> fields = new HashMap<String, String>();
((AlleleBalanceBacked)locusdata).setOnOffRatio((double)(refCount + altCount) / (double)totalCount); fields.put("AB", String.format("%.2f", (double)refCount / (double)(refCount + altCount)));
fields.put("OO", String.format("%.2f",(double)(refCount + altCount) / (double)totalCount));
((ArbitraryFieldsBacked)locusdata).setFields(fields);
} }
} }
} }

View File

@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters; import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
@ -195,28 +194,6 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<Genotype>, GenotypeL
return new AlignmentContext(context.getLocation(), newReads, newOffsets); return new AlignmentContext(context.getLocation(), newReads, newOffsets);
} }
/**
* Determine whether we're at a Hapmap site
*
* @param tracker the meta data tracker
*
* @return true if we're at a Hapmap site, false if otherwise
*/
private static boolean isHapmapSite(RefMetaDataTracker tracker) {
return tracker.getTrackData("hapmap", null) != null;
}
/**
* Determine whether we're at a dbSNP site
*
* @param tracker the meta data tracker
*
* @return true if we're at a dbSNP site, false if otherwise
*/
private static boolean isDbSNPSite(RefMetaDataTracker tracker) {
return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)) != null;
}
public Integer reduceInit() { return 0; } public Integer reduceInit() { return 0; }
public Integer reduce(Pair<List<Genotype>, GenotypeLocusData> value, Integer sum) { public Integer reduce(Pair<List<Genotype>, GenotypeLocusData> value, Integer sum) {

View File

@ -182,7 +182,7 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
return new VCFRecord(ref.getBase(), return new VCFRecord(ref.getBase(),
context.getContig(), context.getContig(),
(int) context.getPosition(), (int) context.getPosition(),
(dbsnp == null) ? "." : dbsnp.name, (dbsnp == null) ? "." : dbsnp.getRS_ID(),
alts, alts,
snpQual > 99 ? 99 : (int) snpQual, snpQual > 99 ? 99 : (int) snpQual,
"0", "0",

View File

@ -1,35 +0,0 @@
package org.broadinstitute.sting.utils.genotype;
/**
* @author ebanks
* Interface AlleleBalanceBacked
*
* this interface indicates that the genotype is
* backed up by allele balance information.
*/
public interface AlleleBalanceBacked {
/**
*
* @return returns the ratio of ref/(ref+alt) bases for hets
*/
public double getRefRatio();
/**
*
* @param ratio the ref/(ref+alt) ratio
*/
public void setRefRatio(double ratio);
/**
*
* @return returns the ratio of (ref+alt)/total bases for hets
*/
public double getOnOffRatio();
/**
*
* @param ratio the (ref+alt)/total ratio
*/
public void setOnOffRatio(double ratio);
}

View File

@ -0,0 +1,25 @@
package org.broadinstitute.sting.utils.genotype;
import java.util.Map;
/**
* @author ebanks
* Interface AlleleBalanceBacked
*
* this interface indicates that the genotype is
* backed up by allele balance information.
*/
public interface ArbitraryFieldsBacked {
/**
*
* @return returns the list of arbitrary fields to emit
*/
public Map<String, String> getFields();
/**
*
* @param fields a list of arbitrary fields to emit
*/
public void setFields(Map<String, String> fields);
}

View File

@ -0,0 +1,24 @@
package org.broadinstitute.sting.utils.genotype;
/**
* @author ebanks
* Interface IDBacked
*
* this interface indicates that the genotype is
* backed up by sample information.
*/
public interface IDBacked {
/**
*
* @return returns the ID for this genotype
*/
public String getID();
/**
*
* @param id the ID for this genotype
*/
public void setID(String id);
}

View File

@ -3,6 +3,9 @@ package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.utils.genotype.*;
import java.util.HashMap;
import java.util.Map;
/** /**
* @author ebanks * @author ebanks
* <p/> * <p/>
@ -10,7 +13,7 @@ import org.broadinstitute.sting.utils.genotype.*;
* <p/> * <p/>
* represents the meta data for a genotype object. * represents the meta data for a genotype object.
*/ */
public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, AlleleFrequencyBacked, AlleleBalanceBacked { public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, IDBacked, AlleleFrequencyBacked, ArbitraryFieldsBacked {
// the discovery lod score // the discovery lod score
private double mConfidence = 0.0; private double mConfidence = 0.0;
@ -27,9 +30,11 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked
// the ref base // the ref base
private char mRefBase; private char mRefBase;
// the various allele ratios // the id
private double mRefRatio = 0.0; private String mID;
private double mOnOffRatio = 0.0;
// the various info field values
private Map<String, String> mInfoFields;
/** /**
* create a basic genotype meta data pbject, given the following fields * create a basic genotype meta data pbject, given the following fields
@ -111,38 +116,24 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked
} }
/** /**
* @return returns true if the ref/(ref+alt) ratio for this genotype has been set * @return returns the dbsnp id for this genotype
*/ */
public boolean hasRefRatio() { public String getID() {
return mRefRatio > 0.0; return mID;
}
public void setID(String id) {
mID = id;
} }
/** /**
* @return returns the ref/(ref+alt) ratio for this genotype * @return returns te arbitrary info fields
*/ */
public double getRefRatio() { public Map<String, String> getFields() {
return mRefRatio; return mInfoFields;
} }
public void setRefRatio(double ratio) { public void setFields(Map<String, String> fields) {
mRefRatio = ratio; mInfoFields = new HashMap<String, String>(fields);
}
/**
* @return returns true if the (ref+alt)/total ratio for this genotype has been set
*/
public boolean hasOnOffRatio() {
return mOnOffRatio > 0.0;
}
/**
* @return returns the (ref+alt)/total ratio for this genotype
*/
public double getOnOffRatio() {
return mOnOffRatio;
}
public void setOnOffRatio(double ratio) {
mOnOffRatio = ratio;
} }
} }

View File

@ -148,17 +148,24 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header"); throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header");
} }
// info fields
Map<String, String> infoFields = getInfoFields((VCFGenotypeLocusData)locusdata, params); Map<String, String> infoFields = getInfoFields((VCFGenotypeLocusData)locusdata, params);
infoFields.put("DP", String.format("%d", totalReadDepth)); infoFields.put("DP", String.format("%d", totalReadDepth));
// q-score
double qual = (locusdata == null) ? 0 : ((VCFGenotypeLocusData)locusdata).getConfidence(); double qual = (locusdata == null) ? 0 : ((VCFGenotypeLocusData)locusdata).getConfidence();
// min Q-score is zero // min Q-score is zero
qual = Math.max(qual, 0); qual = Math.max(qual, 0);
// dbsnp id
String dbSnpID = null;
if ( locusdata != null )
dbSnpID = ((VCFGenotypeLocusData)locusdata).getID();
VCFRecord vcfRecord = new VCFRecord(params.getReferenceBase(), VCFRecord vcfRecord = new VCFRecord(params.getReferenceBase(),
params.getContig(), params.getContig(),
params.getPosition(), params.getPosition(),
".", (dbSnpID == null ? "." : dbSnpID),
params.getAlternateBases(), params.getAlternateBases(),
qual, qual,
"0", "0",
@ -182,10 +189,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
if ( locusdata != null ) { if ( locusdata != null ) {
infoFields.put("SB", String.format("%.2f", locusdata.getSLOD())); infoFields.put("SB", String.format("%.2f", locusdata.getSLOD()));
infoFields.put("AF", String.format("%.2f", locusdata.getAlleleFrequency())); infoFields.put("AF", String.format("%.2f", locusdata.getAlleleFrequency()));
if ( locusdata.hasRefRatio() ) Map<String, String> otherFields = locusdata.getFields();
infoFields.put("AB", String.format("%.2f", locusdata.getRefRatio())); if ( otherFields != null ) {
if ( locusdata.hasOnOffRatio() ) infoFields.putAll(otherFields);
infoFields.put("OO", String.format("%.2f", locusdata.getOnOffRatio())); }
} }
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size())); infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
return infoFields; return infoFields;