2 major changes:

1. Add dbsnp RS ID to VCF output from genotyper; to do this I needed to fix the dbsnp rod which did not correctly return this value.

2. Remove AlleleBalanceBacked and instead generalize the arbitrary info fields backing VCFs (and potentially others) in preparation for refactoring VariantFiltration next week.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2028 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-11-12 22:51:49 +00:00
parent 3742a05760
commit 61b5fb82ce
12 changed files with 142 additions and 108 deletions

View File

@ -27,7 +27,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
// Reference sequence chromosome or scaffold
// Start and stop positions in chrom
public String name; // Reference SNP identifier or Affy SNP name
public String RS_ID; // Reference SNP identifier or Affy SNP name
public String strand; // Which DNA strand contains the observed alleles
public String refBases; // the reference base according to NCBI, in the dbSNP file
@ -206,19 +206,22 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
// formatting
//
// ----------------------------------------------------------------------
public String getRS_ID() { return RS_ID; }
public String toString() {
return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
getLocation().getContig(), getLocation().getStart(), getLocation().getStop() + 1,
name, strand, refBases, observed, molType,
RS_ID, strand, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight);
}
public String toSimpleString() {
return String.format("%s:%s:%s", name, observed, strand);
return String.format("%s:%s:%s", RS_ID, observed, strand);
}
public String toMediumString() {
String s = String.format("%s:%s:%s", getLocation().toString(), name, Utils.join("",this.getAlleleList()));
String s = String.format("%s:%s:%s", getLocation().toString(), RS_ID, Utils.join("",this.getAlleleList()));
if (isSNP()) s += ":SNP";
if (isIndel()) s += ":Indel";
if (isHapmap()) s += ":Hapmap";
@ -229,7 +232,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
public String repl() {
return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
585, getLocation().getContig(), getLocation().getStart() - 1, getLocation().getStop(),
name, strand, refBases, refBases, observed, molType,
RS_ID, strand, refBases, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight);
}
@ -240,7 +243,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop - 1));
name = parts[4];
RS_ID = parts[4];
strand = parts[6];
refBases = parts[7];
if (strand == "-")

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
@ -52,6 +53,11 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
if ( locusdata instanceof ConfidenceBacked ) {
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
}
if ( locusdata instanceof IDBacked ) {
rodDbSNP dbsnp = getDbSNP(tracker);
if ( dbsnp != null )
((IDBacked)locusdata).setID(dbsnp.getRS_ID());
}
if ( locusdata instanceof SLODBacked ) {
// calculate strand score

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair;
@ -99,6 +100,25 @@ public abstract class GenotypeCalculationModel implements Cloneable {
AlignmentContext context,
DiploidGenotypePriors priors);
/**
* @param tracker rod data
*
* @return the dbsnp rod if there is one at this position
*/
public static rodDbSNP getDbSNP(RefMetaDataTracker tracker) {
return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
}
/**
* Determine whether we're at a Hapmap site
*
* @param tracker the meta data tracker
*
* @return true if we're at a Hapmap site, false if otherwise
*/
private static boolean isHapmapSite(RefMetaDataTracker tracker) {
return tracker.getTrackData("hapmap", null) != null;
}
/**
* Create the mapping from sample to alignment contexts.

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import java.util.*;
@ -49,7 +50,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
// run joint estimation for the full GL contexts
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
calculateAlleleFrequencyPosteriors(ref, context.getLocation());
return createCalls(ref, contexts, context.getLocation());
return createCalls(tracker, ref, contexts, context.getLocation());
}
private void initializeAlleleFrequencies(int numSamples) {
@ -275,7 +276,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
verboseWriter.println();
}
private Pair<List<Genotype>, GenotypeLocusData> createCalls(char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
private Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
// first, find the alt allele with maximum confidence
int indexOfMax = 0;
char baseOfMax = ref;
@ -333,7 +334,12 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
if ( locusdata instanceof AlleleFrequencyBacked ) {
((AlleleFrequencyBacked)locusdata).setAlleleFrequency(bestAFguess);
}
if ( locusdata instanceof AlleleBalanceBacked ) {
if ( locusdata instanceof IDBacked ) {
rodDbSNP dbsnp = getDbSNP(tracker);
if ( dbsnp != null )
((IDBacked)locusdata).setID(dbsnp.getRS_ID());
}
if ( locusdata instanceof ArbitraryFieldsBacked) {
ArrayList<Double> refBalances = new ArrayList<Double>();
ArrayList<Double> onOffBalances = new ArrayList<Double>();
ArrayList<Double> weights = new ArrayList<Double>();
@ -384,8 +390,10 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
normalizedOnOffRatio += weights.get(i) * onOffBalances.get(i);
}
((AlleleBalanceBacked)locusdata).setRefRatio(normalizedRefRatio);
((AlleleBalanceBacked)locusdata).setOnOffRatio(normalizedOnOffRatio);
HashMap<String, String> fields = new HashMap<String, String>();
fields.put("AB", String.format("%.2f", normalizedRefRatio));
fields.put("OO", String.format("%.2f", normalizedOnOffRatio));
((ArbitraryFieldsBacked)locusdata).setFields(fields);
}
}
if ( locusdata instanceof SLODBacked ) {

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
@ -90,7 +91,12 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
if ( locusdata instanceof ConfidenceBacked ) {
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
}
if ( locusdata instanceof AlleleBalanceBacked ) {
if ( locusdata instanceof IDBacked ) {
rodDbSNP dbsnp = getDbSNP(tracker);
if ( dbsnp != null )
((IDBacked)locusdata).setID(dbsnp.getRS_ID());
}
if ( locusdata instanceof ArbitraryFieldsBacked) {
DiploidGenotype bestGenotype = DiploidGenotype.values()[bestIndex];
// we care only about het calls
@ -107,8 +113,10 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
totalCount += counts[i];
// set the ratios
((AlleleBalanceBacked)locusdata).setRefRatio((double)refCount / (double)(refCount + altCount));
((AlleleBalanceBacked)locusdata).setOnOffRatio((double)(refCount + altCount) / (double)totalCount);
HashMap<String, String> fields = new HashMap<String, String>();
fields.put("AB", String.format("%.2f", (double)refCount / (double)(refCount + altCount)));
fields.put("OO", String.format("%.2f",(double)(refCount + altCount) / (double)totalCount));
((ArbitraryFieldsBacked)locusdata).setFields(fields);
}
}
}

View File

@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.utils.BaseUtils;
@ -195,28 +194,6 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<Genotype>, GenotypeL
return new AlignmentContext(context.getLocation(), newReads, newOffsets);
}
/**
* Determine whether we're at a Hapmap site
*
* @param tracker the meta data tracker
*
* @return true if we're at a Hapmap site, false if otherwise
*/
private static boolean isHapmapSite(RefMetaDataTracker tracker) {
return tracker.getTrackData("hapmap", null) != null;
}
/**
* Determine whether we're at a dbSNP site
*
* @param tracker the meta data tracker
*
* @return true if we're at a dbSNP site, false if otherwise
*/
private static boolean isDbSNPSite(RefMetaDataTracker tracker) {
return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)) != null;
}
public Integer reduceInit() { return 0; }
public Integer reduce(Pair<List<Genotype>, GenotypeLocusData> value, Integer sum) {

View File

@ -182,7 +182,7 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
return new VCFRecord(ref.getBase(),
context.getContig(),
(int) context.getPosition(),
(dbsnp == null) ? "." : dbsnp.name,
(dbsnp == null) ? "." : dbsnp.getRS_ID(),
alts,
snpQual > 99 ? 99 : (int) snpQual,
"0",

View File

@ -1,35 +0,0 @@
package org.broadinstitute.sting.utils.genotype;
/**
* @author ebanks
* Interface AlleleBalanceBacked
*
* this interface indicates that the genotype is
* backed up by allele balance information.
*/
public interface AlleleBalanceBacked {
/**
*
* @return returns the ratio of ref/(ref+alt) bases for hets
*/
public double getRefRatio();
/**
*
* @param ratio the ref/(ref+alt) ratio
*/
public void setRefRatio(double ratio);
/**
*
* @return returns the ratio of (ref+alt)/total bases for hets
*/
public double getOnOffRatio();
/**
*
* @param ratio the (ref+alt)/total ratio
*/
public void setOnOffRatio(double ratio);
}

View File

@ -0,0 +1,25 @@
package org.broadinstitute.sting.utils.genotype;
import java.util.Map;
/**
* @author ebanks
* Interface AlleleBalanceBacked
*
* this interface indicates that the genotype is
* backed up by allele balance information.
*/
public interface ArbitraryFieldsBacked {
/**
*
* @return returns the list of arbitrary fields to emit
*/
public Map<String, String> getFields();
/**
*
* @param fields a list of arbitrary fields to emit
*/
public void setFields(Map<String, String> fields);
}

View File

@ -0,0 +1,24 @@
package org.broadinstitute.sting.utils.genotype;
/**
* @author ebanks
* Interface IDBacked
*
* this interface indicates that the genotype is
* backed up by sample information.
*/
public interface IDBacked {
/**
*
* @return returns the ID for this genotype
*/
public String getID();
/**
*
* @param id the ID for this genotype
*/
public void setID(String id);
}

View File

@ -3,6 +3,9 @@ package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.HashMap;
import java.util.Map;
/**
* @author ebanks
* <p/>
@ -10,7 +13,7 @@ import org.broadinstitute.sting.utils.genotype.*;
* <p/>
* represents the meta data for a genotype object.
*/
public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, AlleleFrequencyBacked, AlleleBalanceBacked {
public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, IDBacked, AlleleFrequencyBacked, ArbitraryFieldsBacked {
// the discovery lod score
private double mConfidence = 0.0;
@ -27,9 +30,11 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked
// the ref base
private char mRefBase;
// the various allele ratios
private double mRefRatio = 0.0;
private double mOnOffRatio = 0.0;
// the id
private String mID;
// the various info field values
private Map<String, String> mInfoFields;
/**
* create a basic genotype meta data pbject, given the following fields
@ -111,38 +116,24 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked
}
/**
* @return returns true if the ref/(ref+alt) ratio for this genotype has been set
* @return returns the dbsnp id for this genotype
*/
public boolean hasRefRatio() {
return mRefRatio > 0.0;
public String getID() {
return mID;
}
public void setID(String id) {
mID = id;
}
/**
* @return returns the ref/(ref+alt) ratio for this genotype
* @return returns te arbitrary info fields
*/
public double getRefRatio() {
return mRefRatio;
public Map<String, String> getFields() {
return mInfoFields;
}
public void setRefRatio(double ratio) {
mRefRatio = ratio;
}
/**
* @return returns true if the (ref+alt)/total ratio for this genotype has been set
*/
public boolean hasOnOffRatio() {
return mOnOffRatio > 0.0;
}
/**
* @return returns the (ref+alt)/total ratio for this genotype
*/
public double getOnOffRatio() {
return mOnOffRatio;
}
public void setOnOffRatio(double ratio) {
mOnOffRatio = ratio;
public void setFields(Map<String, String> fields) {
mInfoFields = new HashMap<String, String>(fields);
}
}

View File

@ -148,17 +148,24 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header");
}
// info fields
Map<String, String> infoFields = getInfoFields((VCFGenotypeLocusData)locusdata, params);
infoFields.put("DP", String.format("%d", totalReadDepth));
// q-score
double qual = (locusdata == null) ? 0 : ((VCFGenotypeLocusData)locusdata).getConfidence();
// min Q-score is zero
qual = Math.max(qual, 0);
// dbsnp id
String dbSnpID = null;
if ( locusdata != null )
dbSnpID = ((VCFGenotypeLocusData)locusdata).getID();
VCFRecord vcfRecord = new VCFRecord(params.getReferenceBase(),
params.getContig(),
params.getPosition(),
".",
(dbSnpID == null ? "." : dbSnpID),
params.getAlternateBases(),
qual,
"0",
@ -182,10 +189,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
if ( locusdata != null ) {
infoFields.put("SB", String.format("%.2f", locusdata.getSLOD()));
infoFields.put("AF", String.format("%.2f", locusdata.getAlleleFrequency()));
if ( locusdata.hasRefRatio() )
infoFields.put("AB", String.format("%.2f", locusdata.getRefRatio()));
if ( locusdata.hasOnOffRatio() )
infoFields.put("OO", String.format("%.2f", locusdata.getOnOffRatio()));
Map<String, String> otherFields = locusdata.getFields();
if ( otherFields != null ) {
infoFields.putAll(otherFields);
}
}
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
return infoFields;