2 major changes:
1. Add dbsnp RS ID to VCF output from genotyper; to do this I needed to fix the dbsnp rod which did not correctly return this value. 2. Remove AlleleBalanceBacked and instead generalize the arbitrary info fields backing VCFs (and potentially others) in preparation for refactoring VariantFiltration next week. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2028 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3742a05760
commit
61b5fb82ce
|
|
@ -27,7 +27,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
|
|||
// Reference sequence chromosome or scaffold
|
||||
// Start and stop positions in chrom
|
||||
|
||||
public String name; // Reference SNP identifier or Affy SNP name
|
||||
public String RS_ID; // Reference SNP identifier or Affy SNP name
|
||||
public String strand; // Which DNA strand contains the observed alleles
|
||||
|
||||
public String refBases; // the reference base according to NCBI, in the dbSNP file
|
||||
|
|
@ -206,19 +206,22 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
|
|||
// formatting
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
public String getRS_ID() { return RS_ID; }
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
|
||||
getLocation().getContig(), getLocation().getStart(), getLocation().getStop() + 1,
|
||||
name, strand, refBases, observed, molType,
|
||||
RS_ID, strand, refBases, observed, molType,
|
||||
varType, validationStatus, avHet, avHetSE, func, locType, weight);
|
||||
}
|
||||
|
||||
public String toSimpleString() {
|
||||
return String.format("%s:%s:%s", name, observed, strand);
|
||||
return String.format("%s:%s:%s", RS_ID, observed, strand);
|
||||
}
|
||||
|
||||
public String toMediumString() {
|
||||
String s = String.format("%s:%s:%s", getLocation().toString(), name, Utils.join("",this.getAlleleList()));
|
||||
String s = String.format("%s:%s:%s", getLocation().toString(), RS_ID, Utils.join("",this.getAlleleList()));
|
||||
if (isSNP()) s += ":SNP";
|
||||
if (isIndel()) s += ":Indel";
|
||||
if (isHapmap()) s += ":Hapmap";
|
||||
|
|
@ -229,7 +232,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
|
|||
public String repl() {
|
||||
return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
|
||||
585, getLocation().getContig(), getLocation().getStart() - 1, getLocation().getStop(),
|
||||
name, strand, refBases, refBases, observed, molType,
|
||||
RS_ID, strand, refBases, refBases, observed, molType,
|
||||
varType, validationStatus, avHet, avHetSE, func, locType, weight);
|
||||
}
|
||||
|
||||
|
|
@ -240,7 +243,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements VariationRod
|
|||
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
||||
loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop - 1));
|
||||
|
||||
name = parts[4];
|
||||
RS_ID = parts[4];
|
||||
strand = parts[6];
|
||||
refBases = parts[7];
|
||||
if (strand == "-")
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
|
||||
|
|
@ -52,6 +53,11 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
|
|||
if ( locusdata instanceof ConfidenceBacked ) {
|
||||
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
|
||||
}
|
||||
if ( locusdata instanceof IDBacked ) {
|
||||
rodDbSNP dbsnp = getDbSNP(tracker);
|
||||
if ( dbsnp != null )
|
||||
((IDBacked)locusdata).setID(dbsnp.getRS_ID());
|
||||
}
|
||||
if ( locusdata instanceof SLODBacked ) {
|
||||
|
||||
// calculate strand score
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
|
@ -99,6 +100,25 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
AlignmentContext context,
|
||||
DiploidGenotypePriors priors);
|
||||
|
||||
/**
|
||||
* @param tracker rod data
|
||||
*
|
||||
* @return the dbsnp rod if there is one at this position
|
||||
*/
|
||||
public static rodDbSNP getDbSNP(RefMetaDataTracker tracker) {
|
||||
return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether we're at a Hapmap site
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
*
|
||||
* @return true if we're at a Hapmap site, false if otherwise
|
||||
*/
|
||||
private static boolean isHapmapSite(RefMetaDataTracker tracker) {
|
||||
return tracker.getTrackData("hapmap", null) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the mapping from sample to alignment contexts.
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -49,7 +50,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
// run joint estimation for the full GL contexts
|
||||
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
|
||||
calculateAlleleFrequencyPosteriors(ref, context.getLocation());
|
||||
return createCalls(ref, contexts, context.getLocation());
|
||||
return createCalls(tracker, ref, contexts, context.getLocation());
|
||||
}
|
||||
|
||||
private void initializeAlleleFrequencies(int numSamples) {
|
||||
|
|
@ -275,7 +276,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
verboseWriter.println();
|
||||
}
|
||||
|
||||
private Pair<List<Genotype>, GenotypeLocusData> createCalls(char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
|
||||
private Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
|
||||
// first, find the alt allele with maximum confidence
|
||||
int indexOfMax = 0;
|
||||
char baseOfMax = ref;
|
||||
|
|
@ -333,7 +334,12 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
if ( locusdata instanceof AlleleFrequencyBacked ) {
|
||||
((AlleleFrequencyBacked)locusdata).setAlleleFrequency(bestAFguess);
|
||||
}
|
||||
if ( locusdata instanceof AlleleBalanceBacked ) {
|
||||
if ( locusdata instanceof IDBacked ) {
|
||||
rodDbSNP dbsnp = getDbSNP(tracker);
|
||||
if ( dbsnp != null )
|
||||
((IDBacked)locusdata).setID(dbsnp.getRS_ID());
|
||||
}
|
||||
if ( locusdata instanceof ArbitraryFieldsBacked) {
|
||||
ArrayList<Double> refBalances = new ArrayList<Double>();
|
||||
ArrayList<Double> onOffBalances = new ArrayList<Double>();
|
||||
ArrayList<Double> weights = new ArrayList<Double>();
|
||||
|
|
@ -384,8 +390,10 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
normalizedOnOffRatio += weights.get(i) * onOffBalances.get(i);
|
||||
}
|
||||
|
||||
((AlleleBalanceBacked)locusdata).setRefRatio(normalizedRefRatio);
|
||||
((AlleleBalanceBacked)locusdata).setOnOffRatio(normalizedOnOffRatio);
|
||||
HashMap<String, String> fields = new HashMap<String, String>();
|
||||
fields.put("AB", String.format("%.2f", normalizedRefRatio));
|
||||
fields.put("OO", String.format("%.2f", normalizedOnOffRatio));
|
||||
((ArbitraryFieldsBacked)locusdata).setFields(fields);
|
||||
}
|
||||
}
|
||||
if ( locusdata instanceof SLODBacked ) {
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
|
||||
|
|
@ -90,7 +91,12 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
|
|||
if ( locusdata instanceof ConfidenceBacked ) {
|
||||
((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
|
||||
}
|
||||
if ( locusdata instanceof AlleleBalanceBacked ) {
|
||||
if ( locusdata instanceof IDBacked ) {
|
||||
rodDbSNP dbsnp = getDbSNP(tracker);
|
||||
if ( dbsnp != null )
|
||||
((IDBacked)locusdata).setID(dbsnp.getRS_ID());
|
||||
}
|
||||
if ( locusdata instanceof ArbitraryFieldsBacked) {
|
||||
|
||||
DiploidGenotype bestGenotype = DiploidGenotype.values()[bestIndex];
|
||||
// we care only about het calls
|
||||
|
|
@ -107,8 +113,10 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
|
|||
totalCount += counts[i];
|
||||
|
||||
// set the ratios
|
||||
((AlleleBalanceBacked)locusdata).setRefRatio((double)refCount / (double)(refCount + altCount));
|
||||
((AlleleBalanceBacked)locusdata).setOnOffRatio((double)(refCount + altCount) / (double)totalCount);
|
||||
HashMap<String, String> fields = new HashMap<String, String>();
|
||||
fields.put("AB", String.format("%.2f", (double)refCount / (double)(refCount + altCount)));
|
||||
fields.put("OO", String.format("%.2f",(double)(refCount + altCount) / (double)totalCount));
|
||||
((ArbitraryFieldsBacked)locusdata).setFields(fields);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -195,28 +194,6 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<Genotype>, GenotypeL
|
|||
return new AlignmentContext(context.getLocation(), newReads, newOffsets);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether we're at a Hapmap site
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
*
|
||||
* @return true if we're at a Hapmap site, false if otherwise
|
||||
*/
|
||||
private static boolean isHapmapSite(RefMetaDataTracker tracker) {
|
||||
return tracker.getTrackData("hapmap", null) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether we're at a dbSNP site
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
*
|
||||
* @return true if we're at a dbSNP site, false if otherwise
|
||||
*/
|
||||
private static boolean isDbSNPSite(RefMetaDataTracker tracker) {
|
||||
return rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)) != null;
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(Pair<List<Genotype>, GenotypeLocusData> value, Integer sum) {
|
||||
|
|
|
|||
|
|
@ -182,7 +182,7 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
|
|||
return new VCFRecord(ref.getBase(),
|
||||
context.getContig(),
|
||||
(int) context.getPosition(),
|
||||
(dbsnp == null) ? "." : dbsnp.name,
|
||||
(dbsnp == null) ? "." : dbsnp.getRS_ID(),
|
||||
alts,
|
||||
snpQual > 99 ? 99 : (int) snpQual,
|
||||
"0",
|
||||
|
|
|
|||
|
|
@ -1,35 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.genotype;
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* Interface AlleleBalanceBacked
|
||||
*
|
||||
* this interface indicates that the genotype is
|
||||
* backed up by allele balance information.
|
||||
*/
|
||||
public interface AlleleBalanceBacked {
|
||||
|
||||
/**
|
||||
*
|
||||
* @return returns the ratio of ref/(ref+alt) bases for hets
|
||||
*/
|
||||
public double getRefRatio();
|
||||
|
||||
/**
|
||||
*
|
||||
* @param ratio the ref/(ref+alt) ratio
|
||||
*/
|
||||
public void setRefRatio(double ratio);
|
||||
|
||||
/**
|
||||
*
|
||||
* @return returns the ratio of (ref+alt)/total bases for hets
|
||||
*/
|
||||
public double getOnOffRatio();
|
||||
|
||||
/**
|
||||
*
|
||||
* @param ratio the (ref+alt)/total ratio
|
||||
*/
|
||||
public void setOnOffRatio(double ratio);
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
package org.broadinstitute.sting.utils.genotype;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* Interface AlleleBalanceBacked
|
||||
*
|
||||
* this interface indicates that the genotype is
|
||||
* backed up by allele balance information.
|
||||
*/
|
||||
public interface ArbitraryFieldsBacked {
|
||||
|
||||
/**
|
||||
*
|
||||
* @return returns the list of arbitrary fields to emit
|
||||
*/
|
||||
public Map<String, String> getFields();
|
||||
|
||||
/**
|
||||
*
|
||||
* @param fields a list of arbitrary fields to emit
|
||||
*/
|
||||
public void setFields(Map<String, String> fields);
|
||||
}
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
package org.broadinstitute.sting.utils.genotype;
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* Interface IDBacked
|
||||
*
|
||||
* this interface indicates that the genotype is
|
||||
* backed up by sample information.
|
||||
*/
|
||||
public interface IDBacked {
|
||||
|
||||
/**
|
||||
*
|
||||
* @return returns the ID for this genotype
|
||||
*/
|
||||
public String getID();
|
||||
|
||||
/**
|
||||
*
|
||||
* @param id the ID for this genotype
|
||||
*/
|
||||
public void setID(String id);
|
||||
|
||||
}
|
||||
|
|
@ -3,6 +3,9 @@ package org.broadinstitute.sting.utils.genotype.vcf;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* <p/>
|
||||
|
|
@ -10,7 +13,7 @@ import org.broadinstitute.sting.utils.genotype.*;
|
|||
* <p/>
|
||||
* represents the meta data for a genotype object.
|
||||
*/
|
||||
public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, AlleleFrequencyBacked, AlleleBalanceBacked {
|
||||
public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked, SLODBacked, IDBacked, AlleleFrequencyBacked, ArbitraryFieldsBacked {
|
||||
|
||||
// the discovery lod score
|
||||
private double mConfidence = 0.0;
|
||||
|
|
@ -27,9 +30,11 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked
|
|||
// the ref base
|
||||
private char mRefBase;
|
||||
|
||||
// the various allele ratios
|
||||
private double mRefRatio = 0.0;
|
||||
private double mOnOffRatio = 0.0;
|
||||
// the id
|
||||
private String mID;
|
||||
|
||||
// the various info field values
|
||||
private Map<String, String> mInfoFields;
|
||||
|
||||
/**
|
||||
* create a basic genotype meta data pbject, given the following fields
|
||||
|
|
@ -111,38 +116,24 @@ public class VCFGenotypeLocusData implements GenotypeLocusData, ConfidenceBacked
|
|||
}
|
||||
|
||||
/**
|
||||
* @return returns true if the ref/(ref+alt) ratio for this genotype has been set
|
||||
* @return returns the dbsnp id for this genotype
|
||||
*/
|
||||
public boolean hasRefRatio() {
|
||||
return mRefRatio > 0.0;
|
||||
public String getID() {
|
||||
return mID;
|
||||
}
|
||||
|
||||
public void setID(String id) {
|
||||
mID = id;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return returns the ref/(ref+alt) ratio for this genotype
|
||||
* @return returns te arbitrary info fields
|
||||
*/
|
||||
public double getRefRatio() {
|
||||
return mRefRatio;
|
||||
public Map<String, String> getFields() {
|
||||
return mInfoFields;
|
||||
}
|
||||
|
||||
public void setRefRatio(double ratio) {
|
||||
mRefRatio = ratio;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return returns true if the (ref+alt)/total ratio for this genotype has been set
|
||||
*/
|
||||
public boolean hasOnOffRatio() {
|
||||
return mOnOffRatio > 0.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return returns the (ref+alt)/total ratio for this genotype
|
||||
*/
|
||||
public double getOnOffRatio() {
|
||||
return mOnOffRatio;
|
||||
}
|
||||
|
||||
public void setOnOffRatio(double ratio) {
|
||||
mOnOffRatio = ratio;
|
||||
public void setFields(Map<String, String> fields) {
|
||||
mInfoFields = new HashMap<String, String>(fields);
|
||||
}
|
||||
}
|
||||
|
|
@ -148,17 +148,24 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header");
|
||||
}
|
||||
|
||||
// info fields
|
||||
Map<String, String> infoFields = getInfoFields((VCFGenotypeLocusData)locusdata, params);
|
||||
infoFields.put("DP", String.format("%d", totalReadDepth));
|
||||
|
||||
// q-score
|
||||
double qual = (locusdata == null) ? 0 : ((VCFGenotypeLocusData)locusdata).getConfidence();
|
||||
// min Q-score is zero
|
||||
qual = Math.max(qual, 0);
|
||||
|
||||
// dbsnp id
|
||||
String dbSnpID = null;
|
||||
if ( locusdata != null )
|
||||
dbSnpID = ((VCFGenotypeLocusData)locusdata).getID();
|
||||
|
||||
VCFRecord vcfRecord = new VCFRecord(params.getReferenceBase(),
|
||||
params.getContig(),
|
||||
params.getPosition(),
|
||||
".",
|
||||
(dbSnpID == null ? "." : dbSnpID),
|
||||
params.getAlternateBases(),
|
||||
qual,
|
||||
"0",
|
||||
|
|
@ -182,10 +189,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
if ( locusdata != null ) {
|
||||
infoFields.put("SB", String.format("%.2f", locusdata.getSLOD()));
|
||||
infoFields.put("AF", String.format("%.2f", locusdata.getAlleleFrequency()));
|
||||
if ( locusdata.hasRefRatio() )
|
||||
infoFields.put("AB", String.format("%.2f", locusdata.getRefRatio()));
|
||||
if ( locusdata.hasOnOffRatio() )
|
||||
infoFields.put("OO", String.format("%.2f", locusdata.getOnOffRatio()));
|
||||
Map<String, String> otherFields = locusdata.getFields();
|
||||
if ( otherFields != null ) {
|
||||
infoFields.putAll(otherFields);
|
||||
}
|
||||
}
|
||||
infoFields.put("NS", String.valueOf(params.getGenotypesRecords().size()));
|
||||
return infoFields;
|
||||
|
|
|
|||
Loading…
Reference in New Issue