DbSNP moved over to tribble

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3288 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-05-03 06:02:35 +00:00
parent 447081583a
commit f497213933
19 changed files with 191 additions and 569 deletions

View File

@ -30,6 +30,7 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@ -47,7 +48,6 @@ import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager;
import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.commandline.ArgumentException;
@ -331,7 +331,7 @@ public class GenomeAnalysisEngine {
//
// please don't use these in the future, use the new syntax <- if we're not using these please remove them
//
if (argCollection.DBSNPFile != null) bindConvenienceRods(rodDbSNP.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile);
if (argCollection.DBSNPFile != null) bindConvenienceRods(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile);
if (argCollection.HAPMAPFile != null)
bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", argCollection.HAPMAPFile);
if (argCollection.HAPMAPChipFile != null)

View File

@ -76,7 +76,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
*/
public LocationAwareSeekableRODIterator seek( Shard shard ) {
if (iteratorPool == null) // use query
return getQuery(shard.getGenomeLocs());
return getQuery(shard.getGenomeLocs() == null || shard.getGenomeLocs().size() == 0 ? null : shard.getGenomeLocs());
DataStreamSegment dataStreamSegment = shard.getGenomeLocs().size() != 0 ? new MappedStreamSegment(shard.getGenomeLocs().get(0)) : new EntireStream();
LocationAwareSeekableRODIterator RODIterator = iteratorPool.iterator(dataStreamSegment);
return RODIterator;
@ -91,7 +91,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
*/
public LocationAwareSeekableRODIterator seek(GenomeLoc loc) {
if (iteratorPool == null) // use query
return getQuery(Arrays.asList(loc));
return getQuery(loc == null ? null : Arrays.asList(loc));
DataStreamSegment dataStreamSegment = loc != null ? new MappedStreamSegment(loc) : new EntireStream();
LocationAwareSeekableRODIterator RODIterator = iteratorPool.iterator(dataStreamSegment);
return RODIterator;
@ -103,6 +103,8 @@ public class ReferenceOrderedDataSource implements SimpleDataSource {
* @return a LocationAwareSeekableRODIterator over the selected region
*/
private LocationAwareSeekableRODIterator getQuery(List<GenomeLoc> loc) {
if (loc == null) // for the mono shard case
return new SeekableRODIterator(rod.getIterator());
return new StitchingLocationAwareSeekableRODIterator(loc,(QueryableTrack)rod);
}
@ -253,6 +255,7 @@ class StitchingLocationAwareSeekableRODIterator implements LocationAwareSeekable
if (locationList != null && locationList.size() > 0) {
GenomeLoc loc = locationList.getFirst();
locationList.removeFirst();
if (rod == null) throw new StingException("Unable to query(), target rod is null, next location = " + ((locationList != null) ? locationList.getFirst() : "null"));
try {
iterator = new SeekableRODIterator(rod.query(loc));
} catch (IOException e) {

View File

@ -2,12 +2,14 @@ package org.broadinstitute.sting.gatk.refdata;
import edu.mit.broad.picard.genotype.DiploidGenotype;
import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.gelitext.GeliTextFeature;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableGenotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
@ -43,7 +45,7 @@ public class VariantContextAdaptors {
private static Map<Class, VCAdaptor> adaptors = new HashMap<Class, VCAdaptor>();
static {
adaptors.put(rodDbSNP.class, new RodDBSnpAdaptor());
adaptors.put(DbSNPFeature.class, new DBSnpAdaptor());
adaptors.put(RodVCF.class, new RodVCFAdaptor());
adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
@ -94,24 +96,24 @@ public class VariantContextAdaptors {
//
// --------------------------------------------------------------------------------------------------------------
private static class RodDBSnpAdaptor extends VCAdaptor {
private static class DBSnpAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) {
return convert(name, input, null);
}
VariantContext convert(String name, Object input, ReferenceContext ref) {
rodDbSNP dbsnp = (rodDbSNP)input;
if ( ! Allele.acceptableAlleleBases(dbsnp.getReference()) )
DbSNPFeature dbsnp = (DbSNPFeature)input;
if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp)) )
return null;
Allele refAllele = new Allele(dbsnp.getReference(), true);
Allele refAllele = new Allele(DbSNPHelper.getReference(dbsnp), true);
if ( dbsnp.isSNP() || dbsnp.isIndel() || dbsnp.varType.contains("mixed") ) {
if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
// add the reference allele
List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
// add all of the alt alleles
for ( String alt : dbsnp.getAlternateAlleleList() ) {
for ( String alt : DbSNPHelper.getAlternateAlleleList(dbsnp) ) {
if ( ! Allele.acceptableAlleleBases(alt) ) {
//System.out.printf("Excluding dbsnp record %s%n", dbsnp);
return null;
@ -120,9 +122,9 @@ public class VariantContextAdaptors {
}
Map<String, String> attributes = new HashMap<String, String>();
attributes.put("ID", dbsnp.getRS_ID());
attributes.put("ID", dbsnp.getRsID());
Collection<Genotype> genotypes = null;
VariantContext vc = new VariantContext(name, dbsnp.getLocation(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes);
VariantContext vc = new VariantContext(name, GenomeLocParser.createGenomeLoc(dbsnp.getChr(),dbsnp.getStart(),dbsnp.getEnd()), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes);
vc.validate();
return vc;
} else

View File

@ -1,295 +0,0 @@
package org.broadinstitute.sting.gatk.refdata;
import net.sf.samtools.util.SequenceUtil;
import org.broadinstitute.sting.utils.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* Example format:
* 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1
* 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1
* <p/>
* User: mdepristo
* Date: Feb 27, 2009
* Time: 10:47:14 AM
* To change this template use File | Settings | File Templates.
*/
public class rodDbSNP extends BasicReferenceOrderedDatum {
public static final String STANDARD_DBSNP_TRACK_NAME = "dbsnp";
public GenomeLoc loc; // genome location of SNP
// Reference sequence chromosome or scaffold
// Start and stop positions in chrom
public String RS_ID; // Reference SNP identifier or Affy SNP name
public String strand; // Which DNA strand contains the observed alleles
public String refBases; // the reference base according to NCBI, in the dbSNP file
public String observed; // The sequences of the observed alleles from rs-fasta files
public String molType; // Sample type from exemplar ss
public String varType; // The class of variant (simple, insertion, deletion, range, etc.)
// Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion'
public String validationStatus; // The validation status of the SNP
// one of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap')
public double avHet; // The average heterozygosity from all observations
public double avHetSE; // The Standard Error for the average heterozygosity
public String func; // The functional category of the SNP (coding-synon, coding-nonsynon, intron, etc.)
// set('unknown','coding-synon','intron','cds-reference','near-gene-3','near-gene-5',
// 'nonsense','missense','frameshift','untranslated-3','untranslated-5','splice-3','splice-5')
public String locType; // How the variant affects the reference sequence
// enum('range','exact','between','rangeInsertion','rangeSubstitution','rangeDeletion')
public int weight; // The quality of the alignment
// cache the allele list so it doesn't need to get recomputed each time
private List<String> alleleList = null;
// ----------------------------------------------------------------------
//
// Constructors
//
// ----------------------------------------------------------------------
public rodDbSNP(final String name) {
super(name);
}
// ----------------------------------------------------------------------
//
// manipulating the SNP information
//
// ----------------------------------------------------------------------
public GenomeLoc getLocation() {
return loc;
}
/**
* get the reference base(s) at this position
*
* @return the reference base or bases, as a string
*/
public String getReference() {
return refBases;
}
/**
* get the -1 * (log 10 of the error value)
*
* @return the log based error estimate
*/
public double getNegLog10PError() {
return 4; // -log10(0.0001)
}
/**
* gets the alternate alleles. This method should return all the alleles present at the location,
* NOT including the reference base. This is returned as a string list with no guarantee ordering
* of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
* frequency).
*
* @return an alternate allele list
*/
public List<String> getAlternateAlleleList() {
List<String> ret = new ArrayList<String>();
for (String allele : getAlleleList())
if (!allele.equals(getReference())) ret.add(allele);
return ret;
}
/**
* gets the alleles. This method should return all the alleles present at the location,
* including the reference base. The first allele should always be the reference allele, followed
* by an unordered list of alternate alleles.
*
* @return an alternate allele list
*/
public List<String> getAlleleList() {
if ( alleleList == null ) {
// add ref first
if ( onFwdStrand() )
alleleList = Arrays.asList(observed.split("/"));
else
alleleList = Arrays.asList(SequenceUtil.reverseComplement(observed).split("/"));
if ( alleleList.size() > 0 && alleleList.contains(getReference()) && !alleleList.get(0).equals(this.getReference()) )
Collections.swap(alleleList, alleleList.indexOf(getReference()), 0);
}
return alleleList;
}
public boolean onFwdStrand() {
return strand.equals("+");
}
/**
* get the frequency of this variant
*
* @return VariantFrequency with the stored frequency
*/
public double getNonRefAlleleFrequency() {
return 0; // dbSNP doesn't know the allele frequency
}
//
// What kind of variant are we?
//
// ----------------------------------------------------------------------
public boolean isSNP() {
return varType.contains("single") && locType.contains("exact");
}
public boolean isInsertion() {
return varType.contains("insertion");
}
public boolean isDeletion() {
return varType.contains("deletion");
}
public boolean isIndel() {
return isInsertion() || isDeletion() || varType.contains("in-del");
}
/**
* gets the alternate base is the case of a SNP. Throws a StingException when we can't parse out a
* single base to return, the site isn't a snp, or the site isn't biallelic
*
* @return a char, representing the alternate base
*/
public char getAlternativeBaseForSNP() {
if (!isSNP()) throw new StingException("We're not a SNP; called in DbSNP rod at position " + this.loc);
if (!isBiallelic()) throw new StingException("We're not biallelic; at position " + this.loc);
List<String> ret = this.getAlternateAlleleList();
if (ret.size() == 1 && ret.get(0).length() == 1)
return ret.get(0).charAt(0);
throw new StingException("getAlternativeBaseForSNP failed for DbSNP rod " + this.loc);
}
/**
* gets the reference base is the case of a SNP. Throws a StingException if we're not a SNP.
*
* @return a char, representing the alternate base
*/
public char getReferenceForSNP() {
if (!isSNP()) throw new StingException("We're not a SNP; called in DbSNP rod at position " + this.loc);
if (refBases.length() != 1) throw new StingException("The reference base in DbSNP must be zero, at position " + this.loc + " was " + refBases);
return refBases.charAt(0); // we know it's length 1, this is ok
}
public boolean isReference() {
return false; // snp locations are never "reference", there's always a variant
}
public boolean isHapmap() {
return validationStatus.contains("by-hapmap");
}
public boolean is2Hit2Allele() {
return validationStatus.contains("by-2hit-2allele");
}
// ----------------------------------------------------------------------
//
// formatting
//
// ----------------------------------------------------------------------
public String getRS_ID() { return RS_ID; }
public String toString() {
return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
getLocation().getContig(), getLocation().getStart(), getLocation().getStop() + 1,
RS_ID, strand, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight);
}
public String toSimpleString() {
return String.format("%s:%s:%s", RS_ID, observed, strand);
}
public String toMediumString() {
String s = String.format("%s:%s:%s", getLocation().toString(), RS_ID, Utils.join("",this.getAlleleList()));
if (isSNP()) s += ":SNP";
if (isIndel()) s += ":Indel";
if (isHapmap()) s += ":Hapmap";
if (is2Hit2Allele()) s += ":2Hit";
return s;
}
public String repl() {
return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
585, getLocation().getContig(), getLocation().getStart() - 1, getLocation().getStop(),
RS_ID, strand, refBases, refBases, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight);
}
public boolean parseLine(final Object header, final String[] parts) {
try {
String contig = parts[1];
long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
loc = GenomeLocParser.parseGenomeLoc(contig, start, Math.max(start, stop - 1));
RS_ID = parts[4];
strand = parts[6];
refBases = parts[7];
//if (strand.equals("-")) // this is just wrong lol
// refBases = BaseUtils.simpleReverseComplement(refBases);
observed = parts[9];
molType = parts[10];
varType = parts[11];
validationStatus = parts[12];
avHet = Double.parseDouble(parts[13]);
avHetSE = Double.parseDouble(parts[14]);
func = parts[15];
locType = parts[16];
weight = Integer.parseInt(parts[17]);
//System.out.printf("Parsed %s%n", toString());
return true;
} catch (MalformedGenomeLocException ex) {
// Just rethrow malformed genome locs; the ROD system itself will deal with these.
throw ex;
} catch (ArrayIndexOutOfBoundsException ex) {
// Just rethrow malformed genome locs; the ROD system itself will deal with these.
throw new RuntimeException("Badly formed dbSNP line: " + ex);
} catch (RuntimeException e) {
System.out.printf(" Exception caught during parsing DBSNP line %s%n", Utils.join(" <=> ", parts));
throw e;
}
}
public double getHeterozygosity() {
return avHet;
}
public int getPloidy() throws IllegalStateException {
return 2; // our DbSNP assumes a diploid human
}
public boolean isBiallelic() {
return getAlternateAlleleList().size() == 1;
}
public static rodDbSNP getFirstRealSNP(List<Object> dbsnpList) {
if (dbsnpList == null)
return null;
rodDbSNP dbsnp = null;
for (Object d : dbsnpList) {
if (d instanceof rodDbSNP && ((rodDbSNP) d).isSNP()) {
dbsnp = (rodDbSNP) d;
break;
}
}
return dbsnp;
}
}

View File

@ -55,7 +55,6 @@ public class RODTrackBuilder implements RMDTrackBuilder {
static {
// All known ROD types
Types.put("GFF", RodGenotypeChipAsGFF.class);
Types.put("dbSNP", rodDbSNP.class);
Types.put("SAMPileup", rodSAMPileup.class);
Types.put("GELI", rodGELI.class);
Types.put("RefSeq", rodRefSeq.class);

View File

@ -67,8 +67,8 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
public Map<String, Class> getAvailableTrackNamesAndTypes() {
Map<String, Class> classes = new HashMap<String, Class>();
for (String c : this.pluginsByName.keySet())
// we're excluding dbSNP and VCF right now
if (!c.contains("SNP") && !c.contains("VCF")) classes.put(c,this.pluginsByName.get(c));
// we're excluding VCF right now
if (!c.contains("VCF")) classes.put(c,this.pluginsByName.get(c));
return classes;
}

View File

@ -0,0 +1,130 @@
package org.broadinstitute.sting.gatk.refdata.utils.helpers;
import net.sf.samtools.util.SequenceUtil;
import org.broad.tribble.annotation.Strand;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.utils.Utils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* this class contains static helper methods for DbSNP
*/
public class DbSNPHelper {
public static final String STANDARD_DBSNP_TRACK_NAME = "dbsnp";
private DbSNPHelper() {} // don't make a DbSNPHelper
public static DbSNPFeature getFirstRealSNP(List<Object> dbsnpList) {
if (dbsnpList == null)
return null;
DbSNPFeature dbsnp = null;
for (Object d : dbsnpList) {
if (d instanceof DbSNPFeature && DbSNPHelper.isSNP((DbSNPFeature)d)) {
dbsnp = (DbSNPFeature) d;
break;
}
}
return dbsnp;
}
/**
* get the -1 * (log 10 of the error value)
*
* @return the log based error estimate
*/
public static double getNegLog10PError(DbSNPFeature feature) {
return 4; // -log10(0.0001)
}
//
// What kind of variant are we?
//
// ----------------------------------------------------------------------
public static boolean isSNP(DbSNPFeature feature) {
return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact");
}
public static String toMediumString(DbSNPFeature feature) {
String s = String.format("%s:%d:%s:%s", feature.getChr(), feature.getStart(), feature.getRsID(), Utils.join("",feature.getObserved()));
if (isSNP(feature)) s += ":SNP";
if (isIndel(feature)) s += ":Indel";
if (isHapmap(feature)) s += ":Hapmap";
if (is2Hit2Allele(feature)) s += ":2Hit";
return s;
}
public static boolean isInsertion(DbSNPFeature feature) {
return feature.getVariantType().contains("insertion");
}
public static boolean isDeletion(DbSNPFeature feature) {
return feature.getVariantType().contains("deletion");
}
public static boolean isIndel(DbSNPFeature feature) {
return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || feature.getVariantType().contains("in-del");
}
public static boolean isHapmap(DbSNPFeature feature) {
return feature.getValidationStatus().contains("by-hapmap");
}
public static boolean is2Hit2Allele(DbSNPFeature feature) {
return feature.getValidationStatus().contains("by-2hit-2allele");
}
/**
* gets the alternate alleles. This method should return all the alleles present at the location,
* NOT including the reference base. This is returned as a string list with no guarantee ordering
* of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
* frequency).
*
* @return an alternate allele list
*/
public static List<String> getAlternateAlleleList(DbSNPFeature feature) {
List<String> ret = new ArrayList<String>();
for (String allele : getAlleleList(feature))
if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele);
return ret;
}
public static boolean onFwdStrand(DbSNPFeature feature) {
return feature.getStrand() == Strand.POSITIVE;
}
public static String getReference(DbSNPFeature feature) {
return feature.getNCBIRefBase();
}
public static String toSimpleString(DbSNPFeature feature) {
return String.format("%s:%s:%s", feature.getRsID(), feature.getObserved(), (feature.getStrand() == Strand.POSITIVE) ? "+" : "-");
}
/**
* gets the alleles. This method should return all the alleles present at the location,
* including the reference base. The first allele should always be the reference allele, followed
* by an unordered list of alternate alleles.
*
* @return an alternate allele list
*/
public static List<String> getAlleleList(DbSNPFeature feature) {
List<String> alleleList = new ArrayList<String>();
// add ref first
if ( onFwdStrand(feature) )
alleleList = Arrays.asList(feature.getObserved());
else
for (String str : feature.getObserved())
alleleList.add(SequenceUtil.reverseComplement(str));
if ( alleleList.size() > 0 && alleleList.contains(getReference(feature)) && !alleleList.get(0).equals(getReference(feature)) )
Collections.swap(alleleList, alleleList.indexOf(getReference(feature)), 0);
return alleleList;
}
}

View File

@ -25,12 +25,13 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.commandline.Argument;
@ -143,16 +144,16 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
private String getReferenceOrderedData( RefMetaDataTracker tracker ) {
ArrayList<String> rodStrings = new ArrayList<String>();
for ( GATKFeature datum : tracker.getAllRods() ) {
if ( datum != null && ! (datum.getUnderlyingObject() instanceof rodDbSNP)) {
rodStrings.add(((ReferenceOrderedDatum)datum.getUnderlyingObject()).toSimpleString()); // TODO: Aaron figure out what to do with this line, it's bad form
if ( datum != null && ! (datum.getUnderlyingObject() instanceof DbSNPFeature)) {
rodStrings.add(((ReferenceOrderedDatum)datum.getUnderlyingObject()).toSimpleString()); // TODO: Aaron: this line still survives, try to remove it
}
}
String rodString = Utils.join(", ", rodStrings);
rodDbSNP dbsnp = tracker.lookup(rodDbSNP.STANDARD_DBSNP_TRACK_NAME,rodDbSNP.class);
DbSNPFeature dbsnp = tracker.lookup(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, DbSNPFeature.class);
if ( dbsnp != null)
rodString += dbsnp.toMediumString();
rodString += DbSNPHelper.toMediumString(dbsnp);
if ( !rodString.equals("") )
rodString = "[ROD: " + rodString + "]";

View File

@ -25,11 +25,13 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.BaseUtils;
@ -58,14 +60,14 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) )
return 0;
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getReferenceMetaData(rodDbSNP.STANDARD_DBSNP_TRACK_NAME));
DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
Collection<VariantContext> contexts = tracker.getVariantContexts(ref, INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false);
for ( VariantContext vc : contexts ) {
VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), Arrays.asList(ALLOWED_FORMAT_FIELDS), false, false);
if ( dbsnp != null )
vcf.setID(dbsnp.getRS_ID());
vcf.setID(dbsnp.getRsID());
// set the appropriate sample name if necessary
if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype(INPUT_ROD_NAME) != null )
vcf.getGenotype(INPUT_ROD_NAME).setSampleName(sampleName);

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
@ -32,8 +33,8 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
@ -145,7 +146,7 @@ public class VariantAnnotatorEngine {
List<ReferenceOrderedDataSource> dataSources = engine.getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getName().equals(rodDbSNP.STANDARD_DBSNP_TRACK_NAME) ) {
if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
annotateDbsnp = true;
}
if ( rod.getName().equals("hapmap2") ) {
@ -181,11 +182,11 @@ public class VariantAnnotatorEngine {
// annotate dbsnp occurrence
if ( annotateDbsnp ) {
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getReferenceMetaData(rodDbSNP.STANDARD_DBSNP_TRACK_NAME));
DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
infoAnnotations.put(VCFRecord.DBSNP_KEY, dbsnp == null ? "0" : "1");
// annotate dbsnp id if available and not already there
if ( dbsnp != null && !vc.hasAttribute("ID") )
infoAnnotations.put("ID", dbsnp.getRS_ID());
infoAnnotations.put("ID", dbsnp.getRsID());
}
if ( annotateHapmap2 ) {

View File

@ -1,9 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
@ -96,7 +97,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
*
* @return the dbsnp rod if there is one at this position
*/
public static rodDbSNP getDbSNP(RefMetaDataTracker tracker) {
return rodDbSNP.getFirstRealSNP(tracker.getReferenceMetaData(rodDbSNP.STANDARD_DBSNP_TRACK_NAME));
public static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
}
}

View File

@ -1,10 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.contexts.*;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
@ -388,9 +388,9 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
if ( bestAFguess != 0 )
attributes.put(VCFRecord.ALLELE_FREQUENCY_KEY, new Double((double)bestAFguess / (double)(frequencyEstimationPoints-1)));
rodDbSNP dbsnp = getDbSNP(tracker);
DbSNPFeature dbsnp = getDbSNP(tracker);
if ( dbsnp != null )
attributes.put("ID", dbsnp.getRS_ID());
attributes.put("ID", dbsnp.getRsID());
if ( !UAC.NO_SLOD ) {
// the overall lod

View File

@ -33,8 +33,8 @@ import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.BaseUtils;
@ -121,7 +121,7 @@ public class UnifiedGenotyperEngine {
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
RMDTrack rod = source.getReferenceOrderedData();
if ( rod.getName().equals(rodDbSNP.STANDARD_DBSNP_TRACK_NAME) ) {
if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
this.annotateDbsnp = true;
}
if ( rod.getName().equals("hapmap2") ) {

View File

@ -44,7 +44,7 @@ import net.sf.samtools.util.SequenceUtil;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
@ -153,7 +153,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// Warn the user if a dbSNP file was specified since it isn't being used here
boolean foundDBSNP = false;
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
if( rod.getName().equalsIgnoreCase(rodDbSNP.STANDARD_DBSNP_TRACK_NAME) ) {
if( rod.getName().equalsIgnoreCase(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
foundDBSNP = true;
}
}

View File

@ -25,14 +25,17 @@
package org.broadinstitute.sting.gatk.walkers.sequenom;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -68,13 +71,17 @@ public class PickSequenomProbes extends RodWalker<String, String> {
if ( SNP_MASK != null ) {
logger.info("Loading SNP mask... ");
ReferenceOrderedData snp_mask;
if ( SNP_MASK.contains(rodDbSNP.STANDARD_DBSNP_TRACK_NAME)) {
snp_mask = new ReferenceOrderedData<rodDbSNP>("snp_mask",new java.io.File(SNP_MASK),rodDbSNP.class);
if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) {
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
Iterator<GATKFeature> iter = builder.createInstanceOfTrack(DbSNPFeature.class,"snp_mask",new java.io.File(SNP_MASK)).getIterator();
snpMaskIterator = new SeekableRODIterator(iter);
} else {
snp_mask = new ReferenceOrderedData<TabularROD>("snp_mask",
new java.io.File(SNP_MASK), TabularROD.class);
snpMaskIterator = new SeekableRODIterator(new GATKFeatureIterator(snp_mask.iterator()));
}
snpMaskIterator = new SeekableRODIterator(new GATKFeatureIterator(snp_mask.iterator()));
}
}

View File

@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.playground.utils.report.ReportMarshaller;
import org.broadinstitute.sting.playground.utils.report.VE2ReportFactory;
@ -117,7 +117,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
protected String[] SELECT_NAMES = {};
@Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
protected String[] KNOWN_NAMES = {rodDbSNP.STANDARD_DBSNP_TRACK_NAME};
protected String[] KNOWN_NAMES = {DbSNPHelper.STANDARD_DBSNP_TRACK_NAME};
@Argument(shortName="sample", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false)
protected String[] SAMPLES = {};
@ -270,7 +270,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> {
evalNames.add(d.getName());
} else if ( d.getName().startsWith("comp") ) {
compNames.add(d.getName());
} else if ( d.getName().startsWith(rodDbSNP.STANDARD_DBSNP_TRACK_NAME) || d.getName().startsWith("hapmap") ) {
} else if ( d.getName().startsWith(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) || d.getName().startsWith("hapmap") ) {
compNames.add(d.getName());
} else {
logger.info("Not evaluating ROD binding " + d.getName());

View File

@ -33,7 +33,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.commandline.Argument;
@ -904,7 +904,7 @@ public class MultiSampleCaller extends LocusWalker<MultiSampleCaller.MultiSample
MultiSampleCallResult MultiSampleCall(RefMetaDataTracker tracker, char ref, AlignmentContext context, List<String> sample_names)
{
String in_dbsnp;
if (tracker.getReferenceMetaData(rodDbSNP.STANDARD_DBSNP_TRACK_NAME).size() > 0) { in_dbsnp = "known"; } else { in_dbsnp = "novel"; }
if (tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME).size() > 0) { in_dbsnp = "known"; } else { in_dbsnp = "novel"; }
AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0);
glCache.clear(); // reset the cache

View File

@ -1,142 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
/**
* @author aaron
*
* Class ValidateDbSNPConversion
*
* a quick walker to validate the dbSNP conversion.
*/
public class ValidateDbSNPConversion extends RefWalker<Pair<Matrix.BASE, Matrix.BASE>, Matrix> {
@Override
public Pair<Matrix.BASE, Matrix.BASE> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (!tracker.hasROD(rodDbSNP.STANDARD_DBSNP_TRACK_NAME)) return null;
rodDbSNP rod = tracker.lookup(rodDbSNP.STANDARD_DBSNP_TRACK_NAME,rodDbSNP.class);
if (rod != null && rod.isSNP() && rod.isBiallelic()) {
return new Pair<Matrix.BASE, Matrix.BASE>(Matrix.BASE.toBase((byte) ref.getBase()), Matrix.BASE.toBase((byte) rod.getAlternativeBaseForSNP()));
}
return null;
}
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
@Override
public Matrix reduceInit() {
return new Matrix();
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
*
* @return accumulator with result of the map taken into account.
*/
@Override
public Matrix reduce(Pair<Matrix.BASE, Matrix.BASE> value, Matrix sum) {
if (value != null) sum.addValue(value.first,value.second);
return sum;
}
public void onTraversalDone(Matrix result) {
result.printStats(out);
}
}
class Matrix {
public enum BASE {
A((byte) 65), C((byte) 67), G((byte) 71), T((byte) 84), N((byte)78);
private byte val = 65;
BASE(byte val) {
this.val = val;
}
byte getVal() {
return val;
}
static BASE toBase(byte b) {
if (b > 96) b =- 32;
if (b == A.val) return A;
if (b == C.val) return C;
if (b == G.val) return G;
if (b == T.val) return T;
// we don't really care, let return N instead of: throw new UnsupportedOperationException("Unknown base " + b);
return N;
}
}
private int[][] mat = new int[BASE.values().length][BASE.values().length];
public void addValue(BASE ref, BASE alt) {
mat[ref.ordinal()][alt.ordinal()] += 1;
}
public void printStats(PrintStream stream) {
for (BASE a : BASE.values()) {
stream.print("ref " + a.toString() + ":");
for (BASE b : BASE.values()) {
stream.print(" " + mat[a.ordinal()][b.ordinal()]);
}
stream.println();
}
}
public void addMatrix(Matrix mt) {
for (BASE a : BASE.values()) {
for (BASE b : BASE.values()) {
mat[a.ordinal()][b.ordinal()] += mt.mat[a.ordinal()][b.ordinal()];
}
}
}
public Matrix() {
for (int x = 0; x < BASE.values().length; x++) {
for (int y = 0; y < BASE.values().length; y++) {
mat[x][y] = 0;
}
}
}
}

View File

@ -1,87 +0,0 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.*;
/**
* @author aaron
* <p/>
* Class rodDbSNPUnitTest
* <p/>
* A descriptions should go here. Blame aaron if it's missing.
*/
public class rodDbSNPUnitTest extends BaseTest {
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
public BufferedReader openFile(String filename) {
try {
return new BufferedReader(new FileReader(filename));
} catch (FileNotFoundException e) {
throw new StingException("Couldn't open file " + filename);
}
}
@Test
// count the number of SNP's between 10M and 11M on chr1 in dbSNP
public void testDBSNPInput() {
BufferedReader stream = openFile("/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod");
int snpCount = 0;
int indelCount = 0;
try {
String line = stream.readLine();
rodDbSNP rod = new rodDbSNP("db");
boolean stop = false;
while (line != null && !stop) {
rod.parseLine(null,line.split("\t"));
rodDbSNP var = (rodDbSNP)rod;
if (rod.isSNP()) {
// quick check, if we're not triallelic, make sure the ref is right
if (var.getReferenceForSNP() == var.refBases.charAt(0) || var.getAlternativeBaseForSNP() == var.refBases.charAt(0))
// also make sure the ref is a single character
if (var.refBases.length() == 1)
Assert.assertTrue(var.refBases.charAt(0)==var.getReferenceForSNP());
if (var.getLocation().getContig().equals("1") &&
var.getLocation().getStart() >= 10000000 &&
var.getLocation().getStart() <= 11000000) {
if (var.isSNP()) {
snpCount++;
}
}
}
if (rod.isIndel())
indelCount++;
stop = (var.getLocation().getContig().equals("1") && var.getLocation().getStart() > 11000000);
line = stream.readLine();
}
Assert.assertEquals(3615,snpCount);
Assert.assertEquals(9902,indelCount);
} catch (IOException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
}
}